# In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Urban survey workbooks. Each key is also embedded in the sheet name that is
# read from the workbook (key 98 -> sheet 'U98P1').
files_U = {
    98: '/content/drive/MyDrive/prj/U98.xlsx',
    99: '/content/drive/MyDrive/prj/U99.xlsx',
    1400: '/content/drive/MyDrive/prj/U1400.xlsx',
    1401: '/content/drive/MyDrive/prj/U1401.xlsx'
}

# Rural survey workbooks, keyed the same way (key 98 -> sheet 'R98P1').
files_R = {
    98: '/content/drive/MyDrive/prj/R98.xlsx',
    99: '/content/drive/MyDrive/prj/R99.xlsx',
    1400: '/content/drive/MyDrive/prj/R1400.xlsx',
    1401: '/content/drive/MyDrive/prj/R1401.xlsx'
}

# The only columns kept from each sheet; everything else is discarded.
columns_to_keep = ['age', 'degree', 'relation', 'occupationalst']

# year key -> DataFrame restricted to `columns_to_keep`.
urban_data_all_years = {}
rural_data_all_years = {}

for year, path in files_U.items():
    temp_urban_data = pd.read_excel(path, sheet_name=f'U{year}P1')
    # Store an explicit copy: the column subset is modified in place later in
    # this script, and assigning into a bare `df[cols]` selection raises
    # pandas' SettingWithCopyWarning.
    urban_data_all_years[year] = temp_urban_data[columns_to_keep].copy()

for year, path in files_R.items():
    temp_rural_data = pd.read_excel(path, sheet_name=f'R{year}P1')
    # Same reasoning as for the urban frames: keep an independent copy.
    rural_data_all_years[year] = temp_rural_data[columns_to_keep].copy()

def fill_missing_data(data_dict):
    """Fill missing values in every DataFrame of ``data_dict`` in place.

    For each frame, missing entries in 'degree', 'relation' and
    'occupationalst' become the string 'Unknown' and the whole column is
    converted to ``str``. Missing 'age' entries are replaced by the mean of
    that frame's 'age' column.

    Args:
        data_dict: Mapping whose values are DataFrames containing the four
            columns named above. The keys are not used.

    Returns:
        None. The frames are modified in place.
    """
    label_columns = ('degree', 'relation', 'occupationalst')
    for frame in data_dict.values():
        for name in label_columns:
            filled = frame[name].fillna('Unknown')
            frame[name] = filled.astype(str)
        # The mean is taken before filling, so it reflects only known ages.
        mean_age = frame['age'].mean()
        frame['age'] = frame['age'].fillna(mean_age)
# Clean the urban frames first, then the rural ones; fill_missing_data
# modifies the frames in place and returns nothing.
for yearly_frames in (urban_data_all_years, rural_data_all_years):
    fill_missing_data(yearly_frames)

def _resolve_bins(bins, default=20):
    """Return ``bins``, or ``default`` when ``bins`` is None, empty or zero."""
    if bins is None:
        return default
    try:
        is_empty = len(bins) == 0
    except TypeError:
        # Scalars such as an int bin count have no len(); 0 uses the default.
        is_empty = not bins
    return default if is_empty else bins


def _is_label_series(values):
    """Return True when ``values`` is a Series holding text/categorical labels."""
    if not isinstance(values, pd.Series):
        return False
    dtype = values.dtype
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def plot_separate_distribution(year, data, column, title, xlabel, data_type, bins=None):
    """Draw and show one distribution figure for ``data[column]``.

    Text or categorical columns, when ``bins`` is not given, are drawn as a
    bar chart with one bar per distinct label, so every bar sits on its own
    tick. Everything else is drawn as a histogram.

    Args:
        year: Value shown in the figure title.
        data: Object indexable by ``column`` (e.g. a DataFrame).
        column: Name of the column to plot.
        title: Leading part of the figure title.
        xlabel: Label for the x axis.
        data_type: Shown in the title; 'Urban' is drawn in blue, any other
            value in green.
        bins: Optional histogram bins (a count or a sequence of edges).
            None, 0 or an empty sequence means 20 bins. Passing a value
            always forces a histogram.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    values = data[column]
    color = 'blue' if data_type == 'Urban' else 'green'

    plt.figure(figsize=(10, 6))  # Larger figure for better readability
    if bins is None and _is_label_series(values):
        # Numeric binning over labels yields thin bars offset from their
        # ticks, so count each label instead. Sorting by label keeps the bar
        # order consistent from one figure to the next.
        counts = values.astype(str).value_counts(dropna=False).sort_index()
        labels = [str(label) for label in counts.index]
        plt.bar(labels, counts.to_numpy(), alpha=0.7, color=color, edgecolor='black')
    else:
        # _resolve_bins avoids truth-testing `bins`, which raises for arrays
        # of bin edges.
        plt.hist(values, bins=_resolve_bins(bins), alpha=0.7, color=color, edgecolor='black')
    plt.title(f'{title} ({data_type}, Year {year})', fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.tight_layout()
    plt.show()


# (column, figure title, x-axis label) for each distribution to draw.
distribution_specs = (
    ('age', 'Age Distribution', 'Age'),
    ('degree', 'Education Level Distribution', 'Education Level'),
    ('relation', 'Relationship to Head of Household', 'Relation'),
    ('occupationalst', 'Employment Status Distribution', 'Employment Status'),
)

for year in [98, 99, 1400, 1401]:
    urban_data = urban_data_all_years[year]
    rural_data = rural_data_all_years[year]
    # For each column, show the urban figure first and then the rural one.
    for column, title, xlabel in distribution_specs:
        for data_type, frame in (('Urban', urban_data), ('Rural', rural_data)):
            plot_separate_distribution(year, frame, column, title, xlabel, data_type)