In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
def load_data_Urbun(sheet):
    df_urban = {}
    years = ['98', '99', '1400', '1401']
    for year in years:
        file_name = f'U{year}.xlsx'
        sheet_name = f'U{year}{sheet}'
        df = pd.read_excel(file_name, sheet_name = sheet_name)
        df_urban[f'U{year}'] = df
    return df_urban

In [3]:
def load_data_Rural(sheet):
    df_Rural = {}
    years = ['98', '99', '1400', '1401']
    for year in years:
        file_name = f'R{year}.xlsx'
        sheet_name = f'R{year}{sheet}'
        df = pd.read_excel(file_name, sheet_name = sheet_name)
        df_Rural[f'R{year}'] = df
    return df_Rural

In [4]:
Rural_Data = load_data_Rural('Data')
Urbun_Data = load_data_Urbun('Data')

In [5]:
Rural_data_income_paid_jobs = load_data_Rural('P4S01')
Urbun_data_income_paid_jobs = load_data_Urbun('P4S01')

In [6]:
Rural_data_income_freelance_jobs = load_data_Rural('P4S02')
Urbun_data_income_freelance_jobs = load_data_Urbun('P4S02')

In [7]:
Rural_data_income_Miscellaneous = load_data_Rural('P4S03')
Urbun_data_income_Miscellaneous = load_data_Urbun('P4S03')

In [8]:
Rural_data_income_subsidy = load_data_Rural('P4S04')
Urbun_data_income_subsidy = load_data_Urbun('P4S04')

In [9]:
Selected_address = {}
for key, value in Rural_Data.items():
    Selected_address[key] = value[value['province'] == 'CharmahalBakhtiari']['Address'].unique()

for key, value in Urbun_Data.items():
    Selected_address[key] = value[value['province'] == 'CharmahalBakhtiari']['Address'].unique()

In [10]:
# Paid jobs
for key, value in Rural_data_income_paid_jobs.items():
    Rural_data_income_paid_jobs[key] = value[value['Address'].isin(Selected_address[key])]
for key, value in Urbun_data_income_paid_jobs.items():
    Urbun_data_income_paid_jobs[key] = value[value['Address'].isin(Selected_address[key])]

# Freelance jobs
for key, value in Rural_data_income_freelance_jobs.items():
    Rural_data_income_freelance_jobs[key] = value[value['Address'].isin(Selected_address[key])]
for key, value in Urbun_data_income_freelance_jobs.items():
    Urbun_data_income_freelance_jobs[key] = value[value['Address'].isin(Selected_address[key])]

# Miscellaneous
for key, value in Rural_data_income_Miscellaneous.items():
    Rural_data_income_Miscellaneous[key] = value[value['Address'].isin(Selected_address[key])]
for key, value in Urbun_data_income_Miscellaneous.items():
    Urbun_data_income_Miscellaneous[key] = value[value['Address'].isin(Selected_address[key])]

# Subsidy
for key, value in Rural_data_income_subsidy.items():
    Rural_data_income_subsidy[key] = value[value['Address'].isin(Selected_address[key])]
for key, value in Urbun_data_income_subsidy.items():
    Urbun_data_income_subsidy[key] = value[value['Address'].isin(Selected_address[key])]

In [56]:
Urbun_data_income_freelance_jobs['U1401']['income_s_y'] = Urbun_data_income_freelance_jobs['U1401']['income_s_y'].astype('float')

In [64]:
Rural_data_income_freelance_jobs['R1401'].loc[1571, 'income_s_y'] = '0'

In [65]:
Rural_data_income_freelance_jobs['R1401']['income_s_y'] = Rural_data_income_freelance_jobs['R1401']['income_s_y'].astype('float')

#### Group By

In [70]:
# Paid Jobs
paid_jobs = {}
for key, value in Rural_data_income_paid_jobs.items():
    paid_jobs[key] = value.groupby('Address')[['netincome_w_y']].agg('sum').reset_index()
for key, value in Urbun_data_income_paid_jobs.items():
    paid_jobs[key] = value.groupby('Address')[['netincome_w_y']].agg('sum').reset_index()
    
# Freelance Jobs
Freelance_Jobs = {}
for key, value in Rural_data_income_freelance_jobs.items() :
    Freelance_Jobs[key] = value.groupby('Address')[['income_s_y']].agg('sum').reset_index()
for key, value in Urbun_data_income_freelance_jobs.items() :
    Freelance_Jobs[key] = value.groupby('Address')[['income_s_y']].agg('sum').reset_index()
    
# Miscellaneous
Miscellaneous = {}
selected_columns = ['income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']
for key in Rural_data_income_Miscellaneous.keys():
    Miscellaneous[key] = Rural_data_income_Miscellaneous[key].groupby('Address')[selected_columns].agg('sum').reset_index()
for key in Urbun_data_income_Miscellaneous.keys():
    Miscellaneous[key] = Urbun_data_income_Miscellaneous[key].groupby('Address')[selected_columns].agg('sum').reset_index()
    
# Subsidy 
Subsidy = {}
for key, value in Rural_data_income_subsidy.items():
    Subsidy[key] = value.groupby('Address')[['subsidy']].agg('sum').reset_index()
for key, value in Urbun_data_income_subsidy.items():
    Subsidy[key] = value.groupby('Address')[['subsidy']].agg('sum').reset_index()

In [71]:
def process_income_data_Urbun(years):
    ans = {}
    for year in years :
        
        df = paid_jobs[f'U{year}'].merge(Freelance_Jobs[f'U{year}'], how = 'outer')

        df = df.merge(Miscellaneous[f'U{year}'], how = 'outer')

        df = df.merge(Subsidy[f'U{year}'], how='outer')
        
        ans[f'number{year}'] = df['Address'].nunique()

        df = df.drop(columns='Address')

        df.dropna(how='all', inplace=True)

        df.fillna(0, inplace=True)

        df = df.astype('float')

        df = df.sum(axis=1)
        
        ans[f'{year}'] = df
        
    return ans

In [73]:
def process_income_data_Rural(years):
    ans = {}
    for year in years :
        
        df = paid_jobs[f'R{year}'].merge(Freelance_Jobs[f'R{year}'], how = 'outer')

        df = df.merge(Miscellaneous[f'R{year}'], how = 'outer')

        df = df.merge(Subsidy[f'R{year}'], how='outer')
        
        ans[f'number{year}'] = df['Address'].nunique()

        df = df.drop(columns='Address')

        df.dropna(how='all', inplace=True)

        df.fillna(0, inplace=True)

        df = df.astype('float')

        df = df.sum(axis=1)
        
        ans[f'{year}'] = df
            
    return ans

In [75]:
years = ['98', '99', '1400', '1401']
Urban_data_income = process_income_data_Urbun(years)
Rural_data_income = process_income_data_Rural(years)

### فرضیات

- **H0**: درآمد خانوارهای شهری و روستایی در استان چهارمحال و بختیاری با هم برابر است
- **H1**: درآمد خانوارهای شهری و روستایی در استان چهارمحال و بختیاری با هم برابر نیست


#### 98

In [84]:
shapiro_R_98 = stats.shapiro(Rural_data_income['98'])
shapiro_U_98 = stats.shapiro(Urban_data_income['98'])
shapiro_R_98, shapiro_U_98

(ShapiroResult(statistic=0.8730337948969712, pvalue=3.10940011587708e-19),
 ShapiroResult(statistic=0.08303455283550876, pvalue=5.518560505472422e-48))

In [88]:
R_98_yeojohnson , _ = stats.yeojohnson(Rural_data_income['98'])
U_98_yeojohnson , _ = stats.yeojohnson(Urban_data_income['98'])
shapiro_R_98_yeojohnson = stats.shapiro(R_98_yeojohnson)
shapiro_U_98_yeojohnson = stats.shapiro(U_98_yeojohnson)
shapiro_R_98_yeojohnson, shapiro_U_98_yeojohnson

(ShapiroResult(statistic=0.888689671015572, pvalue=5.064260624511144e-18),
 ShapiroResult(statistic=0.11026887102335003, pvalue=1.6021873947220456e-47))

In [89]:
stats.mannwhitneyu(Rural_data_income['98'], Urban_data_income['98'])

MannwhitneyuResult(statistic=107075.5, pvalue=4.203094045980122e-18)

#### 99

In [90]:
shapiro_R_99 = stats.shapiro(Rural_data_income['99'])
shapiro_U_99 = stats.shapiro(Urban_data_income['99'])
shapiro_R_99, shapiro_U_99

(ShapiroResult(statistic=0.8297011430016313, pvalue=2.6830957068042534e-22),
 ShapiroResult(statistic=0.01733324147096371, pvalue=1.0227056962103966e-48))

In [92]:
R_99_yeojohnson , _ = stats.yeojohnson(Rural_data_income['99'])
U_99_yeojohnson , _ = stats.yeojohnson(Urban_data_income['99'])
shapiro_R_99_yeojohnson = stats.shapiro(R_99_yeojohnson)
shapiro_U_99_yeojohnson = stats.shapiro(U_99_yeojohnson)
shapiro_R_99_yeojohnson, shapiro_U_99_yeojohnson

(ShapiroResult(statistic=0.9886746647987821, pvalue=0.0008577578428189212),
 ShapiroResult(statistic=0.03463710859781999, pvalue=1.926253387271954e-48))

In [94]:
stats.mannwhitneyu(Rural_data_income['99'], Urban_data_income['99'])

MannwhitneyuResult(statistic=112599.0, pvalue=1.917304286008066e-14)

#### 1400

In [95]:
shapiro_R_1400 = stats.shapiro(Rural_data_income['1400'])
shapiro_U_1400 = stats.shapiro(Urban_data_income['1400'])
shapiro_R_1400, shapiro_U_1400

(ShapiroResult(statistic=0.02143228719661272, pvalue=3.990699701187746e-44),
 ShapiroResult(statistic=0.06639980169534376, pvalue=2.4072739780776472e-48))

In [97]:
R_1400_yeojohnson , _ = stats.yeojohnson(Rural_data_income['1400'])
U_1400_yeojohnson , _ = stats.yeojohnson(Urban_data_income['1400'])
shapiro_R_1400_yeojohnson = stats.shapiro(R_1400_yeojohnson)
shapiro_U_1400_yeojohnson = stats.shapiro(U_1400_yeojohnson)
shapiro_R_1400_yeojohnson, shapiro_U_1400_yeojohnson

(ShapiroResult(statistic=0.9058046090529781, pvalue=7.202925244967938e-17),
 ShapiroResult(statistic=0.8665640203381125, pvalue=4.1059381344034984e-23))

In [99]:
stats.mannwhitneyu(Rural_data_income['1400'], Urban_data_income['1400'])

MannwhitneyuResult(statistic=121282.5, pvalue=2.863802198359207e-12)

#### 1401

In [100]:
shapiro_R_1401 = stats.shapiro(Rural_data_income['1401'])
shapiro_U_1401 = stats.shapiro(Urban_data_income['1401'])
shapiro_R_1401, shapiro_U_1401

(ShapiroResult(statistic=0.8624497765713363, pvalue=2.855842062959131e-20),
 ShapiroResult(statistic=0.8955742359464116, pvalue=1.0267914922423725e-20))

In [102]:
R_1401_yeojohnson , _ = stats.yeojohnson(Rural_data_income['1401'])
U_1401_yeojohnson , _ = stats.yeojohnson(Urban_data_income['1401'])
shapiro_R_1401_yeojohnson = stats.shapiro(R_1401_yeojohnson)
shapiro_U_1401_yeojohnson = stats.shapiro(U_1401_yeojohnson)
shapiro_R_1401_yeojohnson, shapiro_U_1401_yeojohnson

(ShapiroResult(statistic=0.9897078174328108, pvalue=0.0017231732480825481),
 ShapiroResult(statistic=0.9925084289817818, pvalue=0.0021912123017287194))

In [103]:
stats.mannwhitneyu(Rural_data_income['1401'], Urban_data_income['1401'])

MannwhitneyuResult(statistic=122594.0, pvalue=1.3230455008288734e-11)

#### Total

In [130]:
Max_rural = max(Rural_data_income['number98'], Rural_data_income['number99'], Rural_data_income['number1400'], Rural_data_income['number1401'])
Max_urban = max(Urban_data_income['number98'], Urban_data_income['number99'], Urban_data_income['number1400'], Urban_data_income['number1401'])

In [156]:
Total_number_Rural = Rural_data_income['number98'] + Rural_data_income['number99'] + Rural_data_income['number1400'] + Rural_data_income['number1401']
Total_Rural = ((Rural_data_income['98'] * Rural_data_income['number98']).reindex(range(Max_rural), fill_value=0) + \
    (Rural_data_income['99'] * Rural_data_income['number99']).reindex(range(Max_rural), fill_value=0) + \
    (Rural_data_income['1400'] * Rural_data_income['number1400']).reindex(range(Max_rural), fill_value=0) + \
    (Rural_data_income['1401'] * Rural_data_income['number1401']).reindex(range(Max_rural), fill_value=0)) / Total_number_Rural

In [155]:
Total_number_Urban = Urban_data_income['number98'] + Urban_data_income['number99'] + Urban_data_income['number1400'] + Urban_data_income['number1401']
Total_Urban = ((Urban_data_income['98'] * Urban_data_income['number98']).reindex(range(Max_urban), fill_value=0) + \
    (Urban_data_income['99'] * Urban_data_income['number99']).reindex(range(Max_urban), fill_value=0) + \
    (Urban_data_income['1400'] * Urban_data_income['number1400']).reindex(range(Max_urban), fill_value=0) + \
    (Urban_data_income['1401'] * Urban_data_income['number1401']).reindex(range(Max_urban), fill_value=0)) / Total_number_Rural

In [158]:
shapiro_Total_Rural = stats.shapiro(Total_Rural)
shapiro_Total_Urban = stats.shapiro(Total_Urban)
shapiro_Total_Rural, shapiro_Total_Urban

(ShapiroResult(statistic=0.021432289109404534, pvalue=3.9906999626435666e-44),
 ShapiroResult(statistic=0.016875153804942977, pvalue=2.5828169570580004e-49))

In [161]:
R_Total_yeojohnson , _ = stats.yeojohnson(Total_Rural)
U_Total_yeojohnson , _ = stats.yeojohnson(Total_Urban)
shapiro_R_Total_yeojohnson = stats.shapiro(R_Total_yeojohnson)
shapiro_U_Total_yeojohnson = stats.shapiro(U_Total_yeojohnson)
shapiro_R_Total_yeojohnson, shapiro_U_Total_yeojohnson

(ShapiroResult(statistic=0.9363347933081786, pvalue=1.2358352039299714e-13),
 ShapiroResult(statistic=0.8022620221147078, pvalue=1.4949787500890964e-27))

In [163]:
stats.mannwhitneyu(Total_Rural, Total_Urban)

MannwhitneyuResult(statistic=50155.0, pvalue=5.822205629175404e-89)

با توجه به نتایج میشه با درصد اطمینان بالایی گفت که درامد این دو گروه با هم برابر چه در به تفکیک سال و چه در طول 4 سال برابر نیست