In [1]:
import pandas as pd
from scipy.stats import mannwhitneyu
import scipy.stats as stats

In [2]:
ALPHA = 0.05

cols_to_sum = ['netincome_w_y', 'perk_w_y', 'wage_w_y', 'income_s_y', 'income_pension', 'income_rent',
               'income_interest', 'income_aid', 'income_resale', 'income_transfer', 'subsidy']

def anderson(result):
    print(f"Statistic: {result.statistic}")
    print(f"Critical Values: {result.critical_values}")
    print(f"Significance Levels: {result.significance_level}")

    for i in range(len(result.critical_values)):
        if result.statistic > result.critical_values[i]:
            print(f"At the {result.significance_level[i]}% significance level, data is NOT normal.")
        else:
            print(f"At the {result.significance_level[i]}% significance level, data is normal.")

# 1398

In [3]:
U98P1 = pd.read_excel('../datasets/U98.xlsx', sheet_name='U98P1')
U98P4S01 = pd.read_excel('../datasets/U98.xlsx', sheet_name='U98P4S01')
U98P4S02 = pd.read_excel('../datasets/U98.xlsx', sheet_name='U98P4S02')
U98P4S03 = pd.read_excel('../datasets/U98.xlsx', sheet_name='U98P4S03')
U98P4S04 = pd.read_excel('../datasets/U98.xlsx', sheet_name='U98P4S04')

R98P1 = pd.read_excel('../datasets/R98.xlsx', sheet_name='R98P1')
R98P4S01 = pd.read_excel('../datasets/R98.xlsx', sheet_name='R98P4S01')
R98P4S02 = pd.read_excel('../datasets/R98.xlsx', sheet_name='R98P4S02')
R98P4S03 = pd.read_excel('../datasets/R98.xlsx', sheet_name='R98P4S03')
R98P4S04 = pd.read_excel('../datasets/R98.xlsx', sheet_name='R98P4S04')

In [4]:
Y98P1 = pd.concat([U98P1, R98P1])
Y98P4S01 = pd.concat([U98P4S01, R98P4S01])
Y98P4S02 = pd.concat([U98P4S02, R98P4S02])
Y98P4S03 = pd.concat([U98P4S03, R98P4S03])
Y98P4S04 = pd.concat([U98P4S04, R98P4S04])

Y98P1['degree'] = Y98P1['degree'].str.lower()

Y98P1['degree'].value_counts()

Y98_graduates = Y98P1[Y98P1['degree'].isin(['phd', 'master', 'bachelor', 'college'])]
Y98_non_graduates = Y98P1[Y98P1['degree'].isin(['elementary', 'diploma', 'secondary', 'highschool'])]

print(f"Number of graduates in urban: {Y98_graduates.shape[0]}")
print(f"Number of non-graduates in urban: {Y98_non_graduates.shape[0]}")

Number of graduates in urban: 15592
Number of non-graduates in urban: 44997


In [5]:
Y98P4S01 = Y98P4S01[['Address', 'netincome_w_y', 'perk_w_y', 'wage_w_y']]
Y98P4S02 = Y98P4S02[['Address', 'income_s_y']]
Y98P4S03 = Y98P4S03[
    ['Address', 'income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']]
Y98P4S04 = Y98P4S04[['Address', 'subsidy']]

Y98_incomes = pd.merge(left=Y98P4S01, right=Y98P4S02, on='Address', how='outer')
Y98_incomes = pd.merge(left=Y98_incomes, right=Y98P4S03, on='Address', how='outer')
Y98_incomes = pd.merge(left=Y98_incomes, right=Y98P4S04, on='Address', how='outer')

In [6]:
Y98_incomes[cols_to_sum] = Y98_incomes[cols_to_sum].apply(pd.to_numeric, errors='coerce')
Y98_incomes = Y98_incomes.dropna(thresh=Y98_incomes.shape[1] - 10)
Y98_incomes = Y98_incomes.fillna(0)

Y98_graduates_income = pd.merge(left=Y98_graduates, right=Y98_incomes, on='Address', how='inner')
Y98_non_graduates_income = pd.merge(left=Y98_non_graduates, right=Y98_incomes, on='Address', how='inner')

Y98_graduates_income['total_income'] = Y98_graduates_income[cols_to_sum].sum(axis=1)
Y98_non_graduates_income['total_income'] = Y98_non_graduates_income[cols_to_sum].sum(axis=1)

In [7]:
Y98_graduates_income_mean = Y98_graduates_income['total_income'].mean()

In [8]:
Y98_non_graduates_income_mean = Y98_non_graduates_income['total_income'].mean()

فرضیات
- **فرضیه صفر (H0):** تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود ندارد.
- **فرضیه مقابل (H1):** تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود دارد.


In [9]:
Y98_graduates_income.drop(['degree'], axis=1, inplace=True)
Y98_non_graduates_income.drop(['degree'], axis=1, inplace=True)

Y98_graduates_income_anderson = stats.anderson(Y98_graduates_income['total_income'].to_numpy())

anderson(Y98_graduates_income_anderson)

Y98_non_graduates_income_anderson = stats.anderson(Y98_non_graduates_income['total_income'].to_numpy())

anderson(Y98_non_graduates_income_anderson)

u_statistic, p_value = mannwhitneyu(Y98_graduates_income['total_income'], Y98_graduates_income['total_income'],
                                    alternative='two-sided')

print(f"U-statistic: {u_statistic} {p_value}")
print(
    f"P-value: {p_value} so there's {'no' if p_value > ALPHA else 'an'} evidence about that graduated people earn more")

Statistic: 694.0000047112844
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 2848.8719718521606
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
U-statistic: 526079484.5 1.0
P-value: 1.0 so there's no evidence about that graduated people earn more


In [10]:
print(f"Difference between average: {Y98_graduates_income_mean - Y98_non_graduates_income_mean:,.2f}")

Difference between average: 159,078,475.13


In [11]:
Y98_graduates_income_yeojohnson = stats.yeojohnson(Y98_graduates_income['total_income'])
Y98_non_graduates_income_yeojohnson = stats.yeojohnson(Y98_graduates_income['total_income'])

Y98_non_graduates_income_anderson = stats.anderson(Y98_graduates_income_yeojohnson[0])
Y98_non_graduates_income_yeojohnson = stats.anderson(Y98_non_graduates_income_yeojohnson[0])

anderson(Y98_non_graduates_income_anderson)
anderson(Y98_non_graduates_income_yeojohnson)

Statistic: 611.0781128209637
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 611.0781128209637
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.


تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود ندارد

# 1399

In [12]:
U99P1 = pd.read_excel('../datasets/U99.xlsx', sheet_name='U99P1')
U99P4S01 = pd.read_excel('../datasets/U99.xlsx', sheet_name='U99P4S01')
U99P4S02 = pd.read_excel('../datasets/U99.xlsx', sheet_name='U99P4S02')
U99P4S03 = pd.read_excel('../datasets/U99.xlsx', sheet_name='U99P4S03')
U99P4S04 = pd.read_excel('../datasets/U99.xlsx', sheet_name='U99P4S04')

R99P1 = pd.read_excel('../datasets/R99.xlsx', sheet_name='R99P1')
R99P4S01 = pd.read_excel('../datasets/R99.xlsx', sheet_name='R99P4S01')
R99P4S02 = pd.read_excel('../datasets/R99.xlsx', sheet_name='R99P4S02')
R99P4S03 = pd.read_excel('../datasets/R99.xlsx', sheet_name='R99P4S03')
R99P4S04 = pd.read_excel('../datasets/R99.xlsx', sheet_name='R99P4S04')

In [13]:
Y99P1 = pd.concat([U99P1, R99P1])
Y99P4S01 = pd.concat([U99P4S01, R99P4S01])
Y99P4S02 = pd.concat([U99P4S02, R99P4S02])
Y99P4S03 = pd.concat([U99P4S03, R99P4S03])
Y99P4S04 = pd.concat([U99P4S04, R99P4S04])

Y99P1['degree'] = Y99P1['degree'].str.lower()

Y99P1['degree'].value_counts()

Y99_graduates = Y99P1[Y99P1['degree'].isin(['phd', 'master', 'bachelor', 'college'])]
Y99_non_graduates = Y99P1[Y99P1['degree'].isin(['elementary', 'diploma', 'secondary', 'highschool'])]

print(f"Number of graduates in urban: {Y99_graduates.shape[0]}")
print(f"Number of non-graduates in urban: {Y99_non_graduates.shape[0]}")

Number of graduates in urban: 15654
Number of non-graduates in urban: 44961


In [14]:
Y99P4S01 = Y99P4S01[['Address', 'netincome_w_y', 'perk_w_y', 'wage_w_y']]
Y99P4S02 = Y99P4S02[['Address', 'income_s_y']]
Y99P4S03 = Y99P4S03[
    ['Address', 'income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']]
Y99P4S04 = Y99P4S04[['Address', 'subsidy']]

Y99_incomes = pd.merge(left=Y99P4S01, right=Y99P4S02, on='Address', how='outer')
Y99_incomes = pd.merge(left=Y99_incomes, right=Y99P4S03, on='Address', how='outer')
Y99_incomes = pd.merge(left=Y99_incomes, right=Y99P4S04, on='Address', how='outer')

In [15]:
Y99_incomes[cols_to_sum] = Y99_incomes[cols_to_sum].apply(pd.to_numeric, errors='coerce')
Y99_incomes = Y99_incomes.dropna(thresh=Y99_incomes.shape[1] - 10)
Y99_incomes = Y99_incomes.fillna(0)

Y99_graduates_income = pd.merge(left=Y99_graduates, right=Y99_incomes, on='Address', how='inner')
Y99_non_graduates_income = pd.merge(left=Y99_non_graduates, right=Y99_incomes, on='Address', how='inner')

Y99_graduates_income['total_income'] = Y99_graduates_income[cols_to_sum].sum(axis=1)
Y99_non_graduates_income['total_income'] = Y99_non_graduates_income[cols_to_sum].sum(axis=1)

In [16]:
Y99_graduates_income_mean = Y99_graduates_income['total_income'].mean()

In [17]:
Y99_non_graduates_income_mean = Y99_non_graduates_income['total_income'].mean()

In [18]:
print(f"Difference between average: {Y99_graduates_income_mean - Y99_non_graduates_income_mean:,.2f}")

Difference between average: 207,787,299.31


In [19]:
Y99_graduates_income.drop(['degree'], axis=1, inplace=True)
Y99_non_graduates_income.drop(['degree'], axis=1, inplace=True)

Y99_graduates_income_anderson = stats.anderson(Y99_graduates_income['total_income'].to_numpy())

anderson(Y99_graduates_income_anderson)

Y99_non_graduates_income_anderson = stats.anderson(Y99_non_graduates_income['total_income'].to_numpy())

anderson(Y99_non_graduates_income_anderson)

u_statistic, p_value = mannwhitneyu(Y99_graduates_income['total_income'], Y99_non_graduates_income['total_income'],
                                    alternative='two-sided')

print(f"U-statistic: {u_statistic} {p_value}")
print(
    f"P-value: {p_value} so there's {'no' if p_value > ALPHA else 'an'} evidence about that graduated people earn more")

Statistic: 1356.5185398020185
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 4735.396951654111
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
U-statistic: 5532220091.0 0.0
P-value: 0.0 so there's an evidence about that graduated people earn more


In [20]:
Y99_graduates_income_yeojohnson = stats.yeojohnson(Y99_graduates_income['total_income'])
Y99_non_graduates_income_yeojohnson = stats.yeojohnson(Y99_graduates_income['total_income'])

Y99_non_graduates_income_anderson = stats.anderson(Y99_graduates_income_yeojohnson[0])
Y99_non_graduates_income_yeojohnson = stats.anderson(Y99_non_graduates_income_yeojohnson[0])

anderson(Y99_non_graduates_income_anderson)
anderson(Y99_non_graduates_income_yeojohnson)

Statistic: 1063.9297403523815
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 1063.9297403523815
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.


تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود دارد

# 1400

In [21]:
U1400P1 = pd.read_excel('../datasets/U1400.xlsx', sheet_name='U1400P1')
U1400P4S01 = pd.read_excel('../datasets/U1400.xlsx', sheet_name='U1400P4S01')
U1400P4S02 = pd.read_excel('../datasets/U1400.xlsx', sheet_name='U1400P4S02')
U1400P4S03 = pd.read_excel('../datasets/U1400.xlsx', sheet_name='U1400P4S03')
U1400P4S04 = pd.read_excel('../datasets/U1400.xlsx', sheet_name='U1400P4S04')

R1400P1 = pd.read_excel('../datasets/R1400.xlsx', sheet_name='R1400P1')
R1400P4S01 = pd.read_excel('../datasets/R1400.xlsx', sheet_name='R1400P4S01')
R1400P4S02 = pd.read_excel('../datasets/R1400.xlsx', sheet_name='R1400P4S02')
R1400P4S03 = pd.read_excel('../datasets/R1400.xlsx', sheet_name='R1400P4S03')
R1400P4S04 = pd.read_excel('../datasets/R1400.xlsx', sheet_name='R1400P4S04')

In [22]:
Y1400P1 = pd.concat([U1400P1, R1400P1])
Y1400P4S01 = pd.concat([U1400P4S01, R1400P4S01])
Y1400P4S02 = pd.concat([U1400P4S02, R1400P4S02])
Y1400P4S03 = pd.concat([U1400P4S03, R1400P4S03])
Y1400P4S04 = pd.concat([U1400P4S04, R1400P4S04])

Y1400P1['degree'] = Y1400P1['degree'].str.lower()

Y1400P1['degree'].value_counts()

Y1400_graduates = Y1400P1[Y1400P1['degree'].isin(['phd', 'master', 'bachelor', 'college'])]
Y1400_non_graduates = Y1400P1[Y1400P1['degree'].isin(['elementary', 'diploma', 'secondary', 'highschool'])]

print(f"Number of graduates in urban: {Y1400_graduates.shape[0]}")
print(f"Number of non-graduates in urban: {Y1400_non_graduates.shape[0]}")

Number of graduates in urban: 16059
Number of non-graduates in urban: 45754


In [23]:
Y1400P4S01 = Y1400P4S01[['Address', 'netincome_w_y', 'perk_w_y', 'wage_w_y']]
Y1400P4S02 = Y1400P4S02[['Address', 'income_s_y']]
Y1400P4S03 = Y1400P4S03[
    ['Address', 'income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']]
Y1400P4S04 = Y1400P4S04[['Address', 'subsidy']]

Y1400_incomes = pd.merge(left=Y1400P4S01, right=Y1400P4S02, on='Address', how='outer')
Y1400_incomes = pd.merge(left=Y1400_incomes, right=Y1400P4S03, on='Address', how='outer')
Y1400_incomes = pd.merge(left=Y1400_incomes, right=Y1400P4S04, on='Address', how='outer')

In [24]:
Y1400_incomes[cols_to_sum] = Y1400_incomes[cols_to_sum].apply(pd.to_numeric, errors='coerce')
Y1400_incomes = Y1400_incomes.dropna(thresh=Y1400_incomes.shape[1] - 10)
Y1400_incomes = Y1400_incomes.fillna(0)

Y1400_graduates_income = pd.merge(left=Y1400_graduates, right=Y1400_incomes, on='Address', how='inner')
Y1400_non_graduates_income = pd.merge(left=Y1400_non_graduates, right=Y1400_incomes, on='Address', how='inner')

Y1400_graduates_income['total_income'] = Y1400_graduates_income[cols_to_sum].sum(axis=1)
Y1400_non_graduates_income['total_income'] = Y1400_non_graduates_income[cols_to_sum].sum(axis=1)

In [25]:
Y1400_graduates_income_mean = Y1400_graduates_income['total_income'].mean()

In [26]:
Y1400_non_graduates_income_mean = Y1400_non_graduates_income['total_income'].mean()

In [27]:
print(f"Difference between average: {Y1400_graduates_income_mean - Y1400_non_graduates_income_mean:,.2f}")

Difference between average: 256,663,772.88


In [28]:
Y1400_graduates_income.drop(['degree'], axis=1, inplace=True)
Y1400_non_graduates_income.drop(['degree'], axis=1, inplace=True)

Y1400_graduates_income_anderson = stats.anderson(Y1400_graduates_income['total_income'].to_numpy())

anderson(Y1400_graduates_income_anderson)

Y1400_non_graduates_income_anderson = stats.anderson(Y1400_non_graduates_income['total_income'].to_numpy())

anderson(Y1400_non_graduates_income_anderson)

u_statistic, p_value = mannwhitneyu(Y1400_graduates_income['total_income'], Y1400_non_graduates_income['total_income'],
                                    alternative='two-sided')

print(f"U-statistic: {u_statistic} {p_value}")
print(
    f"P-value: {p_value} so there's {'no' if p_value > ALPHA else 'an'} evidence about that graduated people earn more")

Statistic: 976.2589569093834
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 4659.831442217517
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
U-statistic: 7038374891.5 0.0
P-value: 0.0 so there's an evidence about that graduated people earn more


In [29]:
Y1400_graduates_income_yeojohnson = stats.yeojohnson(Y1400_graduates_income['total_income'])
Y1400_non_graduates_income_yeojohnson = stats.yeojohnson(Y1400_graduates_income['total_income'])

Y1400_non_graduates_income_anderson = stats.anderson(Y1400_graduates_income_yeojohnson[0])
Y1400_non_graduates_income_yeojohnson = stats.anderson(Y1400_non_graduates_income_yeojohnson[0])

anderson(Y1400_non_graduates_income_anderson)
anderson(Y1400_non_graduates_income_yeojohnson)

Statistic: 750.5437257745943
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 750.5437257745943
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.


تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود دارد

# 1401

In [30]:
U1401P1 = pd.read_excel('../datasets/U1401.xlsx', sheet_name='U1401P1')
U1401P4S01 = pd.read_excel('../datasets/U1401.xlsx', sheet_name='U1401P4S01')
U1401P4S02 = pd.read_excel('../datasets/U1401.xlsx', sheet_name='U1401P4S02')
U1401P4S03 = pd.read_excel('../datasets/U1401.xlsx', sheet_name='U1401P4S03')
U1401P4S04 = pd.read_excel('../datasets/U1401.xlsx', sheet_name='U1401P4S04')

R1401P1 = pd.read_excel('../datasets/R1401.xlsx', sheet_name='R1401P1')
R1401P4S01 = pd.read_excel('../datasets/R1401.xlsx', sheet_name='R1401P4S01')
R1401P4S02 = pd.read_excel('../datasets/R1401.xlsx', sheet_name='R1401P4S02')
R1401P4S03 = pd.read_excel('../datasets/R1401.xlsx', sheet_name='R1401P4S03')
R1401P4S04 = pd.read_excel('../datasets/R1401.xlsx', sheet_name='R1401P4S04')

In [31]:
Y1401P1 = pd.concat([U1401P1, R1401P1])
Y1401P4S01 = pd.concat([U1401P4S01, R1401P4S01])
Y1401P4S02 = pd.concat([U1401P4S02, R1401P4S02])
Y1401P4S03 = pd.concat([U1401P4S03, R1401P4S03])
Y1401P4S04 = pd.concat([U1401P4S04, R1401P4S04])

Y1401P1['degree'] = Y1401P1['degree'].str.lower()

Y1401P1['degree'].value_counts()

degree
elemantry     38023
secondary     20466
diploma       19428
bachelor      10142
highschool     5912
college        3247
master         2104
other           370
phd             141
Name: count, dtype: int64

In [32]:
Y1401_graduates = Y1401P1[Y1401P1['degree'].isin(['phd', 'master', 'bachelor', 'college'])]
Y1401_non_graduates = Y1401P1[Y1401P1['degree'].isin(['elementary', 'diploma', 'secondary', 'highschool'])]

print(f"Number of graduates in urban: {Y1401_graduates.shape[0]}")
print(f"Number of non-graduates in urban: {Y1401_non_graduates.shape[0]}")

Y1401_graduates = Y1401_graduates[['Address', 'degree']]
Y1401_non_graduates = Y1401_non_graduates[['Address', 'degree']]

Number of graduates in urban: 15634
Number of non-graduates in urban: 45806


In [33]:
Y1401P4S01 = Y1401P4S01[['Address', 'netincome_w_y', 'perk_w_y', 'wage_w_y']]
Y1401P4S02 = Y1401P4S02[['Address', 'income_s_y']]
Y1401P4S03 = Y1401P4S03[
    ['Address', 'income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']]
Y1401P4S04 = Y1401P4S04[['Address', 'subsidy']]

Y1401_incomes = pd.merge(left=Y1401P4S01, right=Y1401P4S02, on='Address', how='outer')
Y1401_incomes = pd.merge(left=Y1401_incomes, right=Y1401P4S03, on='Address', how='outer')
Y1401_incomes = pd.merge(left=Y1401_incomes, right=Y1401P4S04, on='Address', how='outer')

Y1401_incomes[cols_to_sum] = Y1401_incomes[cols_to_sum].apply(pd.to_numeric, errors='coerce')
Y1401_incomes = Y1401_incomes.dropna(thresh=Y1401_incomes.shape[1] - 10)
Y1401_incomes = Y1401_incomes.fillna(0)

Y1401_graduates_income = pd.merge(left=Y1401_graduates, right=Y1401_incomes, on='Address', how='inner')
Y1401_non_graduates_income = pd.merge(left=Y1401_non_graduates, right=Y1401_incomes, on='Address', how='inner')

Y1401_graduates_income['total_income'] = Y1401_graduates_income[cols_to_sum].sum(axis=1)
Y1401_non_graduates_income['total_income'] = Y1401_non_graduates_income[cols_to_sum].sum(axis=1)

In [34]:
Y1401_graduates_income_mean = Y1401_graduates_income['total_income'].mean()

In [35]:
Y1401_non_graduates_income_mean = Y1401_non_graduates_income['total_income'].mean()

In [36]:
print(f"{Y1401_graduates_income_mean - Y1401_non_graduates_income_mean:,.2f}")

345,311,085.48


In [37]:
Y1401_graduates_income.drop(['degree'], axis=1, inplace=True)
Y1401_non_graduates_income.drop(['degree'], axis=1, inplace=True)
Y1401_graduates_income_anderson = stats.anderson(Y1401_graduates_income['total_income'].to_numpy())

anderson(Y1401_graduates_income_anderson)

Y1401_non_graduates_income_anderson = stats.anderson(Y1401_non_graduates_income['total_income'].to_numpy())

anderson(Y1401_non_graduates_income_anderson)

u_statistic, p_value = mannwhitneyu(Y1401_graduates_income['total_income'], Y1401_non_graduates_income['total_income'],
                                    alternative='two-sided')

print(f"U-statistic: {u_statistic} {p_value}")
print(
    f"P-value: {p_value} so there's {'no' if p_value > ALPHA else 'an'} evidence about that graduated people earn more")

Statistic: 1257.8587521749942
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 4255.60946428898
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
U-statistic: 6852695537.0 0.0
P-value: 0.0 so there's an evidence about that graduated people earn more


In [38]:
Y1401_graduates_income_yeojohnson = stats.yeojohnson(Y1401_graduates_income['total_income'])
Y1401_non_graduates_income_yeojohnson = stats.yeojohnson(Y1401_graduates_income['total_income'])

Y1401_non_graduates_income_anderson = stats.anderson(Y1401_graduates_income_yeojohnson[0])
Y1401_non_graduates_income_yeojohnson = stats.anderson(Y1401_non_graduates_income_yeojohnson[0])

anderson(Y1401_non_graduates_income_anderson)
anderson(Y1401_non_graduates_income_yeojohnson)

Statistic: 1252.7373921282488
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 1252.7373921282488
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.


تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود دارد

# All Years

In [39]:
AP1 = pd.concat([U98P1, R98P1, U99P1, R99P1, U1400P1, R1400P1, U1401P1, R1401P1])
AP4S01 = pd.concat([U98P4S01, R98P4S01, U99P4S01, R99P4S01, U1400P4S01, R1400P4S01, U1401P4S01, R1401P4S01])
AP4S02 = pd.concat([U98P4S02, R98P4S02, U99P4S02, R99P4S02, U1400P4S02, R1400P4S02, U1401P4S02, R1401P4S02])
AP4S03 = pd.concat([U98P4S03, R98P4S03, U99P4S03, R99P4S03, U1400P4S03, R1400P4S03, U1401P4S03, R1401P4S03])
AP4S04 = pd.concat([U98P4S04, R98P4S04, U99P4S04, R99P4S04, U1400P4S04, R1400P4S04, U1401P4S04, R1401P4S04])

AP1['degree'] = AP1['degree'].str.lower()

AP1['degree'].value_counts()

A_graduates = AP1[AP1['degree'].isin(['phd', 'master', 'bachelor', 'college'])]
A_non_graduates = AP1[AP1['degree'].isin(['elementary', 'diploma', 'secondary', 'highschool'])]

print(f"Number of graduates in urban: {A_graduates.shape[0]}")
print(f"Number of non-graduates in urban: {A_non_graduates.shape[0]}")

A_graduates

Number of graduates in urban: 62939
Number of non-graduates in urban: 181518


Unnamed: 0,Address,member,relation,gender,age,literacy,studying,degree,occupationalst,maritalst
1,10004004223,2,Spouse,Female,61,literate,No,college,IncomeWOJob,Married
8,10009008108,3,Child,Male,34,literate,No,master,employed,Single
9,10009008108,4,Child,Female,27,literate,Yes,master,unemployed,Single
12,10009008117,3,Child,Male,21,literate,No,college,unemployed,Single
19,10006005516,1,Head,Male,58,literate,No,bachelor,IncomeWOJob,Married
...,...,...,...,...,...,...,...,...,...,...
61712,21702575438,3,Child,Female,21,literate,Yes,bachelor,Student,Single
61716,21702575441,2,Spouse,Female,43,literate,No,college,employed,Married
61717,21702575441,3,Child,Female,19,literate,Yes,college,Student,Single
61725,21702575829,1,Head,Male,48,literate,No,bachelor,employed,Married


In [40]:
AP4S01 = AP4S01[['Address', 'netincome_w_y', 'perk_w_y', 'wage_w_y']]
AP4S02 = AP4S02[['Address', 'income_s_y']]
AP4S03 = AP4S03[
    ['Address', 'income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']]
AP4S04 = AP4S04[['Address', 'subsidy']]

A_incomes = pd.merge(left=AP4S01, right=AP4S02, on='Address', how='outer')
A_incomes = pd.merge(left=A_incomes, right=AP4S03, on='Address', how='outer')
A_incomes = pd.merge(left=A_incomes, right=AP4S04, on='Address', how='outer')

A_incomes

Unnamed: 0,Address,netincome_w_y,perk_w_y,wage_w_y,income_s_y,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,subsidy
0,10001000108,,,,,0392000000,,,,,,27300000.0
1,10001000111,5.640000e+08,0.0,5.640000e+08,,,,,,,,16380000.0
2,10001000113,2.400000e+08,0.0,2.400000e+08,,0620000000,,,28000000.0,,,3010000.0
3,10001000113,2.400000e+08,0.0,2.400000e+08,,0620000000,,,28000000.0,,,16380000.0
4,10001000114,,,,,0144000000,,,,,,5460000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2060851,23006709829,,,,300000000.0,0,0,0.0,24600000.0,0.0,0.0,27300000.0
2060852,23006709832,1.104400e+09,180000000.0,9.244000e+08,1170000000,0,540000000,0,96530000.0,0,0,1015000.0
2060853,23006709835,1.111400e+09,18000000.0,1.093400e+09,,0,0,0,129530000.0,0,0,1015000.0
2060854,23006709838,,,,,0,0,0,100700000.0,0,200000000,105000.0


In [41]:
A_incomes[cols_to_sum] = A_incomes[cols_to_sum].apply(pd.to_numeric, errors='coerce')
A_incomes = A_incomes.dropna(thresh=A_incomes.shape[1] - 10)
A_incomes = A_incomes.fillna(0)

A_graduates_income = pd.merge(left=A_graduates, right=A_incomes, on='Address', how='inner')
A_non_graduates_income = pd.merge(left=A_non_graduates, right=A_incomes, on='Address', how='inner')

A_graduates_income['total_income'] = A_graduates_income[cols_to_sum].sum(axis=1)
A_non_graduates_income['total_income'] = A_non_graduates_income[cols_to_sum].sum(axis=1)

A_graduates_income

Unnamed: 0,Address,member,relation,gender,age,literacy,studying,degree,occupationalst,maritalst,...,wage_w_y,income_s_y,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,subsidy,total_income
0,10004004223,2,Spouse,Female,61,literate,No,college,IncomeWOJob,Married,...,0.0,44500000.0,358000000.0,0.0,0.0,0.0,0.0,0.0,10920000.0,4.134200e+08
1,10004004223,2,Spouse,Female,61,literate,No,college,IncomeWOJob,Married,...,0.0,44500000.0,358000000.0,0.0,0.0,0.0,0.0,0.0,10920000.0,4.134200e+08
2,10004004223,2,Spouse,Female,61,literate,No,college,IncomeWOJob,Married,...,0.0,44500000.0,358000000.0,0.0,0.0,0.0,0.0,0.0,5460000.0,4.079600e+08
3,10004004223,2,Spouse,Female,61,literate,No,college,IncomeWOJob,Married,...,0.0,44500000.0,262000000.0,0.0,0.0,0.0,0.0,0.0,10920000.0,3.174200e+08
4,10004004223,2,Spouse,Female,61,literate,No,college,IncomeWOJob,Married,...,0.0,44500000.0,262000000.0,0.0,0.0,0.0,0.0,0.0,10920000.0,3.174200e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2329630,21702575832,1,Head,Male,47,literate,No,bachelor,employed,Married,...,842400000.0,71600000.0,0.0,0.0,6100000.0,0.0,0.0,0.0,16380000.0,1.898880e+09
2329631,21702575832,1,Head,Male,47,literate,No,bachelor,employed,Married,...,842400000.0,71600000.0,0.0,0.0,6100000.0,0.0,0.0,0.0,1960000.0,1.884460e+09
2329632,21702575832,1,Head,Male,47,literate,No,bachelor,employed,Married,...,842400000.0,71600000.0,0.0,0.0,6100000.0,0.0,0.0,0.0,16380000.0,1.898880e+09
2329633,21702575832,1,Head,Male,47,literate,No,bachelor,employed,Married,...,842400000.0,71600000.0,0.0,0.0,6100000.0,0.0,0.0,0.0,1260000.0,1.883760e+09


In [42]:
A_graduates_income_mean = A_graduates_income['total_income'].mean()
A_graduates_income_mean

A_non_graduates_income_mean = A_non_graduates_income['total_income'].mean()
A_non_graduates_income_mean

print(f"Difference between average: {A_graduates_income_mean - A_non_graduates_income_mean:,.2f}")

A_graduates_income.info()
A_graduates_income.drop(['degree'], axis=1, inplace=True)
A_non_graduates_income.drop(['degree'], axis=1, inplace=True)

A_graduates_income_anderson = stats.anderson(A_graduates_income['total_income'].to_numpy())

anderson(A_graduates_income_anderson)

A_non_graduates_income_anderson = stats.anderson(A_non_graduates_income['total_income'].to_numpy())

anderson(A_non_graduates_income_anderson)

A_graduates_income.info()

u_statistic, p_value = mannwhitneyu(A_graduates_income['total_income'], A_graduates_income['total_income'],
                                    alternative='two-sided')

print(f"U-statistic: {u_statistic} {p_value}")
print(
    f"P-value: {p_value} so there's {'no' if p_value > ALPHA else 'an'} evidence about that graduated people earn more")


Difference between average: 204,656,454.89
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2329635 entries, 0 to 2329634
Data columns (total 22 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Address          int64  
 1   member           int64  
 2   relation         object 
 3   gender           object 
 4   age              int64  
 5   literacy         object 
 6   studying         object 
 7   degree           object 
 8   occupationalst   object 
 9   maritalst        object 
 10  netincome_w_y    float64
 11  perk_w_y         float64
 12  wage_w_y         float64
 13  income_s_y       float64
 14  income_pension   float64
 15  income_rent      float64
 16  income_interest  float64
 17  income_aid       float64
 18  income_resale    float64
 19  income_transfer  float64
 20  subsidy          float64
 21  total_income     float64
dtypes: float64(12), int64(3), object(7)
memory usage: 391.0+ MB
Statistic: 59424.79930512281
Critical Values: [0.576 0.656 

In [43]:
A_graduates_income_yeojohnson = stats.yeojohnson(A_graduates_income['total_income'])
A_non_graduates_income_yeojohnson = stats.yeojohnson(A_graduates_income['total_income'])

A_non_graduates_income_anderson = stats.anderson(A_graduates_income_yeojohnson[0])
A_non_graduates_income_yeojohnson = stats.anderson(A_non_graduates_income_yeojohnson[0])

anderson(A_non_graduates_income_anderson)
anderson(A_non_graduates_income_yeojohnson)

Statistic: 55665.25716345012
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.
Statistic: 55665.25716345012
Critical Values: [0.576 0.656 0.787 0.918 1.092]
Significance Levels: [15.  10.   5.   2.5  1. ]
At the 15.0% significance level, data is NOT normal.
At the 10.0% significance level, data is NOT normal.
At the 5.0% significance level, data is NOT normal.
At the 2.5% significance level, data is NOT normal.
At the 1.0% significance level, data is NOT normal.


تفاوت معناداری بین مجموع درآمد سالانه افرادی که به دانشگاه رفته‌اند و افرادی که به دانشگاه نرفته‌اند وجود ندارد