# Does a **college degree** affect people’s *income*?

### Import Libraries & Dataset

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as st

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
# NOTE(review): the dataset below is read from /content/sample_data, not from
# the mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Overview Dataset

In [None]:
# Load the tab-separated "marketing_campaign.csv" dataset and preview it

csv_path = '/content/sample_data/marketing_campaign.csv'
df = pd.read_csv(csv_path, sep='\t')
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,13-06-2013,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,10-06-2014,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,25-01-2014,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,24-01-2014,8,428,...,3,0,0,0,0,0,0,3,11,0


In [None]:
# (rows, columns) of the raw dataset
df.shape

(2240, 29)

In [None]:
# Per-column non-null counts and dtypes of the raw dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [None]:
# Count missing (NaN) values in the two columns this analysis relies on

df[['Education', 'Income']].isna().sum()

Education     0
Income       24
dtype: int64

In [None]:
# Fill missing Income values with the Income median.
# The fill is restricted to the Income column: a bare df.fillna(value) would
# write the income median into a NaN of *any* column.
# New data set = df_new (df itself is left untouched)

df_new = df.fillna({'Income': df['Income'].median()})

In [None]:
# Mean income before (df) vs. after (df_new) replacing NaN with the median

for frame in (df, df_new):
    print(np.mean(frame['Income']))

52247.25135379061
52237.97544642857


In [None]:
# Number of customers in each education level
df_new['Education'].value_counts()

Graduation    1127
PhD            486
Master         370
2n Cycle       203
Basic           54
Name: Education, dtype: int64

#### Categorize Education

In [None]:
# One sub-frame of df_new per education level
education = df_new['Education']

basic = df_new[education == 'Basic']
graduation = df_new[education == 'Graduation']
second_cycle = df_new[education == '2n Cycle']
master = df_new[education == 'Master']
phd = df_new[education == 'PhD']

### Central Tendency Measurement

#### MEAN

In [None]:
# mean income over the whole (imputed) dataset

df_new['Income'].mean()

52237.97544642857

In [None]:
# mean income for each education level

income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.mean()

Education
2n Cycle      47688.583744
Basic         20306.259259
Graduation    52707.305679
Master        52896.777027
PhD           56096.303498
Name: Income, dtype: float64

#### MEDIAN

In [None]:
# median of all data income

df_new['Income'].median()

51381.5

In [None]:
# median income for each education level

income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.median()

Education
2n Cycle      46891.0
Basic         20744.0
Graduation    51651.0
Master        51044.5
PhD           55005.0
Name: Income, dtype: float64

#### MODE

In [None]:
# mode of income
# Computed on the originally observed incomes (df with NaN dropped) instead of
# df_new: the median-imputed rows all share one value, so on df_new the "mode"
# is just the imputation value repeated once per missing row.

st.mode(df['Income'].dropna())

ModeResult(mode=array([51381.5]), count=array([24]))

In [None]:
# mode of income by each education
# For each education subset, look up the original (un-imputed) incomes in df by
# row index and drop the missing ones, so the median used for imputation cannot
# show up as a group's mode. Output order: basic, graduation, 2n cycle, master, phd.

for subset in (basic, graduation, second_cycle, master, phd):
    observed_income = df.loc[subset.index, 'Income'].dropna()
    print(st.mode(observed_income))


ModeResult(mode=array([7500.]), count=array([2]))
ModeResult(mode=array([51381.5]), count=array([11]))
ModeResult(mode=array([7500.]), count=array([4]))
ModeResult(mode=array([51381.5]), count=array([5]))
ModeResult(mode=array([51381.5]), count=array([5]))


### Spread Measurement

#### VARIANCE

In [None]:
# variance of income
# Sample variance (ddof=1), matching the per-education .var() and the .std()
# figures in this section; np.var defaults to the population variance (ddof=0),
# which made the overall number inconsistent with the grouped ones.

df_new['Income'].var()

626619369.4507393

In [None]:
# sample variance of income for each education level

income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.var()

Education
2n Cycle      4.821932e+08
Basic         3.887606e+07
Graduation    7.862153e+08
Master        4.008620e+08
PhD           4.207461e+08
Name: Income, dtype: float64

#### STANDARD DEVIATION

In [None]:
# standard deviation of income over the whole (imputed) dataset
df_new['Income'].std()

25037.955890621957

In [None]:
# standard deviation of income for each education level
income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.std()

Education
2n Cycle      21958.898270
Basic          6235.066773
Graduation    28039.531293
Master        20021.539473
PhD           20512.097287
Name: Income, dtype: float64

#### RANGE

In [None]:
# range of income: largest minus smallest value
income_all = df_new['Income']
income_all.max() - income_all.min()

664936.0

In [None]:
# range of income (max - min) for each education level
income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.max() - income_by_education.min()

Education
2n Cycle       89047.0
Basic          26945.0
Graduation    664936.0
Master        151173.0
PhD           158374.0
Name: Income, dtype: float64

#### QUARTILE

In [None]:
# Quartiles and interquartile range of income

income_all = df_new['Income']
q1, q2, q3 = (income_all.quantile(p) for p in (0.25, 0.5, 0.75))

print("q1 \t\t\t:", q1)
print("q2 \t\t\t:", q2)
print("q3 \t\t\t:", q3)
print("interquartile range \t:", q3 - q1)

q1 			: 35538.75
q2 			: 51381.5
q3 			: 68289.75
interquartile range 	: 32751.0


##### Quartile by category education

In [None]:
# q1: 25th percentile of income for each education level
income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.quantile(0.25)

Education
2n Cycle      28095.00
Basic         15405.25
Graduation    34925.50
Master        37853.75
PhD           40654.25
Name: Income, dtype: float64

In [None]:
# q2: 50th percentile (median) of income for each education level
income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.quantile(0.5)

Education
2n Cycle      46891.0
Basic         20744.0
Graduation    51651.0
Master        51044.5
PhD           55005.0
Name: Income, dtype: float64

In [None]:
# q3: 75th percentile of income for each education level
income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.quantile(0.75)

Education
2n Cycle      65100.5
Basic         24882.0
Graduation    69737.0
Master        66596.0
PhD           69084.0
Name: Income, dtype: float64

In [None]:
# interquartile range (q3 - q1) of income for each education level
income_by_education = df_new.groupby(['Education'])['Income']
income_by_education.quantile(0.75) - income_by_education.quantile(0.25)

Education
2n Cycle      37005.50
Basic          9476.75
Graduation    34811.50
Master        28742.25
PhD           28429.75
Name: Income, dtype: float64

### Hypothesis Testing

H0 : college degree **doesn’t affect** people’s income

H1 : college degree **affects** people’s income

In [None]:
# Per-education sub-frames of df_new used as the ANOVA samples
# (same definitions as in the "Categorize Education" cell)
education = df_new['Education']

basic = df_new[education == 'Basic']
graduation = df_new[education == 'Graduation']
second_cycle = df_new[education == '2n Cycle']
master = df_new[education == 'Master']
phd = df_new[education == 'PhD']

In [None]:
# One-way ANOVA on income across the five education groups
income_samples = [
    subset['Income']
    for subset in (basic, graduation, second_cycle, master, phd)
]
anova_test = st.f_oneway(*income_samples)

In [None]:
# p-value of the one-way ANOVA computed above
anova_test.pvalue

1.075202108156874e-22

#### Testing Result

In [None]:
# Compare the ANOVA p-value against the 0.05 threshold and report the outcome
significance_level = 0.05
verdict = (
    'H0 is accepted : college degree doesn’t affect people’s income'
    if anova_test.pvalue > significance_level
    else 'H0 is declined : college degree affect people’s income'
)
print(verdict)

H0 is declined : college degree affect people’s income
