# 1. Description of Adult dataset

In [6]:
import pandas as pd

In [7]:
df_adult = pd.read_csv("adult.csv")
df_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [8]:
df_adult.shape

(32561, 15)

In [9]:
df_adult.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [10]:
df_adult.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [11]:
df_adult.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

# 2. Data cleaning

## 2.1 Delete missing values

In [12]:
def delete_missing_val(df_adult, col_names):
    for col_name in list(col_names):
        notmissing = df_adult[col_name] != '?'
        df_adult = df_adult[notmissing]
    return df_adult
df_adult = delete_missing_val(df_adult, df_adult.columns)
df_adult

  res_values = method(rvalues)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


## 2.2 Deal with unwanted columns

In [13]:
print(df_adult['capital.gain'].value_counts())
print(df_adult['capital.loss'].value_counts())

0        27624
15024      337
7688       270
7298       240
99999      148
         ...  
401          1
22040        1
4931         1
1455         1
1639         1
Name: capital.gain, Length: 118, dtype: int64
0       28735
1902      194
1977      162
1887      155
1848       50
        ...  
419         1
1411        1
1539        1
2472        1
2467        1
Name: capital.loss, Length: 90, dtype: int64


### 2.2.1 Delete some unwanted columns

In [14]:
df_adult.drop(labels = ['workclass','capital.gain', 'fnlwgt', 'capital.loss', 'education', 'marital.status',\
                        'relationship',], axis = 1, inplace = True)

## 2.3 Count the quantity of values in native country column

In [15]:
df_adult['native.country'].value_counts()

United-States                 27504
Mexico                          610
Philippines                     188
Germany                         128
Puerto-Rico                     109
Canada                          107
El-Salvador                     100
India                           100
Cuba                             92
England                          86
Jamaica                          80
South                            71
China                            68
Italy                            68
Dominican-Republic               67
Vietnam                          64
Guatemala                        63
Japan                            59
Columbia                         56
Poland                           56
Taiwan                           42
Haiti                            42
Iran                             42
Portugal                         34
Nicaragua                        33
Peru                             30
Greece                           29
France                      

## 2.4 Extract the US group

In [16]:
country_class = df_adult.groupby('native.country')
theUSgroup = country_class.get_group('United-States')

### 2.4.1 Store the cleaned new data to a new csv file.

In [17]:
theUSgroup.to_csv('cleaned_theUS_adult.csv')

## 2.5 Check the new dataset is clean

In [18]:
theUSgroup.isnull().sum()

age               0
education.num     0
occupation        0
race              0
sex               0
hours.per.week    0
native.country    0
income            0
dtype: int64

# 3. Analysis

In [19]:
theUSgroup['education.num'].value_counts()

9     9209
10    6260
13    4618
14    1484
11    1233
7      957
12     939
6      752
15     488
4      437
5      350
8      331
16     314
3       78
2       39
1       15
Name: education.num, dtype: int64

It shows that most people in this sample have the education num of 9-10.

## 3.1 Explore the relationship between educational attainment and income.

In [20]:
education_class = theUSgroup.groupby('education.num')

In [21]:
income_series = education_class['income'].value_counts('<=50K')

print("The percentage of people with high income in each group")
print()
for index in income_series.index:
    if (index[1] == '>50K'):
        print("{}% people in education number {} group have >50K income".format((income_series[index]*100).round(2),index[0]))
print("\n\nRespectively, the percentages of those with lower income are as follows:\n")

for index in income_series.index:
    if (index[1] == '<=50K'):
        print("{}% people in education number {} group have <=50K income".format((income_series[index]*100).round(2),index[0]))

The percentage of people with high income in each group

2.56% people in education number 2 group have >50K income
3.85% people in education number 3 group have >50K income
6.64% people in education number 4 group have >50K income
6.29% people in education number 5 group have >50K income
7.58% people in education number 6 group have >50K income
5.75% people in education number 7 group have >50K income
7.85% people in education number 8 group have >50K income
16.73% people in education number 9 group have >50K income
20.24% people in education number 10 group have >50K income
26.2% people in education number 11 group have >50K income
25.67% people in education number 12 group have >50K income
42.77% people in education number 13 group have >50K income
57.21% people in education number 14 group have >50K income
75.2% people in education number 15 group have >50K income
76.11% people in education number 16 group have >50K income


Respectively, the percentages of those with lower income a

## 3.2 Explore the relationship between working hours and the income

In [22]:
theUSgroup.groupby('income')['hours.per.week'].mean()

income
<=50K    39.340826
>50K     45.750536
Name: hours.per.week, dtype: float64

In [23]:
theUSgroup.groupby('income')['hours.per.week'].median()

income
<=50K    40
>50K     42
Name: hours.per.week, dtype: int64

### 3.2.1 Association between factors themselves:  Educational attainment &  Working hours

In [24]:
education_class['hours.per.week'].mean().round()

education.num
1     27.0
2     33.0
3     37.0
4     40.0
5     39.0
6     37.0
7     34.0
8     35.0
9     41.0
10    39.0
11    42.0
12    41.0
13    43.0
14    45.0
15    48.0
16    48.0
Name: hours.per.week, dtype: float64

## 3.3 Explore the income of different occupation

In [25]:
career_income_dict = theUSgroup.groupby('occupation')['income'].value_counts('<=50K').round(3).to_dict()
print("The percentage of high income people in each occupation:\n")
for k in career_income_dict:
    if (k[1] == '>50K'):
        print(k[0]+ ": %" +str(round(career_income_dict[k]*100,2)))

The percentage of high income people in each occupation:

Adm-clerical: %13.3
Armed-Forces: %11.1
Craft-repair: %23.3
Exec-managerial: %48.7
Farming-fishing: %12.6
Handlers-cleaners: %6.1
Machine-op-inspct: %13.3
Other-service: %3.8
Priv-house-serv: %1.1
Prof-specialty: %44.7
Protective-serv: %33.5
Sales: %27.6
Tech-support: %30.2
Transport-moving: %20.5


## 3.4 Explore the relationship between race and the income

In [26]:
theUSgroup['race'].value_counts()

White                 24218
Black                  2629
Asian-Pac-Islander      273
Amer-Indian-Eskimo      271
Other                   113
Name: race, dtype: int64

In [27]:
race_income_pair = theUSgroup.groupby('race')['income'].value_counts('>50K').to_dict()
print("The association between races and income\n")
print("The percentage of high income people of each race: ")
for k in race_income_pair:
    if k[1] == '>50K':
        print(k[0], ":", round(race_income_pair[k], 3))

The association between races and income

The percentage of high income people of each race: 
Amer-Indian-Eskimo : 0.122
Asian-Pac-Islander : 0.245
Black : 0.13
Other : 0.115
White : 0.27


## 3.5 Explore the relationship between age and the income

In [28]:
theUSgroup.groupby('income')['age'].mean()

income
<=50K    36.629724
>50K     44.000429
Name: age, dtype: float64

### 3.5.1 The association between age and educational attainment

In [29]:
education_age_frame = pd.DataFrame(education_class['age'].mean().round()).reset_index()
education_age_frame['education.num'].corr(education_age_frame['age'])

-0.2781503442203127

In [30]:
education_age_frame

Unnamed: 0,education.num,age
0,1,42.0
1,2,56.0
2,3,53.0
3,4,50.0
4,5,42.0
5,6,38.0
6,7,32.0
7,8,31.0
8,9,39.0
9,10,36.0
