In [86]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [18]:
# Column labels for the census file, in file order; they are supplied here
# via `names` rather than read from the file.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

df = pd.read_csv(
    "census-income.data.csv",
    sep=",",
    names=column_names,
    # Unknown values are stored as the literal ' ?' (with a leading space).
    # Treating it as NaN at load time lets the missing-value checks that
    # follow see the gaps on a clean top-to-bottom run, instead of relying
    # on a later cell having been executed first.
    na_values=" ?",
)

In [19]:
# Show the column labels that were assigned through `names` at load time.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [20]:
# Row count per gender label. The labels carry a leading space (visible in the
# output), so any comparison against them must include it.
df['gender'].value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

In [23]:
# Number of NaN entries per column. Only real NaN values are counted: a literal
# ' ?' placeholder string is not null, so the non-zero counts recorded below
# presuppose that the placeholders have already been converted to NaN.
df.isnull().sum()

age                   0
workclass          1836
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         1843
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      583
income                0
dtype: int64

In [24]:
# Normalise raw strings across the whole frame: the ' ?' placeholder becomes
# NaN, and the two income labels lose their leading space.
df = (
    df.replace(' ?', np.nan)
      .replace(' <=50K', '<=50K')
      .replace(' >50K', '>50K')
)


In [25]:
# Share of missing entries per column, expressed as a percentage of all rows.
df.isnull().sum() * 100 / len(df)

age                0.000000
workclass          5.638647
fnlwgt             0.000000
education          0.000000
educational-num    0.000000
marital-status     0.000000
occupation         5.660146
relationship       0.000000
race               0.000000
gender             0.000000
capital-gain       0.000000
capital-loss       0.000000
hours-per-week     0.000000
native-country     1.790486
income             0.000000
dtype: float64

In [26]:
# Total number of rows before any are dropped.
df.shape[0]

32561

In [27]:
# Discard every row that has no value in any of the three columns that
# contain NaN; the original row index is kept as-is.
required_cols = ['workclass', 'occupation', 'native-country']
df = df.dropna(subset=required_cols)

# Re-check: percentage of missing entries per column after the drop.
df.isnull().sum() * 100 / len(df)

age                0.0
workclass          0.0
fnlwgt             0.0
education          0.0
educational-num    0.0
marital-status     0.0
occupation         0.0
relationship       0.0
race               0.0
gender             0.0
capital-gain       0.0
capital-loss       0.0
hours-per-week     0.0
native-country     0.0
income             0.0
dtype: float64

In [28]:
# Partition the rows by income label using explicit boolean masks.
is_low_income = df['income'] == '<=50K'
is_high_income = df['income'] == '>50K'
below_50 = df[is_low_income]
above_50 = df[is_high_income]

In [29]:
# Size of the '<=50K' group.
below_50.shape[0]

22654

In [53]:
# Reduce the '<=50K' group to its first 10000 rows (a positional slice, not a
# random sample) and stack it on top of every '>50K' row.
frames = [below_50.iloc[:10000], above_50]
df_actual = pd.concat(frames)

In [60]:
# One-hot encode the categorical features. Each *_split frame holds the
# indicator columns for one source column; the indicator columns are named
# after the raw category values, with no prefix.
encoded_sources = [
    'gender', 'occupation', 'relationship',
    'race', 'workclass', 'native-country',
]
(gender_split, occupation_split, rel_split,
 race_split, work_split, country_split) = [
    pd.get_dummies(df_actual[source]) for source in encoded_sources
]

# Append every indicator block to the right of the original columns.
df_dummies = pd.concat(
    [
        df_actual,
        gender_split,
        occupation_split,
        rel_split,
        race_split,
        work_split,
        country_split,
    ],
    axis=1,
)

In [72]:
# List the combined columns: the original features followed by the indicator
# columns, whose names still carry the leading space of the raw category values.
df_dummies.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income', ' Female', ' Male', ' Adm-clerical', ' Armed-Forces',
       ' Craft-repair', ' Exec-managerial', ' Farming-fishing',
       ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service',
       ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales',
       ' Tech-support', ' Transport-moving', ' Husband', ' Not-in-family',
       ' Other-relative', ' Own-child', ' Unmarried', ' Wife',
       ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
       ' White', ' Federal-gov', ' Local-gov', ' Private', ' Self-emp-inc',
       ' Self-emp-not-inc', ' State-gov', ' Without-pay', ' Cambodia',
       ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic',
       ' Ecuador', ' El-Salvador', ' England', ' France', ' Germany',
  

In [73]:
# Remove surrounding whitespace from every column label, so the indicator
# columns lose the leading space they inherited from the raw category values.
col_names = df_dummies.columns.tolist()
col_names_strip = [label.strip() for label in col_names]
df_dummies.columns = col_names_strip

In [80]:
# Class balance of the target column before it is encoded.
df_dummies['income'].value_counts()

<=50K    10000
>50K      7508
Name: income, dtype: int64

In [81]:
# Encode the target as integers. Any value that is not one of the two keys
# becomes NaN, which includes already-encoded values if this cell is re-run.
income_codes = {'<=50K': 0, '>50K': 1}
df_dummies['income'] = df_dummies['income'].map(income_codes)

In [82]:
# Confirm the encoding: the counts should match the label counts seen before
# the mapping, now keyed by 0 and 1.
df_dummies['income'].value_counts()

0    10000
1     7508
Name: income, dtype: int64

In [83]:
# Keep only model-ready columns: the numeric features plus the one-hot
# indicators. The remaining string columns are dropped here.
# pandas >= 2.0 produces the indicators as bool, which the 'number' selector
# alone would silently discard, so bool is included explicitly. On older
# pandas the indicators are integer-typed and the result is unchanged.
df_dummies = df_dummies.select_dtypes(include=['number', 'bool'])

In [84]:
# Preview the first rows of the reduced feature table.
df_dummies.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,Female,Male,Adm-clerical,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,39,77516,13,2174,0,40,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
# Replace each listed feature with its z-score, computed over the column.
# The other numeric columns are left on their original scale.
for feature in ('fnlwgt', 'age', 'hours-per-week'):
    df_dummies[feature] = zscore(df_dummies[feature])

In [100]:
# Preview the first 20 rows to check the standardised columns.
df_dummies.head(20)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,Female,Male,Adm-clerical,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,-0.063093,-1.067551,13,2174,0,-0.18257,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,0.799532,-1.012165,13,0,0,-2.4667,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.141513,0.252644,9,0,0,-0.18257,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,1.034794,0.434956,7,0,0,-0.18257,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.925718,1.425967,13,0,0,-0.18257,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-0.219934,0.911508,14,0,0,-0.18257,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0.721112,-0.277413,5,0,0,-2.212908,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12,-1.317821,-0.63979,13,0,0,-1.028544,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
13,-0.612036,0.151075,12,0,0,0.663405,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
15,-0.455195,0.537853,4,0,0,0.240418,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
