In [6]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn import linear_model
import matplotlib.pyplot as plt

%matplotlib inline

### READ IN DATA AND ASSIGN COLUMN NAMES

In [12]:
# Load the census CSV. Column names are passed through `names=`, so pandas
# treats every line of the file as data rather than as a header.
df = pd.read_csv(
    "census-income.data.csv",
    sep=",",
    names=[
        'age', 'workclass', 'fnlwgt',
        'education', 'educational-num', 'marital-status',
        'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week',
        'native-country', 'income',
    ],
)

### MARK ' ?' ENTRIES AS MISSING AND REMOVE WHITESPACE FROM INCOME LABELS

In [13]:
# Turn the ' ?' placeholder into NaN and drop the leading space from the two
# income labels. The replacements are applied frame-wide, one after another;
# any other string values are left exactly as read.
df = (
    df.replace(' ?', np.nan)
      .replace(' <=50K', '<=50K')
      .replace(' >50K', '>50K')
)

### REMOVE ROWS WITH MISSING VALUES


In [14]:
# Number of missing (NaN) entries in each column.
df.isna().sum()


age                   0
workclass          1836
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         1843
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      583
income                0
dtype: int64

In [15]:
# Keep only the rows in which workclass, occupation and native-country are
# all populated (the three columns the null count above reported as non-zero).
df = df.dropna(subset=['workclass', 'occupation', 'native-country'])

# Percentage of missing entries left in each column after the filter.
df.isna().sum() * 100 / len(df)

age                0.0
workclass          0.0
fnlwgt             0.0
education          0.0
educational-num    0.0
marital-status     0.0
occupation         0.0
relationship       0.0
race               0.0
gender             0.0
capital-gain       0.0
capital-loss       0.0
hours-per-week     0.0
native-country     0.0
income             0.0
dtype: float64

### BALANCE THE DATA

In [16]:
# Undersample the '<=50K' class so it does not dominate the '>50K' class.
# The cap and the seed are named so the draw is easy to audit and reproduce.
MAJORITY_CLASS_CAP = 10000
SAMPLING_SEED = 42

below_50 = df[df['income'] == '<=50K']
above_50 = df[df['income'] == '>50K']

# Use a seeded random draw rather than `head()`: taking the first N rows ties
# the sample to whatever order the file happens to be in. `min()` keeps the
# previous tolerance for having fewer rows than the cap (`sample` would raise
# otherwise), and `sort_index()` puts the drawn rows back in their original
# order so the concatenated frame is laid out as before.
# NOTE(review): the cap is unchanged, so the two classes end up of similar,
# not necessarily equal, size.
below_50_sample = (
    below_50
    .sample(n=min(MAJORITY_CLASS_CAP, len(below_50)), random_state=SAMPLING_SEED)
    .sort_index()
)

frames = [below_50_sample, above_50]
df_balanced = pd.concat(frames)

In [48]:
# Count the '>50K' rows in the balanced frame (each True in the mask adds 1).
int((df_balanced['income'] == '>50K').sum())

7508

### CONVERT NON NUMERIC DATA TO NUMERIC DATA AND REMOVE IRRELEVANT COLUMNS

In [49]:
def _one_hot(series):
    """Return one indicator column per distinct value of `series`.

    The dtype is pinned to uint8. Since pandas 2.0 `get_dummies` produces
    bool columns by default, and the later `select_dtypes(['number'])` step
    in this notebook does not treat bool as numeric, so without the explicit
    dtype every indicator column would be silently dropped there.
    """
    return pd.get_dummies(series, dtype=np.uint8)


# One indicator frame per categorical feature; the frames are concatenated
# onto the balanced data in a following cell.
gender_split = _one_hot(df_balanced['gender'])
occupation_split = _one_hot(df_balanced['occupation'])
rel_split = _one_hot(df_balanced['relationship'])
race_split = _one_hot(df_balanced['race'])
work_split = _one_hot(df_balanced['workclass'])
country_split = _one_hot(df_balanced['native-country'])

In [50]:
# Append the six indicator frames to the right of the original columns
# (column-wise concatenation, aligned on the row index).
df_balanced_dummies = pd.concat(
    [
        df_balanced,
        gender_split,
        occupation_split,
        rel_split,
        race_split,
        work_split,
        country_split,
    ],
    axis='columns',
)

In [51]:
# Trim surrounding whitespace from every column name (indicator columns are
# named after category values, which may carry padding).
col_names = df_balanced_dummies.columns.tolist()
col_names_strip = [name.strip() for name in col_names]
df_balanced_dummies.columns = col_names_strip

# Encode the target as 0 / 1; any label not listed in the mapping becomes NaN.
income_codes = {'<=50K': 0, '>50K': 1}
df_balanced_dummies['income'] = df_balanced_dummies['income'].map(income_codes)

# Keep only the numeric columns; the remaining text columns are discarded.
df_balanced_dummies = df_balanced_dummies.select_dtypes(include=['number'])


### NORMALIZE THE DATASET

In [52]:
# Replace each listed column with its z-scores as computed by scipy's `zscore`.
# Only these three columns are rescaled; all others keep their raw values.
for column in ('fnlwgt', 'age', 'hours-per-week'):
    df_balanced_dummies[column] = zscore(df_balanced_dummies[column])

### RUN FEATURE SELECTION AND SELECT BEST FEATURES

### RUN MODELS