In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

label_encoder = preprocessing.LabelEncoder()


In [2]:
# Load the income dataset from a CSV file in the working directory.
DATA_PATH = 'income_data_df_eda.csv'
income_data = pd.read_csv(DATA_PATH)

In [3]:
# Count missing values per column (isna is the same check as isnull).
income_data.isna().sum()

Age               0
Workclass         0
Final_Weight      0
Education         0
Education_Num     0
Marital_Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital_gain      0
Capital_loss      0
Hours_per_week    0
Native_country    0
Income            0
dtype: int64

In [4]:
# Count the distinct values in each column to see the cardinality of every feature.
income_data.nunique()

Age                  73
Workclass             9
Final_Weight      21648
Education            16
Education_Num        16
Marital_Status        7
Occupation           15
Relationship          6
Race                  5
Sex                   2
Capital_gain        119
Capital_loss         92
Hours_per_week       94
Native_country       42
Income                2
dtype: int64

In [5]:
# Summarise column dtypes and non-null counts before any encoding is applied.
income_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   Final_Weight    32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Education_Num   32561 non-null  int64 
 5   Marital_Status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Sex             32561 non-null  object
 10  Capital_gain    32561 non-null  int64 
 11  Capital_loss    32561 non-null  int64 
 12  Hours_per_week  32561 non-null  int64 
 13  Native_country  32561 non-null  object
 14  Income          32561 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [6]:
# Label-encode the categorical (string) columns into integer codes.
# A separate LabelEncoder is fitted per column and kept in `encoders`, so each
# column's code -> label mapping stays available afterwards
# (encoders[col].classes_ / encoders[col].inverse_transform) instead of being
# overwritten when the next column is fitted.
var = ['Workclass', 'Education', 'Marital_Status', 'Occupation','Relationship','Race','Sex','Native_country']

# Only encode columns that are not numeric yet. Encoding the integer codes a
# second time would sort them as text ('10' < '2') and silently scramble the
# codes, so this guard makes the cell safe to re-run.
pending = [col for col in var if not pd.api.types.is_numeric_dtype(income_data[col])]
if pending:
    encoders = {}
    for col in pending:
        encoders[col] = preprocessing.LabelEncoder()
        # astype(str) keeps the labels as text; a bytes conversion is not
        # needed and cannot represent non-ASCII labels.
        income_data[col] = encoders[col].fit_transform(income_data[col].astype(str))

income_data.head()  # preview of the encoded dataframe

Unnamed: 0,Age,Workclass,Final_Weight,Education,Education_Num,Marital_Status,Occupation,Relationship,Race,Sex,Capital_gain,Capital_loss,Hours_per_week,Native_country,Income
0,90,0,77053,11,9,6,0,1,4,0,0,4356,40,39,0
1,82,4,132870,11,9,6,4,1,4,0,0,4356,18,39,0
2,66,0,186061,15,10,6,0,4,2,0,0,4356,40,39,0
3,54,4,140359,5,4,0,7,4,4,0,0,3900,40,39,0
4,41,4,264663,15,10,5,10,3,4,0,0,3900,40,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,4,310152,15,10,4,11,1,4,1,0,0,40,39,0
32557,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
32558,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
32559,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0


In [7]:
# Re-check dtypes after encoding: the former object columns should now be integer typed.
income_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Age             32561 non-null  int64
 1   Workclass       32561 non-null  int32
 2   Final_Weight    32561 non-null  int64
 3   Education       32561 non-null  int32
 4   Education_Num   32561 non-null  int64
 5   Marital_Status  32561 non-null  int32
 6   Occupation      32561 non-null  int32
 7   Relationship    32561 non-null  int32
 8   Race            32561 non-null  int32
 9   Sex             32561 non-null  int32
 10  Capital_gain    32561 non-null  int64
 11  Capital_loss    32561 non-null  int64
 12  Hours_per_week  32561 non-null  int64
 13  Native_country  32561 non-null  int32
 14  Income          32561 non-null  int64
dtypes: int32(8), int64(7)
memory usage: 2.7 MB


In [8]:
# Check for multicollinearity: compute the variance inflation factor (VIF) of
# every predictor (all columns except the target, Income).
# NOTE(review): the matrix is passed without an added intercept column, so the
# values are VIFs for a regression without a constant - confirm this is intended.
feature_cols = ['Age', 'Workclass', 'Final_Weight', 'Education', 'Education_Num',
                'Marital_Status', 'Occupation', 'Relationship', 'Race', 'Sex',
                'Capital_gain', 'Capital_loss', 'Hours_per_week', 'Native_country']
variables = income_data[feature_cols]
design_matrix = variables.values
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])],
    'Features': variables.columns,
})

vif

Unnamed: 0,VIF,Features
0,8.521265,Age
1,8.47526,Workclass
2,4.031573,Final_Weight
3,9.210325,Education
4,18.379729,Education_Num
5,3.976179,Marital_Status
6,3.710596,Occupation
7,2.612286,Relationship
8,17.578379,Race
9,4.441675,Sex


Features with VIF greater than 10 will be dropped

In [9]:
# Drop the columns selected for removal by the VIF > 10 rule.
# The list is hard-coded, so it must be kept in sync with the VIF table by hand.
high_vif_cols = ['Education_Num', 'Race', 'Hours_per_week', 'Native_country']
income_data = income_data.drop(columns=high_vif_cols)
income_data.head()

Unnamed: 0,Age,Workclass,Final_Weight,Education,Marital_Status,Occupation,Relationship,Sex,Capital_gain,Capital_loss,Income
0,90,0,77053,11,6,0,1,0,0,4356,0
1,82,4,132870,11,6,4,1,0,0,4356,0
2,66,0,186061,15,6,0,4,0,0,4356,0
3,54,4,140359,5,0,7,4,0,0,3900,0
4,41,4,264663,15,5,10,3,0,0,3900,0


In [10]:
# Remove outliers: keep only rows whose continuous features all lie within
# 3 standard deviations of their column mean.
# The z-score is computed on the continuous columns only. The label-encoded
# categorical columns carry arbitrary integer codes and Income is the target,
# so a distance-from-the-mean test is not meaningful for them.
outlier_cols = ['Age', 'Final_Weight', 'Capital_gain', 'Capital_loss']
z_scores = np.abs(stats.zscore(income_data[outlier_cols]))
within_3_sigma = (z_scores < 3).all(axis=1)
# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning.
# NOTE: re-running this cell filters the already-filtered frame again.
income_data = income_data[within_3_sigma].copy()
income_data.head()

Unnamed: 0,Age,Workclass,Final_Weight,Education,Marital_Status,Occupation,Relationship,Sex,Capital_gain,Capital_loss,Income
1470,62,2,159908,9,2,1,5,0,0,1258,0
1471,72,2,144515,10,2,4,0,1,0,1258,0
1472,76,0,224680,14,2,0,0,1,0,1258,0
1473,66,4,350498,15,2,14,0,1,0,1258,0
1474,33,2,262042,11,0,1,3,0,0,1138,0
...,...,...,...,...,...,...,...,...,...,...,...
32556,22,4,310152,15,4,11,1,1,0,0,0
32557,27,4,257302,7,2,13,5,0,0,0,0
32558,40,4,154374,11,2,7,0,1,0,0,1
32559,58,4,151910,11,6,1,4,0,0,0,0


In [11]:
# Inspect the frame after dropping columns and filtering outliers (row count and remaining columns).
income_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30428 entries, 1470 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Age             30428 non-null  int64
 1   Workclass       30428 non-null  int32
 2   Final_Weight    30428 non-null  int64
 3   Education       30428 non-null  int32
 4   Marital_Status  30428 non-null  int32
 5   Occupation      30428 non-null  int32
 6   Relationship    30428 non-null  int32
 7   Sex             30428 non-null  int32
 8   Capital_gain    30428 non-null  int64
 9   Capital_loss    30428 non-null  int64
 10  Income          30428 non-null  int64
dtypes: int32(6), int64(5)
memory usage: 2.1 MB


In [12]:
# Standardize the continuous features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling - consider fitting on the
# training rows only.
numeric_cols = ['Age', 'Final_Weight', 'Capital_gain', 'Capital_loss']
# Work on an explicit copy: income_data was produced by boolean filtering, and
# assigning columns into such a slice raises SettingWithCopyWarning.
income_data = income_data.copy()
income_data[numeric_cols] = StandardScaler().fit_transform(income_data[numeric_cols])

In [13]:
# Split the data into training and test sets.

# The last column (Income) is the target; every other column is a feature.
data = income_data.values
X = data[:, :-1]
y = data[:, -1]
# Hold out 20% of the rows for testing (80:20 ratio); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)