In [1]:
# Dataset: https://archive.ics.uci.edu/ml/datasets/Adult
# Goal is to predict whether a person makes over 50K a year.

In [2]:
# This kind of prediction can be used by banks for targeted marketing.

In [3]:
# load libraries
import pandas as pd
import numpy as np

In [4]:
# load the data

df = pd.read_csv("./sample_data/adult.csv")

In [5]:
# EDA

In [6]:
# step 1: sneak peak data
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [7]:
# step 2: identify datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
# step 3: describe columns
df.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [9]:
# identify if there are null values in data
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [10]:
# list of all columns
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [11]:
# list of features and target column name
features = [
    'age', 'workclass', 'fnlwgt', 'education', 'education.num',
    'marital.status', 'occupation', 'relationship', 'race', 'sex',
    'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'
]

target = ['income']

In [12]:
# value counts for target 
df[target].value_counts()

income
<=50K     24720
>50K       7841
dtype: int64

In [13]:
# identify categorical and numerical columns

numerical_features = [ 'age',  'fnlwgt',  'education.num', 'capital.gain', 'capital.loss',  'hours.per.week']
categorical_features = ['workclass',  'education', 'marital.status', 'occupation', 'relationship',  'race', 'sex','native.country', 'income']

In [14]:
# processing of categorical features - using label encoding
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
df[categorical_features] = df[categorical_features].apply(labelEncoder.fit_transform)
df


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,0,77053,11,9,6,0,1,4,0,0,4356,40,39,0
1,82,4,132870,11,9,6,4,1,4,0,0,4356,18,39,0
2,66,0,186061,15,10,6,0,4,2,0,0,4356,40,39,0
3,54,4,140359,5,4,0,7,4,4,0,0,3900,40,39,0
4,41,4,264663,15,10,5,10,3,4,0,0,3900,40,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,4,310152,15,10,4,11,1,4,1,0,0,40,39,0
32557,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
32558,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
32559,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0


In [15]:
# confirm value counts on target variable
df[target].value_counts()

income
0         24720
1          7841
dtype: int64

In [16]:
# scaling numerical features - using min max scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,1.000000,0,0.043987,11,0.533333,6,0,1,4,0,0.0,1.000000,0.397959,39,0
1,0.890411,4,0.081896,11,0.533333,6,4,1,4,0,0.0,1.000000,0.173469,39,0
2,0.671233,0,0.118021,15,0.600000,6,0,4,2,0,0.0,1.000000,0.397959,39,0
3,0.506849,4,0.086982,5,0.200000,0,7,4,4,0,0.0,0.895317,0.397959,39,0
4,0.328767,4,0.171404,15,0.600000,5,10,3,4,0,0.0,0.895317,0.397959,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.068493,4,0.202298,15,0.600000,4,11,1,4,1,0.0,0.000000,0.397959,39,0
32557,0.136986,4,0.166404,7,0.733333,2,13,5,4,0,0.0,0.000000,0.377551,39,0
32558,0.315068,4,0.096500,11,0.533333,2,7,0,4,1,0.0,0.000000,0.397959,39,1
32559,0.561644,4,0.094827,11,0.533333,6,1,4,4,0,0.0,0.000000,0.397959,39,0


In [17]:
# identify input columns and target column
X = df[features]
y = df[target]

In [18]:
# split the data in training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
y_train = y_train.values.ravel()

In [19]:
# machine learning - different models

In [20]:
# decision tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

decisionTree = DecisionTreeClassifier()
decisionTree.fit(X_train, y_train)
y_pred = decisionTree.predict(X_test)
print('accuracy score:', (accuracy_score(y_test, y_pred)))

accuracy score: 0.8082545141874462


In [21]:
# random forest tree

from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier()
randomForest.fit(X_train, y_train)
y_pred = randomForest.predict(X_test)
print('accuracy score:', (accuracy_score(y_test, y_pred)))

accuracy score: 0.854931826556934


In [22]:
# knn

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('accuracy score:', (accuracy_score(y_test, y_pred)))

accuracy score: 0.8242230684191131


In [23]:
# svm

from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('accuracy score:', (accuracy_score(y_test, y_pred)))

accuracy score: 0.7607173565900995


In [24]:
# logistic regression

from sklearn.linear_model import LogisticRegression
logisticsRegression = LogisticRegression(solver='lbfgs', max_iter=1000)
logisticsRegression.fit(X_train, y_train)
y_pred = logisticsRegression.predict(X_test)
print('accuracy score:', (accuracy_score(y_test, y_pred)))


accuracy score: 0.8199238422798182


In [25]:
# based on the above experiment, random forest seems like the best model with > 85 % accuracy