In [2]:
import os

import pandas as pd
import numpy as np
import seaborn as sns

!pip install dmba
import dmba
from dmba import classificationSummary, gainsChart, liftChart

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import tree
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

In [3]:
## Location of the census CSV. Set the CENSUS_CSV environment variable to point
## at your own copy of the file; when it is not set, the original author's
## local path is used unchanged.
census_csv_path = os.environ.get('CENSUS_CSV',
                                 'C:\\Users\\maha.jayapal\\Downloads\\census.csv')
census = pd.read_csv(census_csv_path)

In [4]:
## Keep an independent copy of the raw dataframe in case we need to revert later.
## A plain `census_original = census` would only bind a second name to the same
## object, so any in-place change to `census` would also alter the "original".
census_original = census.copy()

In [5]:
## Remove the features judged unlikely to be valuable or relevant for this analysis
census = census.drop(
    [
        'fnlwgt',
        'education-num',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
        'occupation',
        'relationship',
        'race',
    ],
    axis='columns',
)

In [6]:
## Discard every row whose workclass or native-country is the '?' placeholder.
## The columns are processed one after the other, so the second filter only
## looks at rows that survived the first.
for placeholder_col in ('workclass', 'native-country'):
    placeholder_rows = census.loc[census[placeholder_col] == '?'].index
    census = census.drop(placeholder_rows)

In [7]:
## shape of the dataset after removing the rows whose workclass or native-country was '?'
census.shape

(45232, 7)

In [10]:
## Income is this project's indicator of donor potential:
## '<=50K' is relabelled as a mid-level donor and '>50K' as a high-level donor.
census['income'] = census['income'].replace(
    to_replace={
        '<=50K': 'Mid-Level',
        '>50K': 'High-Level',
    }
)

In [9]:
## Fold the three "married" variants into a single 'Married' category
census['marital-status'] = census['marital-status'].replace(
    dict.fromkeys(
        ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
        'Married',
    )
)
census['marital-status'].value_counts()

Married          21641
Never-married    14605
Divorced          6298
Separated         1411
Widowed           1277
Name: marital-status, dtype: int64

In [11]:
## Everyone who is not currently married is grouped under 'Single'
census['marital-status'] = census['marital-status'].replace(
    dict.fromkeys(
        ['Never-married', 'Divorced', 'Separated', 'Widowed'],
        'Single',
    )
)
census['marital-status'].value_counts()

Single     23591
Married    21641
Name: marital-status, dtype: int64

In [12]:
## Collapse the detailed workclass labels into broader groups.
## Labels that are not listed (e.g. 'Private') are left as they are.
## Rows with a '?' workclass are dropped in an earlier cell, so the
## '?' -> 'Unknown' entry is normally a no-op kept as a safety net.
census['workclass'] = census['workclass'].replace({
    **dict.fromkeys(['Local-gov', 'Federal-gov', 'State-gov'], 'Government'),
    **dict.fromkeys(['Self-emp-not-inc', 'Self-emp-inc'], 'Self-emp'),
    **dict.fromkeys(['Without-pay', 'Never-worked'], 'No-income'),
    '?': 'Unknown',
})

census['workclass'].value_counts()

Private       33307
Government     6452
Self-emp       5442
No-income        31
Name: workclass, dtype: int64

In [13]:
## Reduce native-country to 'United-States' vs. 'Non-US'. A '?' value is left
## untouched by this step and is renamed to 'Unknown' immediately afterwards.
census['native-country'] = np.where(
    census['native-country'].isin(['United-States', '?']),
    census['native-country'],
    'Non-US',
)

census['native-country'] = census['native-country'].replace({'?': 'Unknown'})

census['native-country'].value_counts()

United-States    41302
Non-US            3930
Name: native-country, dtype: int64

In [14]:
## Group the detailed education levels into broader bands. Levels that are not
## listed (Bachelors, Masters, Prof-school, Doctorate) keep their own label.
## Each source label appears exactly once: the mapping previously listed '9th'
## twice, which Python silently collapses into a single key.
census['education'] = census['education'].replace({
    # did not finish high school
    'Preschool': 'DNF HS',
    '1st-4th': 'DNF HS',
    '5th-6th': 'DNF HS',
    '7th-8th': 'DNF HS',
    '9th': 'DNF HS',
    '10th': 'DNF HS',
    '11th': 'DNF HS',
    '12th': 'DNF HS',

    # high school diploma, with or without some college
    'HS-grad': 'HS',
    'Some-college': 'HS',

    # associate degrees
    'Assoc-voc': 'Assoc',
    'Assoc-acdm': 'Assoc',
})

census['education'].value_counts()

HS             24686
Bachelors       7570
DNF HS          5667
Assoc           3466
Masters         2514
Prof-school      785
Doctorate        544
Name: education, dtype: int64

In [15]:
## One-hot encode the categorical features into a separate dataframe;
## `census` itself is left untouched.
census_encoded = pd.get_dummies(
    census,
    columns=['workclass', 'education', 'marital-status', 'sex', 'native-country'],
)

In [16]:
## Drop one dummy per encoded feature (its reference level) so the remaining
## dummies are not perfectly collinear -- the "dummy variable trap".
census_encoded = census_encoded.drop(
    [
        'workclass_Private',
        'education_HS',
        'marital-status_Single',
        'sex_Male',
        'native-country_United-States',
    ],
    axis='columns',
)

In [20]:
# Preview the first rows of the encoded dataframe
census_encoded.head()

Unnamed: 0,age,income,workclass_Government,workclass_No-income,workclass_Self-emp,education_Assoc,education_Bachelors,education_DNF HS,education_Doctorate,education_Masters,education_Prof-school,marital-status_Married,sex_Female,native-country_Non-US
0,25,Mid-Level,0,0,0,0,0,1,0,0,0,0,0,0
1,38,Mid-Level,0,0,0,0,0,0,0,0,0,1,0,0
2,28,High-Level,1,0,0,1,0,0,0,0,0,1,0,0
3,44,High-Level,0,0,0,0,0,0,0,0,0,1,0,0
5,34,Mid-Level,0,0,0,0,0,1,0,0,0,0,0,0


In [17]:
## Stratified 80/20 split of the whole encoded frame (label column included).
## Note this uses random_state=42, which differs from the X/y split in the
## Modeling section (random_state=1).
train, test = train_test_split(
    census_encoded,
    stratify=census_encoded['income'],
    test_size=0.2,
    random_state=42,
)

In [18]:
# (rows, columns) of the training partition
train.shape

(36185, 14)

In [19]:
# (rows, columns) of the test partition
test.shape

(9047, 14)

## Modeling

In [40]:
# Display names handed to classificationSummary as class_names
classes = [
    'high-level',
    'mid-level',
]

In [21]:
# Feature matrix: every encoded column except the income label
census_X = census_encoded.drop('income', axis='columns')

In [28]:
# Target vector: the income label ('Mid-Level' / 'High-Level')
census_y = census_encoded.loc[:, 'income']

In [49]:
# Stratified 80/20 split of features and label, used by every model below
X_train, X_test, y_train, y_test = train_test_split(
    census_X,
    census_y,
    test_size=0.2,
    stratify=census_y,
    random_state=1,
)

In [52]:
# Sanity-check the split sizes. The last line reports y_test, so it is labelled
# "testing label" (it previously repeated "training label").
print('Shape of training features:', X_train.shape)
print('Shape of testing features:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of testing label:', y_test.shape)

Shape of training features: (36185, 13)
Shape of testing features: (9047, 13)
Shape of training label: (36185,)
Shape of training label: (9047,)


### Logistic Regression

In [51]:
# Logistic regression with 5-fold cross-validation (LogisticRegressionCV), liblinear solver
logit_regCV = LogisticRegressionCV(solver='liblinear', cv=5)
logit_regCV.fit(X_train, y_train)

# Temporarily adjust pandas' display settings for the coefficient table.
# option_context restores whatever values were active before this cell, whereas
# set_option/reset_option would fall back to the pandas defaults and would not
# clean up at all if one of the prints raised.
with pd.option_context('display.width', 95,
                       'display.precision', 3,
                       'display.max_columns', 33):
    print('intercept ', logit_regCV.intercept_[0], '\n')

    # could use display() to present this cleaner, but for illustration, leaving as-is
    # Coefficients are labelled with the columns of the matrix the model was fitted on.
    # NOTE(review): for a binary target scikit-learn reports coefficients for
    # classes_[1]; check logit_regCV.classes_ to confirm which label that is.
    print(pd.DataFrame({'coeff': logit_regCV.coef_[0]}, index=X_train.columns).transpose())

# confusion matrix on the training data
classificationSummary(y_train, logit_regCV.predict(X_train), class_names=classes)

intercept  3.6076584657697706 

         age  workclass_Government  workclass_No-income  workclass_Self-emp  education_Assoc  \
coeff -0.024                -0.004                0.086               0.062           -0.439   

       education_Bachelors  education_DNF HS  education_Doctorate  education_Masters  \
coeff               -1.273             1.338               -1.806             -1.702   

       education_Prof-school  marital-status_Married  sex_Female  native-country_Non-US  
coeff                 -2.095                  -2.106       0.415                  0.398  
Confusion Matrix (Accuracy 0.8122)

           Prediction
    Actual high-level  mid-level
high-level       3923       5043
 mid-level       1752      25467


In [50]:
# Score the logistic regression on the held-out test split
y_pred = logit_regCV.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix : \n", cm)
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=y_pred))

Confusion Matrix : 
 [[ 963 1279]
 [ 473 6332]]
Accuracy :  0.8063446446335802


### Decision Tree

In [53]:
# Decision tree with default hyper-parameters; random_state is fixed at 1
# so repeated runs build the same tree
dtc = tree.DecisionTreeClassifier(random_state=1)
dtc.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=1)

In [54]:
# Score the decision tree on the held-out test split
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix : \n", cm)
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=y_pred))

Confusion Matrix : 
 [[1151 1091]
 [ 727 6078]]
Accuracy :  0.7990494086437493


### Random Forest

In [56]:
# Random forest with default hyper-parameters; random_state is fixed at 1
# so repeated runs build the same forest
rf = RandomForestClassifier(random_state=1)
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=1)

In [57]:
# Score the random forest on the held-out test split
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix : \n", cm)
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=y_pred))

Confusion Matrix : 
 [[1092 1150]
 [ 644 6161]]
Accuracy :  0.8017022217309605


### Naive Bayes

In [58]:
# Gaussian Naive Bayes with default settings
# NOTE(review): apart from age, every feature is a 0/1 dummy, which a Gaussian
# likelihood models only roughly -- worth confirming this is the intended variant.
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

GaussianNB()

In [59]:
# Score the Naive Bayes model on the held-out test split
y_pred = nb.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix : \n", cm)
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=y_pred))

Confusion Matrix : 
 [[1108 1134]
 [ 818 5987]]
Accuracy :  0.7842378689068199


### KNN

In [60]:
# K-nearest neighbours with scikit-learn's default settings
# NOTE(review): KNN is distance-based and the features mix raw age with 0/1
# dummies; no scaling step is visible in this notebook -- consider adding one.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier()

In [61]:
# Score the KNN model on the held-out test split
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix : \n", cm)
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=y_pred))

Confusion Matrix : 
 [[1113 1129]
 [ 742 6063]]
Accuracy :  0.7931911130761579


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
