In [40]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [42]:
data = pd.read_csv('SalaryData_Train.csv')
data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [44]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,30161.0,38.438115,13.13483,17.0,28.0,37.0,47.0,90.0
educationno,30161.0,10.121316,2.550037,1.0,9.0,10.0,13.0,16.0
capitalgain,30161.0,1092.044064,7406.466611,0.0,0.0,0.0,0.0,99999.0
capitalloss,30161.0,88.302311,404.121321,0.0,0.0,0.0,0.0,4356.0
hoursperweek,30161.0,40.931269,11.980182,1.0,40.0,40.0,45.0,99.0


### Data Cleaning

In [45]:
data.duplicated().sum()

3258

In [46]:
data.drop_duplicates(inplace = True)

In [47]:
data.shape

(26903, 14)

###### dummy variables

In [48]:
df = pd.get_dummies(data, columns = ['workclass','education','maritalstatus','occupation','relationship',
                                     'race','sex','native','Salary'], drop_first=True)
df.head()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,native_ Puerto-Rico,native_ Scotland,native_ South,native_ Taiwan,native_ Thailand,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,Salary_ >50K
0,39,13,2174,0,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,50,13,0,0,13,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,38,9,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,7,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,13,0,0,40,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
X = df.drop('Salary_ >50K', axis=1)
y = df['Salary_ >50K']

In [50]:
y.value_counts()

0    20023
1     6880
Name: Salary_ >50K, dtype: int64

###### Train test split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

###### Feature Scalling

In [52]:
scale = RobustScaler()
X_train = scale.fit_transform(X_train)
X_test  = scale.transform(X_test)
X_train.shape, X_test.shape

((21522, 94), (5381, 94))

In [53]:
X_train

array([[ 0.52631579,  0.25      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.52631579,  0.25      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.47368421,  1.25      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.10526316,  0.        ,  0.        , ..., -1.        ,
         0.        ,  1.        ],
       [-0.47368421, -0.25      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.10526316,  0.75      ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### Modeling

In [54]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.8136


In [55]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.84      0.87      4005
           1       0.61      0.73      0.67      1376

    accuracy                           0.81      5381
   macro avg       0.76      0.79      0.77      5381
weighted avg       0.83      0.81      0.82      5381



#### SMOTE

In [56]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [57]:
gnb1 = GaussianNB()
gnb1.fit(X_smote, y_smote)
y_pred1 = gnb1.predict(X_test)

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred1)))
print(classification_report(y_test, y_pred1))

Model accuracy score: 0.7976
              precision    recall  f1-score   support

           0       0.92      0.80      0.85      4005
           1       0.58      0.80      0.67      1376

    accuracy                           0.80      5381
   macro avg       0.75      0.80      0.76      5381
weighted avg       0.83      0.80      0.81      5381



#### Hyperparameter tuning

In [58]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)}

grid = GridSearchCV(estimator = gnb1,
    param_grid = param_grid_nb,
    verbose=1, cv=10, n_jobs=-1)

grid.fit(X_smote, y_smote)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


GridSearchCV(cv=10, estimator=GaussianNB(), n_jobs=-1,
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.848035...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             verbose=1)

In [59]:
grid.best_params_

{'var_smoothing': 1e-09}

In [60]:
grid.best_score_

0.8101201062674155

In [61]:
y_pred2 = grid.predict(X_test)

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred2)))
print(classification_report(y_test, y_pred2))

Model accuracy score: 0.7976
              precision    recall  f1-score   support

           0       0.92      0.80      0.85      4005
           1       0.58      0.80      0.67      1376

    accuracy                           0.80      5381
   macro avg       0.75      0.80      0.76      5381
weighted avg       0.83      0.80      0.81      5381

