In [142]:
import pandas as pd  # For data manipulation
import joblib  # For saving and loading models

# Import machine learning components
from sklearn.naive_bayes import GaussianNB  
import sklearn.metrics as skmet  # For model evaluation

# Import SMOTE for handling imbalanced data
from imblearn.over_sampling import SMOTE  

**Import dataset**

In [143]:
# Load the training set from CSV; later cells read its `Salary` column as the target.
df = pd.read_csv('preprocessTrainData.csv')

In [144]:
# Load the test set from CSV; it is used further down to evaluate the fitted model.
testData = pd.read_csv('preprocessTestData.csv')

In [11]:
# Preview the first rows of the training frame to check the loaded columns.
df.head()

Unnamed: 0,workclass_Private Sector,workclass_Self-Employed,education_Bachelor’s Degree,education_College,education_Doctorate,education_High School,education_High School Graduate,education_Master’s Degree,education_Middle School,education_No Education,education_Primary,education_Professional Degree,maritalstatus_Married,maritalstatus_Married but Separated,maritalstatus_Separated,maritalstatus_Single,maritalstatus_Widowed,occupation_Clerical,occupation_Domestic Work,occupation_Labor,occupation_Management,occupation_Manufacturing,occupation_Military,occupation_Professional,occupation_Sales,occupation_Security,occupation_Service,occupation_Skilled Trades,occupation_Technical,occupation_Transportation,relationship_Extended Family,relationship_Independent,relationship_Single,relationship_Spouse,sex_M,age,educationno,capitalgain,capitalloss,hoursperweek,Salary
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,39,13,2174,0,40,0
1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,50,13,0,0,13,0
2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,38,9,0,0,40,0
3,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,53,7,0,0,40,0
4,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,28,13,0,0,40,0


**Check whether the target variable is balanced or imbalanced.**

In [145]:
# Relative frequency of each Salary class (fractions of rows rather than raw counts).
df.Salary.value_counts(normalize=True)

Salary
0    0.741003
1    0.258997
Name: proportion, dtype: float64

# SMOTE 

In [146]:
# Oversampler used to balance the two Salary classes; the fixed random_state
# makes the synthetic samples reproducible between runs.
# NOTE(review): plain SMOTE builds synthetic rows by interpolating between
# neighbours, so the 0/1 indicator columns may receive fractional values —
# confirm this is acceptable, or consider imblearn's SMOTENC / SMOTEN.
smote = SMOTE(random_state = 0)

In [147]:
# Features: every column except the last one (positional slice).
independent = df.iloc[:, :-1]

# Target: the Salary column, selected by label.
dependent = df['Salary']

In [148]:
# Resample the training features/target with SMOTE; the resampled pair is what
# the classifier is fitted on below.
X_train, y_train = smote.fit_resample(independent, dependent)

In [149]:
# (rows, feature columns) of the resampled feature matrix.
X_train.shape

(38092, 40)

In [150]:
# Length of the resampled target; it should match the row count of X_train.
y_train.shape

(38092,)

In [151]:
# Salary is coded 0/1, so summing the labels gives the number of 1s.
y_train.to_numpy().sum()

19046

In [152]:
# Total number of labels minus the number of 1s leaves the number of 0s.
len(y_train) - y_train.to_numpy().sum()

19046

# Model Building 

In [112]:
# Gaussian Naive Bayes classifier with default settings.
classifierNB = GaussianNB() 

In [113]:
# Fit the classifier on the SMOTE-resampled training data.
classifierNB.fit(X_train, y_train)

**Evaluate the model on the test data**

In [153]:
# Select the test features by label, using the training feature columns, rather
# than slicing the test frame positionally with the width of the training
# frame. This gives the test matrix exactly the columns (and column order)
# used for training, and a column missing from the test CSV raises a KeyError
# instead of silently shifting the slice.
test_independent = testData[independent.columns]

# Target labels of the test set.
test_dependent = testData['Salary']

In [115]:
# Predicted Salary class for every row of the test features.
test_pred_m = classifierNB.predict(test_independent)

In [116]:
# Fraction of test rows whose predicted class equals the true class.
skmet.accuracy_score(y_true=test_dependent, y_pred=test_pred_m)

0.8004365223717715

In [117]:
# Confusion matrix on the test set: rows are true classes, columns are predictions.
skmet.confusion_matrix(y_true=test_dependent, y_pred=test_pred_m)

array([[8278, 2025],
       [ 718, 2724]], dtype=int64)

**Evaluate the model on the original (pre-SMOTE) training data**

In [118]:
# Predict on the original training features (`independent`), not on the
# SMOTE-resampled X_train, so the scores below reflect the real class balance.
train_pred_m = classifierNB.predict(independent)

In [119]:
# Fraction of original training rows whose predicted class equals the true class.
skmet.accuracy_score(y_true=dependent, y_pred=train_pred_m)

0.7976500797572268

In [120]:
# Confusion matrix on the original training set: rows are true classes, columns are predictions.
skmet.confusion_matrix(y_true=dependent, y_pred=train_pred_m)

array([[15273,  3773],
       [ 1428,  5229]], dtype=int64)

#### Deploy the model 

In [123]:
# Persist the fitted classifier to disk so it can be reloaded without retraining.
joblib.dump(value=classifierNB, filename='salaryPredictionModel')

['salaryPredictionModel']

In [124]:
# Reload the persisted classifier from disk.
# NOTE: joblib.load unpickles the file, so only load model files you produced
# yourself or otherwise trust.
salaryPredictionModel = joblib.load('salaryPredictionModel')

In [135]:
# One hand-built feature vector, grouped by feature family in the training
# column order (40 values in total).
predictValues = (
    [1, 0]                                       # workclass_* (2)
    + [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]             # education_* (10)
    + [0, 0, 0, 1, 0]                            # maritalstatus_* (5)
    + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]    # occupation_* (13)
    + [0, 0, 0, 0]                               # relationship_* (4)
    + [1]                                        # sex_M
    + [25, 7, 0, 0, 40]                          # age, educationno, capitalgain, capitalloss, hoursperweek
)

In [137]:
# Wrap the raw values in a one-row DataFrame labelled with the feature names
# the model recorded when it was fitted. Passing a bare nested list drops the
# names (scikit-learn warns about missing feature names), and nothing checks
# the vector's length; here pandas raises a ValueError if the number of values
# does not match the number of model features.
predictFrame = pd.DataFrame([predictValues], columns=salaryPredictionModel.feature_names_in_)
salaryPredictionModel.predict(predictFrame)



array([0], dtype=int64)

In [159]:
# Show the first test row transposed (one feature per line), which is easier
# to read than a 40-column-wide table.
test_independent.head(1).T




Unnamed: 0,0
workclass_Private Sector,1
workclass_Self-Employed,0
education_Bachelor’s Degree,0
education_College,0
education_Doctorate,0
education_High School,1
education_High School Graduate,0
education_Master’s Degree,0
education_Middle School,0
education_No Education,0


In [162]:
# Column labels of the training frame, in order (taking head(1) first does not change them).
df.columns

Index(['workclass_Private Sector', 'workclass_Self-Employed',
       'education_Bachelor’s Degree', 'education_College',
       'education_Doctorate', 'education_High School',
       'education_High School Graduate', 'education_Master’s Degree',
       'education_Middle School', 'education_No Education',
       'education_Primary', 'education_Professional Degree',
       'maritalstatus_Married', 'maritalstatus_Married but Separated',
       'maritalstatus_Separated', 'maritalstatus_Single',
       'maritalstatus_Widowed', 'occupation_Clerical',
       'occupation_Domestic Work', 'occupation_Labor', 'occupation_Management',
       'occupation_Manufacturing', 'occupation_Military',
       'occupation_Professional', 'occupation_Sales', 'occupation_Security',
       'occupation_Service', 'occupation_Skilled Trades',
       'occupation_Technical', 'occupation_Transportation',
       'relationship_Extended Family', 'relationship_Independent',
       'relationship_Single', 'relationship_Spous