### Classification model using Naive Bayes for salary data( predict wether Salary person is <=50K or >50K  
<li>SalaryData_Train.csv</li>
<li>SalaryData_Test.csv</li>

In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
#Reading dataset file using pandas function & showing top 5 records
df_before = pd.read_csv('C:\\Users\\Raja\\Downloads\\assignments\\naive\\SalaryData_Train.csv')
df_before.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
#Value counts of Salary col <=50K or >50K
df_before['Salary'].value_counts()

 <=50K    22653
 >50K      7508
Name: Salary, dtype: int64

In [4]:
#Actual shape of dataset
df_before.shape

(30161, 14)

In [5]:
# Remove invalid data from table
df_before= df_before[(df_before.astype(str) != ' ?').all(axis=1)]

In [6]:
# Creating a new 'income' column using existing col. 'Salary' categoring high, low =equal 50K per annum
df_before['income'] = df_before.apply(lambda row: 1 if '>50K' in row['Salary'] else 0, axis=1)

In [7]:
#Droping unnecessary col.'Salary','capitalgain','capitalloss'
df_before.drop(['Salary','capitalgain','capitalloss'], axis=1, inplace=True)

In [8]:
df_before.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,hoursperweek,native,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,0


In [9]:
# Getting dummies variables for categorical data's features
df_after= pd.get_dummies(df_before,columns=['sex','workclass','education','occupation','relationship','race','native','maritalstatus'], drop_first=True)

In [10]:
#Showing top 5 records
df_after.head(10)

Unnamed: 0,age,educationno,hoursperweek,income,sex_ Male,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,maritalstatus_ Married-AF-spouse,maritalstatus_ Married-civ-spouse,maritalstatus_ Married-spouse-absent,maritalstatus_ Never-married,maritalstatus_ Separated,maritalstatus_ Widowed
0,39,13,40,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
1,50,13,13,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2,38,9,40,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,53,7,40,0,1,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,28,13,40,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,37,14,40,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
6,49,5,16,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,52,9,45,1,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
8,31,14,50,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
9,42,13,40,1,1,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [11]:
# Using Normalization function 
def norm_func(i):
    x = (i-i.min())/(i.max()-i.min())
    return (x)

In [12]:
# Normalized data frame (considering the numerical part of data)
df_norm = norm_func(df_after.iloc[:,1:])
df_norm.tail(10)

Unnamed: 0,educationno,hoursperweek,income,sex_ Male,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,maritalstatus_ Married-AF-spouse,maritalstatus_ Married-civ-spouse,maritalstatus_ Married-spouse-absent,maritalstatus_ Never-married,maritalstatus_ Separated,maritalstatus_ Widowed
30151,0.333333,0.397959,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30152,0.666667,0.44898,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30153,0.866667,0.102041,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30154,0.866667,0.397959,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30155,0.6,0.397959,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30156,0.733333,0.377551,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30157,0.533333,0.397959,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30158,0.533333,0.397959,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30159,0.533333,0.193878,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30160,0.533333,0.397959,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
#feature_col=data.drop(['income'],axis=1)
#output_target=['income']
X = df_after.drop(['income'],axis=1)
Y = df_after['income']

In [14]:
#Splitting data set into followings using train_test_split functions, test size is 20%
Xtrain, Xtest , ytrain, ytest = train_test_split(X,Y,test_size=0.2, random_state=0)

In [15]:
#Performing preprocessing part 
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler() 
  
Xtrain = sc.fit_transform(Xtrain) 
Xtest = sc.transform(Xtest) 

In [16]:
# Applying PCA function on training 
# and testing set of X component 
from sklearn.decomposition import PCA 
  
pca = PCA(n_components = 6) 
  
Xtrain = pca.fit_transform(Xtrain) 
Xtest = pca.transform(Xtest) 
  
explained_variance = pca.explained_variance_ratio_ 

In [17]:
#Creating Gaussian model
model = GaussianNB() # normal distribution

In [18]:
# Building and predicting at the same time 
pred = model.fit(Xtrain,ytrain).predict(Xtest)

In [19]:
# Confusion matrix GaussianNB model
confusion_matrix(ytest,pred) # GaussianNB model

array([[3928,  631],
       [ 552,  922]], dtype=int64)

In [20]:
pd.crosstab(ytest.values.flatten(),pred) # confusion matrix using 

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3928,631
1,552,922


In [21]:
np.mean(pred==ytest.values.flatten()) # 81.24% on testing data, on training data=81.39%

0.8039118183325046