# Classification using Naive Bayes
##### Siddhartha Dutta; A70405217037

## Social Network Ads Dataset

In [1]:
# Importing Libraries
import warnings

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

warnings.simplefilter(action='ignore')

In [2]:
# Importing Dataset
# Read the ads CSV, then pull the predictor and label columns out as NumPy arrays.
dataset = pd.read_csv('Social_Network_Ads.csv')

feature_cols = slice(1, 4) # Gender, Age, EstimatedSalary
target_col = 4 # Purchased

X = dataset.iloc[:, feature_cols].values
Y = dataset.iloc[:, target_col].values

In [3]:
# Data Preprocessing
# One-hot encode the Gender column (index 0) and drop its first category so that no
# redundant dummy column is kept. ColumnTransformer replaces the
# OneHotEncoder(categorical_features=...) argument, which was removed in scikit-learn
# 0.22; OneHotEncoder(drop='first') needs scikit-learn >= 0.21 and accepts string
# categories directly, so no separate label-encoding pass is required.
enc = ColumnTransformer(
    transformers=[('gender', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough', # Age and EstimatedSalary follow the encoded column(s) unchanged
    sparse_threshold=0, # always return a dense array
)
# The dense result mixes float dummies with object-typed passthrough columns, so cast
# the whole matrix to float before scaling.
X = enc.fit_transform(X).astype(float)

sc = StandardScaler()
X = sc.fit_transform(X) # Applying Standard Scaling to All Independent Variables

# NOTE: the scaler above is fitted on every row before the split, so test-set
# statistics influence the scaling. Kept as-is so the recorded results stay reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42) # 80-20 Split

In [4]:
# Naive Bayes Classification
# Build a Gaussian Naive Bayes model and train it on the training split.
# The fit call is left as the last expression so the fitted estimator is displayed.
classifier = GaussianNB()
classifier.fit(X=X_train, y=Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [5]:
# Predicting Test Set Results
# Run the fitted classifier over the held-out features and show the predicted labels.
Y_pred = classifier.predict(X=X_test)
print("Predicted Values for Y:", Y_pred)

Predicted Values for Y: [1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 0
 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 0
 0 0 1 1 0 0]


In [6]:
# Displaying Metrics
# Compute every metric first, then print them in the order: report, matrix, accuracy.
report_text = classification_report(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)
accuracy = round(accuracy_score(Y_test, Y_pred), 2) # rounded to two decimals

print("Classification Report")
print(report_text)

print("\nConfusion Matrix")
print(conf_matrix)

# A single print with the default space separator yields the same line as the
# label followed by the value.
print("\nAccuracy Score:", accuracy)

Classification Report
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        52
           1       0.93      0.89      0.91        28

    accuracy                           0.94        80
   macro avg       0.93      0.93      0.93        80
weighted avg       0.94      0.94      0.94        80


Confusion Matrix
[[50  2]
 [ 3 25]]

Accuracy Score: 0.94


## Pima Indians Diabetes Database

In [7]:
# Importing Libraries
# Imports used by the Pima Indians Diabetes section of the notebook.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

try:
    # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22, where this
    # import raises ImportError and would otherwise abort the whole cell.
    from sklearn.preprocessing import Imputer
except ImportError:
    # Releases without the legacy class ship its replacement in sklearn.impute.
    from sklearn.impute import SimpleImputer

import warnings
warnings.simplefilter(action='ignore')

In [8]:
# Importing Dataset
# The CSV is read with header=None, so columns are addressed by integer position.
dataset = pd.read_csv('pima-indians-diabetes.csv', header=None)

target_position = 9
X = dataset.iloc[:, :target_position] # 9 Independent Variables (columns 0-8)
Y = dataset.iloc[:, target_position] # 1 Dependent Variable (column 9)

In [9]:
# Data Preprocessing
# Report how many values are missing in each column of the raw frame.
print("Number of Missing Values Per Column")
print(dataset.isnull().sum())
# Per-column means of the predictors; a mean-strategy imputer fills gaps with these.
print("\nReplaced Values")
print(X.mean())

# SimpleImputer replaces the Imputer class removed in scikit-learn 0.22. Its default
# missing_values is np.nan (the legacy 'NaN' string must not be passed to it) and it
# always imputes column-wise, which is what the old axis=0 argument selected.
imputer = SimpleImputer(strategy = 'mean') # Imputer: Mean Strategy
imputer = imputer.fit(X)
X = imputer.transform(X) # Handling Missing Values using Imputer

sc = StandardScaler()
X = sc.fit_transform(X) # Applying Standard Scaling to All Independent Variables

# NOTE: the imputer and scaler above are fitted on every row before the split, so
# test-set statistics influence them. Kept as-is so the recorded results stay reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42) # 75-25 Split

Number of Missing Values Per Column
0    1
1    4
2    4
3    1
4    5
5    3
6    2
7    8
8    1
9    0
dtype: int64

Replaced Values
0       3.846154
1     120.947644
2      69.111257
3      20.533246
4      79.825688
5    1658.594771
6      31.998564
7       0.473170
8      33.245111
dtype: float64


In [10]:
# Naive Bayes Classification
# Build a Gaussian Naive Bayes model and train it on the training split.
# The fit call is left as the last expression so the fitted estimator is displayed.
classifier = GaussianNB()
classifier.fit(X=X_train, y=Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
# Predicting Test Set Results
# Run the fitted classifier over the held-out features and show the predicted labels.
Y_pred = classifier.predict(X=X_test)
print("Predicted Values for Y:", Y_pred)

Predicted Values for Y: [0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1
 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1 0 1 1 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0
 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 1
 0 0 0 1 1 0 0]


In [12]:
# Displaying Metrics
# Compute every metric first, then print them in the order: report, matrix, accuracy.
report_text = classification_report(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)
accuracy = round(accuracy_score(Y_test, Y_pred), 2) # rounded to two decimals

print("Classification Report")
print(report_text)

print("\nConfusion Matrix")
print(conf_matrix)

# A single print with the default space separator yields the same line as the
# label followed by the value.
print("\nAccuracy Score:", accuracy)

Classification Report
              precision    recall  f1-score   support

           0       0.81      0.76      0.78       123
           1       0.61      0.68      0.64        69

    accuracy                           0.73       192
   macro avg       0.71      0.72      0.71       192
weighted avg       0.74      0.73      0.73       192


Confusion Matrix
[[93 30]
 [22 47]]

Accuracy Score: 0.73
