# Naive Bayes Classifier
https://scikit-learn.org/stable/modules/naive_bayes.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB

#Provides a way to split the data into train and test data sets.
from sklearn.model_selection import train_test_split 

#Provides a way to calculate metrics on the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#from sklearn.metrics import roc_curve,  roc_auc_score

### Bayes' Theorem

$$ P(c|x) = \frac{P(x|c)P(c)}{P(x)}$$

P(c|x) = posterier probability of class (c, target)  
P(c) = prior probability of class  
P(x|c) = likelihood, which is the probability of the predictor given class  
P(x) = the prior probability of predictor

$$ P(c|x) = \frac{P(x_1|c)P(x_2|c)P(x_3|c)P(x_4|c)P(x_5|c)P(c)}{P(x)}$$

In [2]:
def plot_cm(y_pred, y_test):
    cm = confusion_matrix(y_pred, y_test)
    fig = plt.figure(figsize=(5,5))
    heatmap = sns.heatmap(cm, annot=True, fmt='.2f', cmap='RdYlGn')
    plt.ylabel('True label')
    plt.xlabel('Predicted Label')

In [4]:
long_census = pd.read_csv('adult_census/final_data_set.csv')
X = long_census.iloc[:,5:107]
y = long_census['salary']
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=.20, 
                                                    random_state=32)

In [9]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [10]:
#plot_cm(y_pred, y_test)

In [11]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      4445
           1       0.74      0.55      0.63      2068

    accuracy                           0.80      6513
   macro avg       0.78      0.73      0.74      6513
weighted avg       0.79      0.80      0.79      6513



In [12]:
land_train = pd.read_csv('landsat_train.csv')
land_test = pd.read_csv('landsat_test.csv')

X_train = land_train.iloc[:,0:36]
y_train = land_train.iloc[:,36:37]

X_test = land_test.iloc[:,0:36]
y_test = land_test.iloc[:,36:37]

In [15]:
clf =MultinomialNB()
clf.fit(X_train, np.ravel(y_train))
y_pred = clf.predict(X_test)

In [16]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           1       0.99      0.97      0.98       470
           2       0.88      0.98      0.93       200
           3       0.83      0.66      0.73       505
           4       0.04      0.31      0.08        29
           5       0.70      0.76      0.73       219
           7       0.80      0.65      0.72       577

    accuracy                           0.77      2000
   macro avg       0.71      0.72      0.70      2000
weighted avg       0.84      0.77      0.80      2000



In [21]:
golf = pd.read_csv('golf.csv')

In [23]:
golf.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy,PlayGolf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes


In [24]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
X = golf.iloc[:,0:4]
y = golf['PlayGolf']

transformer = OneHotEncoder().fit(X)
X = transformer.transform(X).toarray()

In [27]:
X

array([[0., 1., 0., 0., 1., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
       [0., 0., 1., 1., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 0., 1., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 0., 1., 0., 1., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0., 1.]])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.20, random_state=32)

In [31]:
clf =GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [32]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          No       1.00      1.00      1.00         2
         Yes       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

