# Naive Baye's ML Algorithms Implementation

In [83]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [84]:
# independent and dependent variables
X, y = load_iris(return_X_y=True)

In [85]:
pd.DataFrame(X, columns=load_iris().feature_names).head(), pd.DataFrame(y, columns=['target']).head()

(   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                5.1               3.5                1.4               0.2
 1                4.9               3.0                1.4               0.2
 2                4.7               3.2                1.3               0.2
 3                4.6               3.1                1.5               0.2
 4                5.0               3.6                1.4               0.2,
    target
 0       0
 1       0
 2       0
 3       0
 4       0)

In [86]:
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
y_train

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
       1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
       1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
       0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 1, 2])

In [88]:
# use GaussianNB from sklearn
from sklearn.naive_bayes import GaussianNB
# create a GaussianNB classifier
gnb = GaussianNB()
# fit the classifier to the training data
gnb.fit(X_train, y_train)
# make predictions on the test data
y_pred = gnb.predict(X_test)

In [89]:
# calculate the performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# print the performance metrics
print(f"Accuracy: {accuracy:.2f}")  
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
# print the F1 score
print(f"F1 Score: {f1:.2f}")
# print the confusion matrix
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


# load the tips dataset from seaborn library to predict time

In [90]:
import seaborn as sns
tip = sns.load_dataset("tips")
tip.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [91]:
tip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [92]:
tip['sex'].value_counts(), tip['day'].value_counts(), tip['time'].value_counts(), tip['smoker'].value_counts()

(sex
 Male      157
 Female     87
 Name: count, dtype: int64,
 day
 Sat     87
 Sun     76
 Thur    62
 Fri     19
 Name: count, dtype: int64,
 time
 Dinner    176
 Lunch      68
 Name: count, dtype: int64,
 smoker
 No     151
 Yes     93
 Name: count, dtype: int64)

In [93]:
# Independent and dependent varianbles
X = tip.drop(columns=['time'])
y = tip['time'].map({'Lunch': 0, 'Dinner': 1})

In [94]:
X.head(), y.head()

(   total_bill   tip     sex smoker  day  size
 0       16.99  1.01  Female     No  Sun     2
 1       10.34  1.66    Male     No  Sun     3
 2       21.01  3.50    Male     No  Sun     3
 3       23.68  3.31    Male     No  Sun     2
 4       24.59  3.61  Female     No  Sun     4,
 0    1
 1    1
 2    1
 3    1
 4    1
 Name: time, dtype: category
 Categories (2, int64): [0, 1])

In [95]:
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
X_train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
228,13.28,2.72,Male,No,Sat,2
208,24.27,2.03,Male,Yes,Sat,2
96,27.28,4.0,Male,Yes,Fri,2
167,31.71,4.5,Male,No,Sun,4
84,15.98,2.03,Male,No,Thur,2


In [97]:
y_train.head()

228    1
208    1
96     1
167    1
84     0
Name: time, dtype: category
Categories (2, int64): [0, 1]

In [98]:
# feature Encoding(Label Encoding and One-Hot Encoding)
from sklearn.preprocessing import LabelEncoder
#label encoding
lev1 = LabelEncoder()
lev2 = LabelEncoder()
# binary feature for train data
X_train['sex'] = lev1.fit_transform(X_train['sex'])
X_train['smoker'] = lev2.fit_transform(X_train['smoker'])
# binary feature for test data
X_test['sex'] = lev1.transform(X_test['sex'])
X_test['smoker'] = lev2.transform(X_test['smoker'])

In [99]:
# use one-hot encoding for day because it has more than two categories
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# create a column transformer
ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop="first"),[4])],
    remainder='passthrough'  # leave the rest of the columns unchanged
)

In [100]:
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
# the column transformer on the training data
X_train = ct.fit_transform(X_train)
# the column transformer on the test data
X_test = ct.transform(X_test)

In [104]:
X_train

array([[ 1.  ,  0.  ,  0.  , 13.28,  2.72,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 24.27,  2.03,  1.  ,  1.  ,  2.  ],
       [ 0.  ,  0.  ,  0.  , 27.28,  4.  ,  1.  ,  1.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  , 31.71,  4.5 ,  1.  ,  0.  ,  4.  ],
       [ 0.  ,  0.  ,  1.  , 15.98,  2.03,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  , 19.49,  3.51,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  0.  ,  1.  , 13.03,  2.  ,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  7.25,  5.15,  1.  ,  1.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  , 17.82,  1.75,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  , 17.26,  2.74,  1.  ,  0.  ,  3.  ],
       [ 1.  ,  0.  ,  0.  , 15.69,  3.  ,  1.  ,  1.  ,  3.  ],
       [ 0.  ,  1.  ,  0.  , 29.85,  5.14,  0.  ,  0.  ,  5.  ],
       [ 0.  ,  1.  ,  0.  , 17.31,  3.5 ,  0.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  , 23.33,  5.65,  1.  ,  1.  ,  2.  ],
       [ 0.  ,  0.  ,  0.  , 16.27,  2.5 ,  0.  ,  1.  ,  2.  ],
       [ 1.  ,  0.  ,  0.

In [105]:
# use GaussianNB from sklearn
from sklearn.naive_bayes import GaussianNB
# create a GaussianNB classifier
gnb = GaussianNB()
# fit the classifier to the training data
gnb.fit(X_train, y_train)


In [108]:
# predict the test data
y_pred = gnb.predict(X_test)

In [110]:
# performance metrics
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
# print the performance metrics
print(f"Accuracy: {accuracy:.2f}")  
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.96
[[15  0]
 [ 2 32]]


In [112]:
# convert the predicted values which are in 0 and 1 to Lunch and Dinner
y_pred = pd.Series(y_pred).map({0: 'Lunch', 1: 'Dinner'})
y_pred

0     Dinner
1     Dinner
2     Dinner
3     Dinner
4      Lunch
5     Dinner
6      Lunch
7      Lunch
8     Dinner
9      Lunch
10    Dinner
11    Dinner
12     Lunch
13    Dinner
14     Lunch
15     Lunch
16     Lunch
17    Dinner
18    Dinner
19     Lunch
20    Dinner
21    Dinner
22    Dinner
23    Dinner
24    Dinner
25     Lunch
26    Dinner
27    Dinner
28    Dinner
29    Dinner
30    Dinner
31    Dinner
32    Dinner
33    Dinner
34    Dinner
35     Lunch
36     Lunch
37     Lunch
38     Lunch
39    Dinner
40    Dinner
41     Lunch
42    Dinner
43    Dinner
44     Lunch
45    Dinner
46    Dinner
47    Dinner
48     Lunch
dtype: object