In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
!pip install duckdb

Collecting duckdb
  Using cached duckdb-1.1.3-cp312-cp312-win_amd64.whl.metadata (781 bytes)
Using cached duckdb-1.1.3-cp312-cp312-win_amd64.whl (11.0 MB)
Installing collected packages: duckdb
Successfully installed duckdb-1.1.3


In [2]:
import duckdb

In [7]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
duckdb.query("select distinct(time) from df")

┌─────────────────────────┐
│          time           │
│ enum('lunch', 'dinner') │
├─────────────────────────┤
│ Lunch                   │
│ Dinner                  │
└─────────────────────────┘

In [9]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [10]:
x = df[["total_bill","tip"]]
x.head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


In [25]:
y = df['time']
y

0      Dinner
1      Dinner
2      Dinner
3      Dinner
4      Dinner
        ...  
239    Dinner
240    Dinner
241    Dinner
242    Dinner
243    Dinner
Name: time, Length: 244, dtype: category
Categories (2, object): ['Lunch', 'Dinner']

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
encoder = OneHotEncoder(handle_unknown = 'ignore')

In [13]:
x_encoded = encoder.fit_transform(df[["sex", "smoker"]])

In [14]:
x_encoded

<244x4 sparse matrix of type '<class 'numpy.float64'>'
	with 488 stored elements in Compressed Sparse Row format>

In [15]:
x_encoded.shape

(244, 4)

In [18]:
x = pd.concat([x, pd.DataFrame(x_encoded.toarray(), columns=encoder.get_feature_names_out())], axis=1)

In [20]:
x

Unnamed: 0,total_bill,tip,sex_Female,sex_Male,smoker_No,smoker_Yes
0,16.99,1.01,1.0,0.0,1.0,0.0
1,10.34,1.66,0.0,1.0,1.0,0.0
2,21.01,3.50,0.0,1.0,1.0,0.0
3,23.68,3.31,0.0,1.0,1.0,0.0
4,24.59,3.61,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...
239,29.03,5.92,0.0,1.0,1.0,0.0
240,27.18,2.00,1.0,0.0,0.0,1.0
241,22.67,2.00,0.0,1.0,0.0,1.0
242,17.82,1.75,0.0,1.0,1.0,0.0


In [23]:
from sklearn.model_selection import train_test_split

In [26]:
x_train,x_test, y_train, y_test = train_test_split(x,y) 

In [27]:
x_train

Unnamed: 0,total_bill,tip,sex_Female,sex_Male,smoker_No,smoker_Yes
84,15.98,2.03,0.0,1.0,1.0,0.0
154,19.77,2.00,0.0,1.0,1.0,0.0
166,20.76,2.24,0.0,1.0,1.0,0.0
120,11.69,2.31,0.0,1.0,1.0,0.0
12,15.42,1.57,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...
36,16.31,2.00,0.0,1.0,1.0,0.0
5,25.29,4.71,0.0,1.0,1.0,0.0
17,16.29,3.71,0.0,1.0,1.0,0.0
94,22.75,3.25,1.0,0.0,1.0,0.0


In [28]:
y_train

84      Lunch
154    Dinner
166    Dinner
120     Lunch
12     Dinner
        ...  
36     Dinner
5      Dinner
17     Dinner
94     Dinner
241    Dinner
Name: time, Length: 183, dtype: category
Categories (2, object): ['Lunch', 'Dinner']

In [30]:
from sklearn.preprocessing import StandardScaler

In [32]:
scaler = StandardScaler()
scaler

In [35]:
x_train_scaled = scaler.fit_transform(x_train)
x_train_scaled

array([[-0.41149334, -0.70328534, -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       [ 0.03762924, -0.725944  , -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       [ 0.15494622, -0.54467468, -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       ...,
       [-0.37475772,  0.5655999 , -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       [ 0.3907652 ,  0.21816704,  1.31580518, -1.31580518,  0.80538727,
        -0.80538727],
       [ 0.38128504, -0.725944  , -0.75999093,  0.75999093, -1.2416387 ,
         1.2416387 ]])

In [36]:
x_test_scaled = scaler.transform(x_test)
x_test_scaled

array([[-0.35105732, -0.725944  , -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       [-0.60939168,  0.78463366,  1.31580518, -1.31580518, -1.2416387 ,
         1.2416387 ],
       [ 2.45507005,  1.33599451, -0.75999093,  0.75999093, -1.2416387 ,
         1.2416387 ],
       [ 0.023409  ,  0.02934483,  1.31580518, -1.31580518,  0.80538727,
        -0.80538727],
       [ 0.31610894, -0.06128983,  1.31580518, -1.31580518, -1.2416387 ,
         1.2416387 ],
       [ 0.82448252, -1.10358842,  1.31580518, -1.31580518,  0.80538727,
        -0.80538727],
       [ 0.07436486,  0.14263815, -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       [-0.43637876, -0.55222757,  1.31580518, -1.31580518,  0.80538727,
        -0.80538727],
       [ 2.28442717,  0.02934483, -0.75999093,  0.75999093, -1.2416387 ,
         1.2416387 ],
       [ 0.88017846,  0.11997949, -0.75999093,  0.75999093,  0.80538727,
        -0.80538727],
       [ 1.82226937,  1.6683216 ,  1.31580518, -1.

In [37]:
from sklearn.naive_bayes import GaussianNB

In [53]:
m1 = GaussianNB()
m1

In [59]:
m1.fit(x_train_scaled,y_train)

In [60]:
y_pred_m1 = m1.predict(x_test_scaled)
y_pred_m1

array(['Dinner', 'Dinner', 'Dinner', 'Lunch', 'Dinner', 'Lunch', 'Dinner',
       'Lunch', 'Dinner', 'Dinner', 'Dinner', 'Dinner', 'Dinner',
       'Dinner', 'Lunch', 'Dinner', 'Dinner', 'Dinner', 'Dinner',
       'Dinner', 'Dinner', 'Dinner', 'Dinner', 'Dinner', 'Dinner',
       'Dinner', 'Dinner', 'Dinner', 'Dinner', 'Lunch', 'Dinner',
       'Dinner', 'Dinner', 'Lunch', 'Dinner', 'Dinner', 'Lunch', 'Dinner',
       'Dinner', 'Dinner', 'Dinner', 'Dinner', 'Lunch', 'Dinner',
       'Dinner', 'Dinner', 'Dinner', 'Lunch', 'Dinner', 'Dinner',
       'Dinner', 'Dinner', 'Lunch', 'Dinner', 'Dinner', 'Dinner',
       'Dinner', 'Dinner', 'Lunch', 'Lunch', 'Lunch'], dtype='<U6')

In [56]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [61]:
classification_report_m1 = classification_report(y_pred_m1, y_test)
print("classification_report_m1: ","\n\n",classification_report_m1)
confusion_matrix_m1 = confusion_matrix(y_pred_m1, y_test)
print("confusion_matrix_m1: ","\n\n",confusion_matrix_m1)

classification_report_m1:  

               precision    recall  f1-score   support

      Dinner       0.89      0.83      0.86        48
       Lunch       0.50      0.62      0.55        13

    accuracy                           0.79        61
   macro avg       0.69      0.72      0.71        61
weighted avg       0.81      0.79      0.79        61

confusion_matrix_m1:  

 [[40  8]
 [ 5  8]]


In [62]:
accuracy = accuracy_score(y_test, y_pred_m1)
print('Accuracy:', accuracy)

Accuracy: 0.7868852459016393


In [63]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}

grid_search = GridSearchCV(GaussianNB(), param_grid, cv=5)
grid_search.fit(x_train_scaled, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [50]:
best_params

{'var_smoothing': 1e-09}

In [51]:
best_model

In [65]:
y_pred_grid_cv = grid_search.predict(x_test_scaled)

In [66]:
classification_report_y_pred_grid_cv = classification_report(y_pred_grid_cv, y_test)
print("classification_report_y_pred_grid_cv: ","\n\n",classification_report_y_pred_grid_cv)
confusion_matrix_y_pred_grid_cv = confusion_matrix(y_pred_grid_cv, y_test)
print("confusion_matrix_m1: ","\n\n",confusion_matrix_y_pred_grid_cv)

classification_report_y_pred_grid_cv:  

               precision    recall  f1-score   support

      Dinner       0.89      0.83      0.86        48
       Lunch       0.50      0.62      0.55        13

    accuracy                           0.79        61
   macro avg       0.69      0.72      0.71        61
weighted avg       0.81      0.79      0.79        61

confusion_matrix_m1:  

 [[40  8]
 [ 5  8]]
