# Gaussian Naive Bayes


In [1]:
import pandas as pd
# Load the preprocessed training table and keep only the columns used below:
# the selected predictors plus the 'Absent' column.
# NOTE: 'Work load Average/day ' carries a trailing space on purpose; it has to
# match the CSV header exactly, because DataFrame.filter(items=...) keeps only
# the labels it finds and does not raise on ones it cannot match.
train = pd.read_csv('prep_train.csv').filter(
    items=[
        'Reason for absence',
        'Month of absence',
        'Day of the week',
        'Seasons',
        'Distance from Residence to Work',
        'Work load Average/day ',
        'Education',
        'Weight',
        'Absent',
    ]
)
# Preview the first ten rows as the cell's rich output.
train.head(10)

Unnamed: 0,Reason for absence,Month of absence,Day of the week,Seasons,Distance from Residence to Work,Work load Average/day,Education,Weight,Absent
0,1.0,0.0,1.0,0.0,0.659574,0.194471,0.0,0.653846,1.0
1,0.0,0.0,1.0,0.0,0.170213,0.194471,0.0,0.807692,0.0
2,0.5,0.0,1.0,0.0,0.978723,0.194471,0.0,0.634615,1.0
3,1.0,0.0,1.0,0.0,0.0,0.194471,0.0,0.230769,1.0
4,0.5,0.0,1.0,0.0,0.659574,0.194471,0.0,0.653846,1.0
5,0.5,0.0,0.0,0.0,0.978723,0.194471,0.0,0.634615,1.0
6,1.0,0.0,0.0,0.0,1.0,0.194471,0.0,0.461538,1.0
7,0.5,0.0,0.0,0.0,0.957447,0.194471,0.0,0.173077,1.0
8,1.0,0.0,0.0,0.0,0.148936,0.194471,0.0,0.75,1.0
9,1.0,0.0,0.0,0.0,0.12766,0.194471,1.0,0.615385,1.0


In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Candidate values for GaussianNB's `var_smoothing`, tried in the listed order.
param_grid = {
    'var_smoothing': [1e-9, 1e-10, 1e-8, 1e-11, 1e-7, 1e-6, 1e-5, 1e-12],
}

# Hold out 30% of the rows for the final evaluation. 'Absent' is passed as the
# target and every remaining column as a feature; the fixed random_state keeps
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(['Absent'], axis=1),
    train['Absent'],
    test_size=0.30,
    random_state=101,
)

# Search var_smoothing by cross-validated accuracy on the training split only.
# cv is pinned to 3 folds: the recorded run used the old implicit default
# (3 folds, with a deprecation warning), whereas newer scikit-learn releases
# default to 5 folds. Passing it explicitly keeps the search identical across
# versions. refit=True retrains the best candidate on all of X_train so that
# `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(
    GaussianNB(),
    param_grid,
    cv=3,
    refit=True,
    verbose=3,
    scoring='accuracy',
)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] var_smoothing=1e-09 .............................................
[CV] ................. var_smoothing=1e-09, score=0.771, total=   0.0s
[CV] var_smoothing=1e-09 .............................................
[CV] ................. var_smoothing=1e-09, score=0.836, total=   0.0s
[CV] var_smoothing=1e-09 .............................................
[CV] ................. var_smoothing=1e-09, score=0.853, total=   0.0s
[CV] var_smoothing=1e-10 .............................................
[CV] ................. var_smoothing=1e-10, score=0.771, total=   0.0s
[CV] var_smoothing=1e-10 .............................................
[CV] ................. var_smoothing=1e-10, score=0.836, total=   0.0s
[CV] var_smoothing=1e-10 .............................................
[CV] ................. var_smoothing=1e-10, score=0.853, total=   0.0s
[CV] var_smoothing=1e-08 .............................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    0.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=GaussianNB(priors=None, var_smoothing=1e-09), iid='warn',
             n_jobs=None,
             param_grid={'var_smoothing': [1e-09, 1e-10, 1e-08, 1e-11, 1e-07,
                                           1e-06, 1e-05, 1e-12]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [3]:
# Show which var_smoothing value the grid search selected.
print(grid.best_params_)

# Predict on the held-out split, then report the confusion matrix followed by
# the overall accuracy.
# In scikit-learn's confusion_matrix, rows are true labels and columns are
# predicted labels.
predictions = grid.predict(X_test)
print(
    confusion_matrix(y_true=y_test, y_pred=predictions),
    accuracy_score(y_true=y_test, y_pred=predictions),
    sep='\n',
)

{'var_smoothing': 1e-09}
[[ 15  11]
 [  2 122]]
0.9133333333333333
