### Importing and Cleaning Data

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

ORIGINAL_DATASET = "WineQT.xlsx"

ORIGINAL_DATA_COLUMNS = { "fixed acidity": float,	"volatile acidity": float,	"citric acid": float,	"residual sugar": float,
                    "chlorides": float,	"free sulfur dioxide": float,	"total sulfur dioxide": float,	"density": float,	"pH": float,
                    "sulphates": float,	"alcohol": float,	"quality": int, "Id": int}

df = pd.read_excel(ORIGINAL_DATASET, dtype=ORIGINAL_DATA_COLUMNS)

X = np.array(df.iloc[:, 0:11].values)
y = np.array(df.iloc[:, 11].values - 3)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))

### Training and Testing Naive Bayes

In [2]:
from naive_bayes import *

nb_params = {}
nb_model = naive_bayes_train(X_train.T, y_train, nb_params)

nb_train_predictions = naive_bayes_predict(X_train.T, nb_model)

nb_train_accuracy = np.mean(nb_train_predictions == y_train)

print("Naive Bayes training accuracy: %f" % nb_train_accuracy)

nb_test_predictions = naive_bayes_predict(X_test.T, nb_model)

nb_test_accuracy = np.mean(nb_test_predictions == y_test)

print("Naive Bayes testing accuracy: %f" % nb_test_accuracy)

Naive Bayes training accuracy: 0.543791
Naive Bayes testing accuracy: 0.576720


### Confusion Matrix

In [3]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, nb_test_predictions)

array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  3,  7,  4,  0,  0],
       [ 1, 10, 95, 44,  4,  0],
       [ 0,  6, 37, 88, 28,  0],
       [ 0,  0,  2, 13, 32,  0],
       [ 0,  0,  0,  0,  4,  0]], dtype=int64)

### Error and RMSE

In [4]:
diff = nb_test_predictions - y_test
error = np.mean(np.abs(diff))
rmse = np.sqrt(np.mean(diff*diff))
print("Error of actual value to predicted value: ", error)
print("Root Mean Square Error: ", rmse)

Error of actual value to predicted value:  0.46825396825396826
Root Mean Square Error:  0.7471285419531624


### Saving and Loading Weights

In [5]:
import pickle 

with open('save_nb_model.txt', 'wb') as f:
    pickle.dump(nb_model, f)

with open('save_nb_model.txt', 'rb') as f:
    model = pickle.load(f)
    print(type(model))
    print(model)

<class 'dict'>
{'cond_means': array([[ 8.45      ,  7.87368421,  8.26960486,  8.33333333,  8.74895833,
         8.65833333],
       [ 0.8975    ,  0.69026316,  0.58530395,  0.50990099,  0.38895833,
         0.40916667],
       [ 0.21166667,  0.17631579,  0.24358663,  0.26072607,  0.3909375 ,
         0.41166667],
       [ 2.66666667,  2.98421053,  2.57613982,  2.44026403,  2.671875  ,
         2.55      ],
       [ 0.10533333,  0.10252632,  0.09009119,  0.08523762,  0.0769375 ,
         0.06866667],
       [ 8.16666667, 13.57894737, 16.92401216, 14.99339934, 14.72916667,
         8.75      ],
       [24.5       , 38.        , 55.94072948, 40.01650165, 39.05208333,
        25.91666667],
       [ 0.99768167,  0.99682368,  0.99716547,  0.99661241,  0.99594979,
         0.99555333],
       [ 3.36166667,  3.36842105,  3.30045593,  3.32273927,  3.28989583,
         3.23166667],
       [ 0.55      ,  0.62157895,  0.6093617 ,  0.67831683,  0.73760417,
         0.79583333],
       [ 9.69166667,