### Importing and Cleaning Data

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

ORIGINAL_DATASET = "WineQT.xlsx"

# Column name -> dtype mapping handed to pandas so each column is parsed with
# a fixed type instead of an inferred one.
ORIGINAL_DATA_COLUMNS = {
    "fixed acidity": float,
    "volatile acidity": float,
    "citric acid": float,
    "residual sugar": float,
    "chlorides": float,
    "free sulfur dioxide": float,
    "total sulfur dioxide": float,
    "density": float,
    "pH": float,
    "sulphates": float,
    "alcohol": float,
    "quality": int,
    "Id": int,
}

TARGET_COLUMN = "quality"
ID_COLUMN = "Id"
# Every declared column except the target and the row identifier is a feature.
# Selecting by name (rather than by position) keeps the split correct even if
# the spreadsheet's column order changes.
FEATURE_COLUMNS = [
    name for name in ORIGINAL_DATA_COLUMNS if name not in (TARGET_COLUMN, ID_COLUMN)
]

# Subtracted from the quality score to obtain the class label.
# NOTE(review): presumably the lowest score in the dataset, so labels start at 0 -- confirm.
QUALITY_OFFSET = 3
TEST_SIZE = 0.33
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same split and
# therefore the same accuracy / confusion-matrix numbers.
RANDOM_SEED = 42

df = pd.read_excel(ORIGINAL_DATASET, dtype=ORIGINAL_DATA_COLUMNS)

# X: one row per sample, one column per feature; y: shifted quality label.
X = df[FEATURE_COLUMNS].to_numpy()
y = df[TARGET_COLUMN].to_numpy() - QUALITY_OFFSET

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

### Training and Testing Naive Bayes

In [2]:
from naive_bayes import *

# Fit the model; an empty dict is passed as the parameter set.
nb_params = dict()
# The features are handed over transposed.
# NOTE(review): presumably naive_bayes_train expects (features, samples) -- confirm.
nb_model = naive_bayes_train(np.transpose(X_train), y_train, nb_params)

# --- Training split: fraction of samples whose predicted label matches the true label.
nb_train_predictions = naive_bayes_predict(np.transpose(X_train), nb_model)
train_hits = nb_train_predictions == y_train
nb_train_accuracy = np.mean(train_hits)
print("Naive Bayes training accuracy: %f" % (nb_train_accuracy,))

# --- Held-out split: same measurement on data the model was not fitted on.
nb_test_predictions = naive_bayes_predict(np.transpose(X_test), nb_model)
test_hits = nb_test_predictions == y_test
nb_test_accuracy = np.mean(test_hits)
print("Naive Bayes testing accuracy: %f" % (nb_test_accuracy,))

(765, 11) (378, 11) (765,) (378,)
Counter({3: 313, 2: 307, 4: 104, 1: 23, 5: 12, 0: 6})
Naive Bayes training accuracy: 0.567320
Naive Bayes testing accuracy: 0.563492


### Confusion Matrix

In [3]:
from sklearn.metrics import confusion_matrix

# Pin the label set to every class present in the full dataset. Without
# `labels`, the matrix is built only from classes that occur in y_test or the
# predictions, so a rare class missing from both would silently shrink the
# matrix and shift the meaning of its rows and columns.
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_test, nb_test_predictions, labels=np.unique(y))

array([[  0,   0,   0,   0,   0,   0],
       [  0,   2,   2,   5,   1,   0],
       [  1,  11, 108,  52,   4,   0],
       [  0,  13,  37,  84,  15,   0],
       [  0,   0,   0,  18,  19,   2],
       [  0,   0,   0,   2,   2,   0]], dtype=int64)

### Mean Absolute Error and RMSE

In [5]:
# Signed per-sample gap between the predicted and the true class label.
diff = nb_test_predictions - y_test

# Mean absolute error: average size of the gap, ignoring its direction.
error = np.abs(diff).mean()

# Root mean square error: squaring penalises large misses more heavily.
squared_gap = np.square(diff)
rmse = np.sqrt(squared_gap.mean())

print("Error of actual value to predicted value: ", error)
print("Root Mean Square Error: ", rmse)

Error of actual value to predicted value:  0.5079365079365079
Root Mean Square Error:  0.8099905283919412
