# NB methods

## Imports

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from torch.utils.data import Subset

from src.data.preprocessing import preprocessing
from src.utils.const import DATA_DIR

from src.utils.const import SEED

### Useful paths to data

In [None]:
# The project root is taken to be the parent of the current working directory
# (presumably the notebook is launched from a sub-folder -- confirm).
ROOT_DIR = os.path.join(os.getcwd(), '..')
# One folder per data stage, all located under ROOT_DIR/DATA_DIR.
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, stage)
    for stage in ('raw', 'interim', 'processed')
)

## Import final.parquet

In [None]:
# Load the processed dataset from PROCESSED_DIR into a DataFrame.
final = pd.read_parquet(
    path=os.path.join(PROCESSED_DIR, 'final.parquet'),
)

## Select categorical columns

In [None]:
# Labels of the columns at positions 2..20 (the slice stop, 21, is exclusive).
index_to_drop = final.columns[slice(2, 21)]

In [None]:
# Remove the selected columns from `final` in place (the name is not rebound).
final.drop(index_to_drop, axis='columns', inplace=True)

In [None]:
def split(data, *, test_size=0.2, val_size=0.1, seed=SEED):
    """Split ``data`` into train, test and validation subsets.

    The split is done in two passes with ``train_test_split``: first the test
    subset is carved out of ``data``, then the validation subset is carved out
    of what remains. ``val_size`` is therefore a fraction of the non-test
    remainder, not of the full input (with the defaults: roughly 72 % train,
    20 % test, 8 % validation).

    Both passes use the same ``seed``, so calling this function on two inputs
    of equal length with the same arguments uses the same random state for
    each of them.

    Args:
        data: Array-like / DataFrame / Series accepted by ``train_test_split``.
        test_size: Proportion (or absolute count) of ``data`` held out as test.
        val_size: Proportion (or absolute count) of the non-test remainder
            held out as validation.
        seed: Random state passed to both ``train_test_split`` calls.

    Returns:
        A ``(train, test, val)`` tuple -- note that the test subset comes
        before the validation subset.
    """
    remainder, test = train_test_split(
        data, test_size=test_size, random_state=seed
    )
    train, val = train_test_split(
        remainder, test_size=val_size, random_state=seed
    )

    return train, test, val

In [None]:
target = 'rating_mean'

# Class labels: the continuous target discretised into 10 equal-width bins;
# labels=False makes pd.cut return the integer bin index instead of intervals.
y = pd.cut(final[target], bins=10, labels=False)
# Feature matrix: every column of `final` except the target.
X = final.loc[:, final.columns != target]

# split() returns (train, test, val). X and y are split by separate calls, so
# row alignment between them relies on split() using the same fixed seed.
X_train, X_test, X_val = split(X)
y_train, y_test, y_val = split(y)

## Gaussian naive bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split and report its
# accuracy on the test split together with the fitted per-class parameters.
gnb = GaussianNB()
# score() predicts on X_test internally, so no separate predict() call whose
# result would be thrown away is needed.
gnb.fit(X_train, y_train)
print("Avg accuracy: ", gnb.score(X_test, y_test))
print("Mean of the Gaussian Estimators")
print(gnb.theta_)
print("Std Dev of the Gaussian Estimators")
# The fitted per-class feature *variances* live in `var_`; the older `sigma_`
# alias was deprecated in scikit-learn 1.0 and removed in 1.2, so fall back to
# it only when `var_` is absent. The square root is taken so the printed
# values really are standard deviations, as the label above states.
print(np.sqrt(gnb.var_ if hasattr(gnb, 'var_') else gnb.sigma_))

## Quadratic discriminant analysis

In [None]:
# Fit Quadratic Discriminant Analysis on the training split and report its
# accuracy on the test split together with the fitted per-class parameters.
# store_covariance=True is needed so that `covariance_` is available below.
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
# score() predicts on X_test internally, so no separate predict() call whose
# result would be thrown away is needed.
qda.fit(X_train, y_train)
print("Avg accuracy: ", qda.score(X_test, y_test))
print("QDA means per class")
print(qda.means_)
print("QDA covariance per class")
print(qda.covariance_)