In [None]:
from matplotlib import pyplot as plt
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Naive Bayes methods

## Imports

In [69]:
import os
from typing import List, Tuple
import seaborn as sns
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from matplotlib import pyplot as plt
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB

from src.utils.const import DATA_DIR
from sklearn.preprocessing import MinMaxScaler, normalize


### Useful paths to data

In [None]:
# Project root is taken to be the parent of the current working directory;
# each data-stage directory lives under <root>/<DATA_DIR>/<stage>.
ROOT_DIR = os.path.join(os.getcwd(), '..')
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, stage)
    for stage in ('raw', 'interim', 'processed')
)

## Import final.parquet

In [None]:
# Load the dataset stored as final.parquet in the processed-data directory.
final_parquet_path = os.path.join(PROCESSED_DIR, 'final.parquet')
final = pd.read_parquet(final_parquet_path)

## Select categorical columns

In [None]:
# Column labels at positions 2..20 (inclusive) of `final`.
# NOTE(review): the slice is positional, so it silently selects different
# columns if the column order of final.parquet changes — confirm the range.
index_to_drop = final.columns[2:21]

In [None]:
# Remove the columns listed in `index_to_drop` from `final`.
final = final.drop(columns=index_to_drop)

In [None]:
# Display the column labels of `final`.
final.columns

### Splitting train and test

In [None]:
# Hold out 20% of the rows as the test set. The split is seeded so that
# Restart & Run All reproduces the same train/test partition (and therefore
# the same downstream class counts and scores).
train_data, test_data = train_test_split(final, test_size=0.2, random_state=42)

In [None]:
# Display the column labels of the training split.
train_data.columns

In [None]:
target = 'rating_mean'
n_bins = 9

# Compute ONE set of bin edges over the target values of both splits.
# Calling pd.cut(..., bins=9) separately on train and test derives the edges
# from each split's own min/max, so the same class id could stand for a
# different rating interval in train than in test.
rating_bin_edges = pd.cut(
    pd.concat([train_data[target], test_data[target]]),
    bins=n_bins,
    retbins=True,
)[1]

# Features: every column except the target (each split is masked with its own columns).
X_train = train_data.loc[:, train_data.columns != target]
X_test = test_data.loc[:, test_data.columns != target]

# Labels: integer bin index (0 .. n_bins - 1) of the target under the shared edges.
y_train = pd.cut(train_data[target], bins=rating_bin_edges, labels=False, include_lowest=True)
y_test = pd.cut(test_data[target], bins=rating_bin_edges, labels=False, include_lowest=True)

## Balancing Dataset

### SMOTE

In [None]:
# Class counts before resampling.
print(np.unique(y_train, return_counts=True))

# SMOTE is stochastic; seed it so the resampled training set is reproducible.
# NOTE(review): the RandomOverSampler cell that follows assigns the same
# X_train_over / y_train_over names, so on a full run this result is overwritten.
oversample = SMOTE(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

# Class counts after resampling.
print(np.unique(y_train_over, return_counts=True))

### RandomOverSampler

In [None]:
# Class counts before resampling.
print(np.unique(y_train, return_counts=True))

# Random oversampling draws samples at random; seed it for reproducibility.
# NOTE(review): sampling_strategy='minority' resamples only the single
# smallest class, so the other classes stay at their original counts —
# confirm this is intended for a 9-class target.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

# Class counts after resampling.
print(np.unique(y_train_over, return_counts=True))

### Check balancing

In [None]:
# Histogram of the binned training labels before oversampling.
ax_before = sns.histplot(data=y_train, discrete=True)
ax_before.set(xlabel='Classes', ylabel='Number of samples', title='Binning before oversampling')

In [None]:
# Histogram of the training labels after oversampling.
ax_after = sns.histplot(data=y_train_over, discrete=True)
ax_after.set(xlabel='Classes', ylabel='Number of samples')
plt.title('Binning after oversampling')

### Normalization and Min-Max Scaling

In [None]:
def scale_features(train_data_origin, test_data_origin, scaler, features: List[str]) -> Tuple[pd.DataFrame,pd.DataFrame]:
    """Scale the selected columns of a train/test pair, one column at a time.

    For every name in ``features`` the scaler is re-fitted on the training
    column (``fit_transform``) and the fitted scaler is then applied to the
    matching test column (``transform``). Columns not listed are left as-is.

    Args:
        train_data_origin: Training frame; it is copied, not modified.
        test_data_origin: Test frame; it is copied, not modified.
        scaler: Object exposing ``fit_transform`` and ``transform`` on
            arrays of shape ``(n_samples, 1)``. It is re-fitted for each
            feature, so on return it holds the fit of the last feature.
        features: Names of the columns to scale.

    Returns:
        ``(train, test)`` copies with the listed columns replaced by their
        scaled values.
    """
    scaled_train_df, scaled_test_df = train_data_origin.copy(), test_data_origin.copy()

    for column in features:
        # The scaler is handed 2-D input, so each column becomes (n_samples, 1).
        train_column = scaled_train_df[column].to_numpy()[:, np.newaxis]
        test_column = scaled_test_df[column].to_numpy()[:, np.newaxis]

        # Fit on train only; the test column reuses the train-fitted parameters.
        scaled_train_df[column] = np.squeeze(scaler.fit_transform(train_column))
        scaled_test_df[column] = np.squeeze(scaler.transform(test_column))

    return scaled_train_df, scaled_test_df


def normalize_df(train_data_origin, test_data_origin, norm: str = 'l2') -> Tuple[pd.DataFrame,pd.DataFrame]:
    """Normalize a train/test pair with ``normalize`` using the given norm.

    The previous implementation returned the bare arrays produced by
    ``normalize`` despite being annotated as returning DataFrames, which
    dropped the index and column labels. The results are now wrapped back
    into DataFrames carrying the labels of the corresponding input.

    Args:
        train_data_origin: Training features; not modified.
        test_data_origin: Test features; not modified.
        norm: Norm name forwarded to ``normalize`` (default ``'l2'``).

    Returns:
        ``(train, test)`` DataFrames of normalized values with the same
        shape, index and columns as the inputs.
    """
    def _normalize_frame(data) -> pd.DataFrame:
        # Accept array-likes as before by promoting them to a DataFrame first.
        frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
        return pd.DataFrame(
            normalize(frame, norm=norm),
            index=frame.index,
            columns=frame.columns,
        )

    return _normalize_frame(train_data_origin), _normalize_frame(test_data_origin)


In [None]:
# Columns handed to the scaler.
features = ['year', 'title_length', 'runtime', 'rating_count', 'tag_count']

# Min-max scaling is used here; another scaler with the same
# fit_transform/transform interface could be swapped in.
scaler = MinMaxScaler()

# Feature matrices: every column except the target.
X_train = train_data.loc[:, train_data.columns != target]
X_test = test_data.loc[:, test_data.columns != target]

# Note: the oversampled training features (X_train_over) are what get scaled,
# not X_train. Scaling runs first, then normalization of the scaled frames.
X_train_scaled, X_test_scaled = scale_features(X_train_over, X_test, scaler, features)
X_train_norm, X_test_norm = normalize_df(X_train_scaled, X_test_scaled)

## Gaussian naive bayes

In [None]:
# Fit Gaussian Naive Bayes on the oversampled, normalized training data.
# (The former chained .predict(...) result was discarded; score() performs
# its own prediction.)
gnb = GaussianNB()
gnb.fit(X_train_norm, y_train_over)
print("Avg accuracy: ", gnb.score(X_test_norm, y_test))

print("Mean of the Gaussian Estimators")
print(gnb.theta_)

# `sigma_` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `var_`; both hold per-class feature VARIANCES. Fall back to `sigma_` only on
# old releases, and take the square root so the printed values match the
# "Std Dev" label.
gnb_variances = gnb.var_ if hasattr(gnb, "var_") else gnb.sigma_
print("Std Dev of the Gaussian Estimators")
print(np.sqrt(gnb_variances))

## Quadratic discriminant analysis

In [None]:
# Fit QDA on the oversampled, normalized training data. store_covariance=True
# is required for the per-class covariance matrices printed below.
# (The former chained .predict(...) result was discarded; score() performs
# its own prediction, so the extra inference pass is dropped.)
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(X_train_norm, y_train_over)
print("Avg accuracy: ", qda.score(X_test_norm, y_test))

print("QDA means per class")
print(qda.means_)
print("QDA covariance per class")
print(qda.covariance_)

In [None]:
# Fit Categorical Naive Bayes on the same inputs as the models above.
# (The former chained .predict(...) result was discarded; score() performs
# its own prediction.)
# NOTE(review): CategoricalNB is documented for features encoded as
# non-negative integer category codes, while X_train_norm / X_test_norm are
# scaled and normalized values — confirm this model is fed the intended features.
cat = CategoricalNB()
cat.fit(X_train_norm, y_train_over)
print("Avg accuracy: ", cat.score(X_test_norm, y_test))