# Sklearn models

## Imports

In [None]:
import os
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, zero_one_loss, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from src.models.config import best_param_grid_model
from src.models.sklearn_models import balance, preprocess
from src.utils.const import DATA_DIR, SEED, NUM_BINS
from src.utils.util_models import fix_random

### Useful paths to data

In [None]:
# Parent of the current working directory; base for the data paths below.
ROOT_DIR = os.path.join(os.getcwd(), os.pardir)
# Directory that holds the processed dataset files.
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

### Fix random seed

In [None]:
# Seed the random generators through the project helper so results are repeatable.
# NOTE(review): which generators get seeded is decided in src.utils.util_models
# (not visible here) — confirm it covers numpy's global RNG, which the shuffled
# StratifiedKFold splitters in train() fall back to when random_state is unset.
fix_random(SEED)

## Import final dataset

In [None]:
# Load the processed dataset, then swap the continuous 'rating_mean' column for
# an integer bin index ('rating_discrete') obtained by cutting it into NUM_BINS bins.
final_stored = pd.read_parquet(os.path.join(PROCESSED_DIR, 'final.parquet'))
final = final_stored.drop(columns=['rating_mean']).assign(
    rating_discrete=pd.cut(
        final_stored['rating_mean'], bins=NUM_BINS, labels=False
    ).astype('int32')
)

## Training

In [None]:
def train(df: pd.DataFrame,
          model_group: str,
          model_idx: int,
          param_grid: Dict,
          n_splits: int = 2) -> None:
    """Evaluate one configured model with nested stratified cross-validation.

    The outer loop splits ``df`` into train/test folds. For each fold the
    training part is balanced and preprocessed, a ``GridSearchCV`` (inner CV,
    accuracy scoring) picks the best hyper-parameters, and the refitted best
    estimator is scored on the held-out outer fold. Per-fold metrics and the
    mean accuracy / weighted F1 over the outer folds are printed.

    Args:
        df: Data set containing the feature columns and the
            ``'rating_discrete'`` target column.
        model_group: Key selecting a group of models inside ``param_grid``.
        model_idx: Position of the model inside the selected group.
        param_grid: Mapping from group name to a sequence of
            ``(model_name, estimator, grid)`` triples.
        n_splits: Number of folds used by both the outer and the inner
            stratified splitter (default 2, the previously hard-coded value).

    Returns:
        None. Results are reported through ``print``.
    """
    target_column = 'rating_discrete'
    data = df.loc[:, df.columns != target_column]
    target = df[target_column]

    # Unpack the single selected configuration directly; the grid gets its own
    # name so it no longer shadows the ``param_grid`` argument.
    model_name, estimator, estimator_grid = param_grid[model_group][model_idx]

    # NOTE(review): random_state is left unset, so the shuffling relies on the
    # global numpy RNG state at call time — confirm that is seeded beforehand.
    cv_outer = StratifiedKFold(n_splits=n_splits, shuffle=True)
    cv_inner = StratifiedKFold(n_splits=n_splits, shuffle=True)

    print(f'Model name: {model_name}')
    outer_results = []
    outer_f1_results = []
    for fold, (train_idx, test_idx) in enumerate(cv_outer.split(data, y=target), 1):
        print(f'Fold {fold}')
        train_data, test_data = data.iloc[train_idx, :], data.iloc[test_idx, :]
        # The splitter yields positional indices, so the target must be sliced
        # with .iloc as well; label-based ``target[idx]`` is only correct when
        # the frame happens to carry a default RangeIndex.
        train_target, test_target = target.iloc[train_idx], target.iloc[test_idx]

        # Only the training part is balanced; the outer test fold stays as is.
        # NOTE(review): balancing happens before the inner grid search, so the
        # inner validation folds contain resampled rows and inner scores may be
        # optimistic. Presumably ``balance`` oversamples — confirm in
        # src.models.sklearn_models.
        train_data_bal, train_target_bal = balance(train_data, train_target)
        train_data_proc, test_data_proc = preprocess(train_data_bal, test_data)

        search = GridSearchCV(estimator=estimator,
                              param_grid=estimator_grid,
                              scoring='accuracy',
                              cv=cv_inner,
                              refit=True,
                              n_jobs=-1,
                              verbose=3)

        search.fit(train_data_proc, train_target_bal)
        best_model = search.best_estimator_
        y_pred = best_model.predict(test_data_proc)
        acc = accuracy_score(test_target, y_pred)
        loss = zero_one_loss(test_target, y_pred)
        f1_test = f1_score(test_target, y_pred, average='weighted')
        outer_results.append(acc)
        outer_f1_results.append(f1_test)
        # ':.3f' (three decimals) replaces ':3f', which only set a minimum
        # width of 3 and therefore had no visible effect.
        print(f'loss={loss:.3f}, acc={acc:.3f} ,f1-score={f1_test:.3f}, cfg={search.best_params_}')

    print(
        f'[{model_name}] [test] Mean accuracy: {np.mean(outer_results):.3f} - Mean f1-score: {np.mean(outer_f1_results):.3f}')

## Tree methods

### RandomForestClassifier

In [None]:
# Nested-CV evaluation of entry 0 of the 'tree_based' group.
train(df=final, model_group='tree_based', model_idx=0, param_grid=best_param_grid_model)

### DecisionTreeClassifier

In [None]:
# Nested-CV evaluation of entry 1 of the 'tree_based' group.
train(df=final, model_group='tree_based', model_idx=1, param_grid=best_param_grid_model)

## Naive Bayes methods

### GaussianNB

In [None]:
# Nested-CV evaluation of entry 0 of the 'naive_bayes' group.
train(df=final, model_group='naive_bayes', model_idx=0, param_grid=best_param_grid_model)

### QuadraticDiscriminantAnalysis

In [None]:
# Nested-CV evaluation of entry 1 of the 'naive_bayes' group.
train(df=final, model_group='naive_bayes', model_idx=1, param_grid=best_param_grid_model)

## SVM

### SVC

In [None]:
# Nested-CV evaluation of entry 0 of the 'svm' group.
train(df=final, model_group='svm', model_idx=0, param_grid=best_param_grid_model)