# Hyperparameter tuning

## Imports

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.svm import SVC

from src.utils.const import DATA_DIR

### Useful paths to data

In [None]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), os.pardir)
# Folder holding the processed datasets, located under the project data dir.
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

## Start to work

In [None]:
# Load the processed dataset that every later cell works on.
final_path = os.path.join(PROCESSED_DIR, 'final.parquet')
final = pd.read_parquet(final_path)

### Add rating_discrete feature

In [None]:
# Number of equal-width intervals used to discretise the mean rating.
bins = 10

# Replace the continuous `rating_mean` column with its integer bin index
# (`labels=False` makes `pd.cut` return the bin position instead of an
# interval label); the new column is stored as int32.
final = (
    final
    .assign(
        rating_discrete=lambda df: pd.cut(
            df['rating_mean'], bins=bins, labels=False
        ).astype('int32')
    )
    .drop(columns=['rating_mean'])
)
final.info()

### Separate train/test

In [None]:
# Features are every column except the label; the label is the rating bin.
target = final['rating_discrete']
data = final.drop(columns=['rating_discrete'])

# Hold out 20% of the rows for testing, keeping the class proportions of the
# label the same in both splits.
train_data, test_data, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
)

### Prepare Pipeline

#### Scaler

In [None]:
# Numeric columns that are rescaled before modelling.
features = ['year', 'title_length', 'runtime', 'rating_count', 'tag_count']

# Min-max scale the columns listed above (MinMaxScaler defaults); every other
# column is passed through untouched.
scaler = ColumnTransformer(
    transformers=[('minmax', MinMaxScaler(), features)],
    remainder='passthrough',
)

#### Normalizer

In [None]:
# Rescale each sample (row) to unit L2 norm; `copy=True` is the library default.
norm = Normalizer(norm='l2', copy=True)

#### Pipeline

In [None]:
# Preprocessing only: column scaling followed by row normalisation.
pipe = Pipeline(steps=[('scaler', scaler), ('norm', norm)])

# Fit the preprocessing on the training split only, then apply the fitted
# transformers to the held-out split.
train_data_proc = pipe.fit_transform(train_data)
test_data_proc = pipe.transform(test_data)

In [None]:
from src.models.config import param_grid_forest

# Inverse-frequency class weights: the rarer a rating bin is in the training
# split, the more each of its samples counts.
# `np.bincount` also reports a zero count for every label below the maximum
# that never occurs (an empty bin); those labels are skipped so we neither
# divide by zero nor hand the forest weights for classes absent from the target.
counts = np.bincount(train_target)
class_weight = {
    label: 1. / count
    for label, count in enumerate(counts)
    if count > 0
}

estimator = RandomForestClassifier(class_weight=class_weight)

# Exhaustive search over the project-defined random-forest grid with 5-fold
# cross-validation, scored by accuracy and run on all available cores.
# `error_score='raise'` makes a failing fit abort the search instead of being
# recorded as a missing score.
search = GridSearchCV(estimator=estimator,
                      param_grid=param_grid_forest,
                      cv=5,
                      verbose=1,
                      scoring='accuracy',
                      n_jobs=-1,
                      error_score='raise')

search.fit(train_data_proc, train_target)

In [None]:
# Predict the held-out split with the refitted best estimator and report the
# fraction of misclassified samples.
predicted_target = search.predict(test_data_proc)
test_loss = zero_one_loss(test_target, predicted_target)
print(f'Loss: {test_loss:.3f}')

In [None]:
# Score of the best estimator on the held-out split, using the search's own
# scoring setting.
test_score = search.score(test_data_proc, test_target)
print(f'Score: {test_score:.3f}')

# Tune GaussianNB's `var_smoothing` with plain K-fold cross-validation.
# `var_smoothing` belongs to GaussianNB, so the classifier has to be a step of
# the searched pipeline; the preprocessing-only pipeline has no such parameter.
# Keeping preprocessing inside the searched estimator also means the scaler is
# re-fitted on the training part of every fold.
nb_pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('norm', norm),
    ('classifier', GaussianNB())
])

kf = KFold()
# Parameters of a pipeline step are addressed as `<step name>__<parameter>`.
param_grid = {'classifier__var_smoothing': np.logspace(0, -9, num=100)}

gs = GridSearchCV(estimator=nb_pipe,
                  param_grid=param_grid,
                  cv=kf,
                  verbose=1,
                  scoring='accuracy')

# The raw training split is used here because the pipeline preprocesses it.
gs.fit(train_data, train_target)
gs.best_params_
Put the GridSearchCV step inside the Pipeline, after the preprocessing steps.

# Baseline: GaussianNB with default hyper-parameters behind the preprocessing.
# NOTE(review): `preprocessor` was not defined anywhere in this notebook; it is
# assumed to mean the scaler + normalizer chain used above - confirm.
preprocessor = Pipeline(steps=[('scaler', scaler), ('norm', norm)])
classifier = GaussianNB()
pipe = Pipeline(
    steps=[('preprocessor', preprocessor), ('classifier', classifier)]
)

# Fit on the raw training split (the pipeline preprocesses it) and report the
# pipeline's score on the held-out split.
pipe.fit(train_data, train_target)
score = pipe.score(test_data, test_target)
print(f'Model score: {score:.3f}')

# Grid search for an RBF-kernel SVC on the already preprocessed features.
estimator = SVC()
# Stratified folds keep the rating-bin proportions similar across folds.
cv = StratifiedKFold()

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}

# The target has several rating bins, so the binary-only 'precision' scorer
# cannot be used; 'precision_macro' averages the per-class precision.
gs = GridSearchCV(estimator=estimator,
                  param_grid=param_grid,
                  cv=cv,
                  verbose=1,
                  scoring='precision_macro',
                  refit=True,
                  n_jobs=-1)

# Use the arrays produced by the preprocessing pipeline earlier in the notebook.
gs.fit(train_data_proc, train_target)
predict_target = gs.predict(test_data_proc)

gs.best_params_

gs.best_score_