# Imbalance evaluation

## Imports

In [None]:
import os

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import compute_class_weight

from src.utils.const import DATA_DIR, SEED
from src.utils.util_models import fix_random

### Useful paths to data

In [None]:
# Project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), '..')
# Folder holding the processed datasets: <ROOT_DIR>/<DATA_DIR>/processed.
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

### Repeatability

In [None]:
# Call the project helper with the shared SEED.
# NOTE(review): presumably this seeds the global random generators used by the cells
# below — confirm in src.utils.util_models.
fix_random(SEED)

## Start to work

In [None]:
# Load the processed dataset stored as final.parquet inside PROCESSED_DIR.
final = pd.read_parquet(os.path.join(PROCESSED_DIR, 'final.parquet'))

### Add rating_discrete feature

In [None]:
# Number of intervals the continuous rating is cut into.
bins = 10
# Replace `rating_mean` with `rating_discrete`: the integer code (int32) of the
# interval each rating falls into, as returned by pd.cut with labels=False.
final = (
    final
    .assign(rating_discrete=lambda frame: pd.cut(frame['rating_mean'], bins=bins, labels=False))
    .astype({'rating_discrete': 'int32'})
    .drop(columns=['rating_mean'])
)
# Print the resulting schema as a quick sanity check.
final.info()

### Separate train/test

In [None]:
# Features are every column except the discretised rating, which is the target.
target = final['rating_discrete']
data = final.drop(columns='rating_discrete')
# Hold out 20% of the rows for testing, keeping the class proportions of the target
# in both splits. random_state is passed explicitly so the split does not depend on
# the global random state (and therefore on the order in which cells are executed).
train_data, test_data, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=SEED,
)

### Training function

In [None]:
def try_sample(model, train_data_inside, train_target_inside) -> None:
    """Cross-validate ``model`` and print its mean accuracy, precision and recall.

    The estimator is scored with 5-fold stratified cross-validation repeated
    3 times (splitter seeded with ``random_state=1``). Precision and recall
    are macro-averaged. Nothing is returned; the three means are printed.

    Args:
        model: Estimator handed to ``cross_validate``.
        train_data_inside: Feature matrix to cross-validate on.
        train_target_inside: Class labels matching ``train_data_inside``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    results = cross_validate(
        model,
        train_data_inside,
        train_target_inside,
        scoring=['accuracy', 'precision_macro', 'recall_macro'],
        cv=splitter,
        n_jobs=-1,
    )
    # (report template, key under which cross_validate stores the per-fold scores)
    report = (
        ('Mean Accuracy: %.4f', 'test_accuracy'),
        ('Mean Precision: %.4f', 'test_precision_macro'),
        ('Mean Recall: %.4f', 'test_recall_macro'),
    )
    for template, score_key in report:
        print(template % np.mean(results[score_key]))

## Transformations

In [None]:

# Numeric columns that get min-max scaled; all other columns pass through unchanged.
features = ['year', 'title_length', 'runtime', 'rating_count', 'tag_count']

scaler = ColumnTransformer(
    transformers=[('minmax', MinMaxScaler(), features)],
    remainder='passthrough',
)
# Second step: L2 normalisation of the scaler's output.
norm = Normalizer(norm='l2')
pipe = Pipeline(steps=[('scaler', scaler), ('norm', norm)])

# Fit on the training split only, then reuse the fitted pipeline on the test split.
train_data_proc = pipe.fit_transform(train_data)
test_data_proc = pipe.transform(test_data)

## Class weight balancing

### class_weight = 'balanced'

In [None]:
# Random forest (entropy criterion) with class_weight='balanced', scored with
# try_sample on the transformed training features.
# NOTE(review): the 'balanced_subsample' and dict class-weight cells are scored on the
# untransformed train_data instead — confirm that this difference is intended.
model_balanced = RandomForestClassifier(criterion='entropy', class_weight='balanced')
try_sample(model_balanced, train_data_proc, train_target)

### class_weight = 'balanced_subsample'

In [None]:
# Random forest (entropy criterion) with class_weight='balanced_subsample', scored with
# try_sample on the untransformed training features.
# NOTE(review): the class_weight='balanced' cell uses train_data_proc instead —
# confirm which feature matrix is intended for this comparison.
model_balanced_subsample = RandomForestClassifier(criterion='entropy', class_weight='balanced_subsample')
try_sample(model_balanced_subsample, train_data, train_target)

### class_weight = dict

In [None]:
# Compute the 'balanced' weights from the training labels only, so the held-out
# test labels play no part in fitting.
class_labels = np.unique(train_target)
class_weight = compute_class_weight('balanced', classes=class_labels, y=train_target)
# Key each weight by its actual class label rather than by its position: the two only
# coincide when the labels are exactly 0..n-1 (e.g. not when a rating bin is empty).
class_weight_dict = dict(zip(class_labels, class_weight))

# Random forest (entropy criterion) using the explicit weight mapping, scored with
# try_sample on the untransformed training features.
model_dict = RandomForestClassifier(criterion='entropy', class_weight=class_weight_dict)
try_sample(model_dict, train_data, train_target)

## ImbalancedLearn Functions

## SMOTE

In [None]:
# Oversample the transformed training set with SMOTE (4 neighbours), then cross-validate
# a random forest (entropy criterion) on the resampled data.
# NOTE(review): resampling happens before try_sample builds its folds, so validation
# folds contain synthetic rows and the scores are likely optimistic — consider
# resampling inside each training fold instead.
smt = SMOTE(k_neighbors=4)
train_data_smt_proc, train_target_smt_proc = smt.fit_resample(train_data_proc, train_target)
model_smt_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_proc, train_data_smt_proc, train_target_smt_proc)

## SMOTETomek

#### Random Forest Classifier

In [None]:
# Combine SMOTE (4 neighbours) with Tomek-link removal (sampling_strategy='majority'),
# applied to the untransformed training set, then cross-validate a random forest
# (entropy criterion) on the resampled data.
# NOTE(review): this cell resamples train_data while the other SMOTETomek cells use
# train_data_proc — confirm this is intended.
# NOTE(review): resampling before try_sample's cross-validation puts resampled rows in
# the validation folds; scores are likely optimistic.
smt_tom = SMOTETomek(smote=SMOTE(k_neighbors=4), tomek=TomekLinks(sampling_strategy='majority'))
train_data_smt_tom, train_target_smt_tom = smt_tom.fit_resample(train_data, train_target)
model_smt_tom = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_tom, train_data_smt_tom, train_target_smt_tom)

#### DecisionTreeClassifier

In [None]:
# Same SMOTE + Tomek-link resampling, this time on the transformed training set, followed
# by cross-validation of a single decision tree (entropy criterion).
# NOTE(review): resampling before try_sample's cross-validation puts resampled rows in
# the validation folds; scores are likely optimistic.
smt_tom = SMOTETomek(smote=SMOTE(k_neighbors=4), tomek=TomekLinks(sampling_strategy='majority'))
train_data_smt_tom_proc, train_target_smt_tom_proc = smt_tom.fit_resample(train_data_proc, train_target)
model_smt_tom_proc = DecisionTreeClassifier(criterion='entropy')
try_sample(model_smt_tom_proc, train_data_smt_tom_proc, train_target_smt_tom_proc)

#### GradientBoostingClassifier

In [None]:
# SMOTE + Tomek-link resampling of the transformed training set, followed by
# cross-validation of a gradient boosting classifier with default settings.
# This rebinds the names used by the decision-tree cell (smt_tom, *_smt_tom_proc).
# NOTE(review): resampling before try_sample's cross-validation puts resampled rows in
# the validation folds; scores are likely optimistic.
smt_tom = SMOTETomek(smote=SMOTE(k_neighbors=4), tomek=TomekLinks(sampling_strategy='majority'))
train_data_smt_tom_proc, train_target_smt_tom_proc = smt_tom.fit_resample(train_data_proc, train_target)
model_smt_tom_proc = GradientBoostingClassifier()
try_sample(model_smt_tom_proc, train_data_smt_tom_proc, train_target_smt_tom_proc)

## SMOTEENN

In [None]:
# Combine SMOTE (4 neighbours) with Edited Nearest Neighbours cleaning (4 neighbours) on
# the transformed training set, then cross-validate a random forest (entropy criterion).
# NOTE(review): resampling before try_sample's cross-validation puts resampled rows in
# the validation folds; scores are likely optimistic.
smt_enn = SMOTEENN(smote=SMOTE(k_neighbors=4), enn=EditedNearestNeighbours(n_neighbors=4))
train_data_smt_enn_proc, train_target_smt_enn_proc = smt_enn.fit_resample(train_data_proc, train_target)
model_smt_enn_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_enn_proc, train_data_smt_enn_proc, train_target_smt_enn_proc)

## RandomOverSampler

In [None]:
# Randomly oversample the transformed training set (default RandomOverSampler settings),
# then cross-validate a random forest (entropy criterion) on the resampled data.
# NOTE(review): random oversampling duplicates existing rows, and it is done before
# try_sample builds its folds, so copies of the same row can land in both the training
# and validation part of a fold — scores are likely optimistic.
rnd_over = RandomOverSampler()
train_data_rnd_over_proc, train_target_rnd_over_proc = rnd_over.fit_resample(train_data_proc, train_target)
model_rnd_over_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_rnd_over_proc, train_data_rnd_over_proc, train_target_rnd_over_proc)

## BaggingClassifier

### Min Threshold

In [None]:
# Per-class sample counts of the training target, raised to a floor of 500: classes with
# fewer rows get a target size of 500, larger classes keep their current size.
# clip() works on the values directly, so it does not depend on the class labels being
# exactly 0..n-1 (value_counts() is indexed by label, not by position).
bins_count = train_target.value_counts().clip(lower=500)

# Mapping {class label: desired number of samples}.
bin_sizes = bins_count.to_dict()

#### RandomForestClassifier SMOTE with threshold

In [None]:
# SMOTE (4 neighbours) on the transformed training set, with the per-class target sizes
# taken from bin_sizes, then cross-validate a random forest (entropy criterion).
# NOTE(review): resampling before try_sample's cross-validation puts synthetic rows in
# the validation folds; scores are likely optimistic.
smt_new = SMOTE(k_neighbors=4, sampling_strategy=bin_sizes)
train_data_smt_new, train_target_smt_new = smt_new.fit_resample(train_data_proc, train_target)
model_smt_new = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_new, train_data_smt_new, train_target_smt_new)

#### BaggingClassifier - RandomForestClassifier SMOTE with threshold

In [None]:
# SMOTE (4 neighbours) on the transformed training set, with the per-class target sizes
# taken from bin_sizes.
smt_new = SMOTE(k_neighbors=4, sampling_strategy=bin_sizes)
train_data_smt_new_proc, train_target_smt_new_proc = smt_new.fit_resample(train_data_proc, train_target)
# Bagging ensemble of random forests (entropy criterion). The base learner is passed
# positionally because its keyword was renamed from `base_estimator` to `estimator`
# across scikit-learn releases; the first positional slot is the same in both.
model_smt_new_bagging = BaggingClassifier(RandomForestClassifier(criterion='entropy'))
# NOTE(review): the data was resampled before cross-validation, so validation folds
# contain synthetic rows; scores are likely optimistic.
try_sample(model_smt_new_bagging, train_data_smt_new_proc, train_target_smt_new_proc)

#### BaggingClassifier - SVC SMOTE with threshold

In [None]:
# SMOTE (4 neighbours) on the transformed training set, with the per-class target sizes
# taken from bin_sizes.
smt_new = SMOTE(k_neighbors=4, sampling_strategy=bin_sizes)
train_data_smt_new_proc, train_target_smt_new_proc = smt_new.fit_resample(train_data_proc, train_target)
# Bagging ensemble of default SVCs. The base learner is passed positionally because its
# keyword was renamed from `base_estimator` to `estimator` across scikit-learn releases;
# the first positional slot is the same in both.
model_smt_new_bagging = BaggingClassifier(SVC())
# NOTE(review): the data was resampled before cross-validation, so validation folds
# contain synthetic rows; scores are likely optimistic.
try_sample(model_smt_new_bagging, train_data_smt_new_proc, train_target_smt_new_proc)

#### BaggingClassifier - RandomForestClassifier SMOTE

In [None]:
# SMOTE (4 neighbours) with its default sampling strategy on the transformed training set.
smt_new = SMOTE(k_neighbors=4)
train_data_smt_new_proc, train_target_smt_new_proc = smt_new.fit_resample(train_data_proc, train_target)
# Bagging ensemble of random forests (entropy criterion). The base learner is passed
# positionally because its keyword was renamed from `base_estimator` to `estimator`
# across scikit-learn releases; the first positional slot is the same in both.
model_smt_new_bagging = BaggingClassifier(RandomForestClassifier(criterion='entropy'))
# NOTE(review): the data was resampled before cross-validation, so validation folds
# contain synthetic rows; scores are likely optimistic.
try_sample(model_smt_new_bagging, train_data_smt_new_proc, train_target_smt_new_proc)