# Imbalance evaluation

## Imports

In [5]:
import os

import numpy as np
import pandas as pd
from sklearn import ensemble
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RandomUnderSampler
from sklearn import tree, svm
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import compute_class_weight

from src.utils.const import DATA_DIR, SEED
from src.utils.util_models import fix_random

### Useful path to data

In [25]:
ROOT_DIR = os.path.join(os.getcwd(), '..')
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

### Fix random seed

In [26]:
fix_random(SEED)

## Start to work

In [29]:
final = pd.read_parquet(os.path.join(PROCESSED_DIR, 'final.parquet'))

### Add rating_discrete feature

In [30]:
final = (final
         .assign(rating_discrete=pd.cut(final.loc[:, 'rating_mean'], bins=N_BINS, labels=False))
         .astype({'rating_discrete': 'int32'})
         .drop(columns=['rating_mean']))
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13147 entries, 0 to 13146
Columns: 1153 entries, year to rating_discrete
dtypes: float32(1130), float64(1), int32(22)
memory usage: 58.0 MB


### Separate train/test

In [31]:
data = final.loc[:, final.columns != 'rating_discrete']
target = final['rating_discrete']
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2,
                                                                    stratify=final['rating_discrete'])

## Models definition

In [32]:
models ={'random_forest' : ensemble.RandomForestClassifier(),
        'decision_tree' :tree.DecisionTreeClassifier(),
        'GaussianNB' : GaussianNB(),
        'quadratic_discriminant' : QuadraticDiscriminantAnalysis(store_covariance=True),
        'svm': svm.SVC()}

### Training function

In [41]:
def try_sample(train_data_inside, train_target_inside) -> None:
    # Define evaluation procedure (here we use Repeated Stratified K-Fold CV)
    cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1)
    # Evaluate model
    scoring = ['accuracy', 'precision_macro', 'recall_macro',]
    results=[]
    for elm in models.items():
        scores = cross_validate(elm[1], train_data_inside, train_target_inside, scoring=scoring, cv=cv, n_jobs=-1)
        # summarize performance
        elemento = {'model_name':elm[0],'mean_acc': np.mean(scores['test_accuracy']),'mean_prec': np.mean(scores['test_precision_macro']),'mean_recall': np.mean(scores['test_recall_macro'])}
        results.append(elemento)
    for elm in results:
        print(f'{elm}\n\n')

## Transformations

In [37]:
features = [
    'year',
    'title_length',
    'runtime',
    'rating_count',
    'tag_count'
]

scaler = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('minmax', MinMaxScaler(), features)
    ])

norm = Normalizer(norm='l2')
pipe = Pipeline(steps=[
    ('scaler', scaler)
    #,('norm', norm) #To review after checks
])

pipe.fit(train_data)
train_data_proc = pipe.transform(train_data)
test_data_proc = pipe.transform(test_data)

## Class weight balancing

### class_weights = 'balanced'

In [None]:
model_balanced = RandomForestClassifier(criterion='entropy', class_weight='balanced')
try_sample(model_balanced, train_data_proc, train_target)

### class_weights = 'balanced_subsample'

In [None]:
model_balanced_subsample = RandomForestClassifier(criterion='entropy', class_weight='balanced_subsample')
try_sample(model_balanced_subsample, train_data, train_target)

### class_weights = dict

In [None]:
class_weight = compute_class_weight('balanced', classes=np.unique(target), y=target)
class_weight_dict = dict(enumerate(class_weight))

model_dict = RandomForestClassifier(criterion='entropy', class_weight=class_weight_dict)
try_sample(model_dict, train_data, train_target)

### balancer MLP

In [None]:
def balancer(train_target_tmp: np.ndarray) -> utils.data.WeightedRandomSampler:
    counts = np.bincount(train_target_tmp)
    if counts.any(0):
        np.seterr(divide='ignore')
        labels_weights = 1. / counts
        labels_weights[np.isinf(labels_weights)] = 0
    else:
        np.seterr(divide=None)
        labels_weights = 1. / counts
    weights = torch.tensor(labels_weights[train_target_tmp], dtype=torch.float)
    return utils.data.WeightedRandomSampler(weights, len(weights), replacement=True)

## ImbalancedLearn Functions

## SMOTE

In [43]:
smt = SMOTE(k_neighbors=4)
train_data_smt_proc, train_target_smt_proc = smt.fit_resample(train_data_proc, train_target)
try_sample(train_data_smt_proc, train_target_smt_proc)

{'model_name': 'random_forest', 'mean_acc': 0.929472049689441, 'mean_prec': 0.9291139192950977, 'mean_recall': 0.9294720496894411}


{'model_name': 'decision_tree', 'mean_acc': 0.8504037267080746, 'mean_prec': 0.849383191268096, 'mean_recall': 0.8504037267080745}


{'model_name': 'GaussianNB', 'mean_acc': 0.7059316770186336, 'mean_prec': 0.6932556396701524, 'mean_recall': 0.7059316770186335}


{'model_name': 'quadratic_discriminant', 'mean_acc': 0.8106521739130435, 'mean_prec': 0.8824123760581525, 'mean_recall': 0.8106521739130435}


{'model_name': 'svm', 'mean_acc': 0.9350621118012422, 'mean_prec': 0.9345567817585538, 'mean_recall': 0.9350621118012422}




## SMOTETomek

In [42]:
smt_tom = SMOTETomek(smote=SMOTE(k_neighbors=4), tomek=TomekLinks(sampling_strategy='majority'))
train_data_smt_tom, train_target_smt_tom = smt_tom.fit_resample(train_data, train_target)
try_sample(train_data_smt_tom, train_target_smt_tom)

{'model_name': 'random_forest', 'mean_acc': 0.9231055900621119, 'mean_prec': 0.9226595657841709, 'mean_recall': 0.9231055900621118}


{'model_name': 'decision_tree', 'mean_acc': 0.8436335403726708, 'mean_prec': 0.8419859963589995, 'mean_recall': 0.8436335403726707}


{'model_name': 'GaussianNB', 'mean_acc': 0.4427329192546584, 'mean_prec': 0.44526048335660484, 'mean_recall': 0.4427329192546584}


{'model_name': 'quadratic_discriminant', 'mean_acc': 0.7087577639751552, 'mean_prec': 0.86666902976784, 'mean_recall': 0.7087577639751552}


{'model_name': 'svm', 'mean_acc': 0.2043167701863354, 'mean_prec': 0.13257023267521867, 'mean_recall': 0.20431677018633543}




## SMOTEENN

In [44]:
smt_enn = SMOTEENN(smote=SMOTE(k_neighbors=4), enn=EditedNearestNeighbours(n_neighbors=4))
train_data_smt_enn_proc, train_target_smt_enn_proc = smt_enn.fit_resample(train_data_proc, train_target)
try_sample(train_data_smt_enn_proc, train_target_smt_enn_proc)

{'model_name': 'random_forest', 'mean_acc': 0.9798646362098139, 'mean_prec': 0.9713143688225052, 'mean_recall': 0.8325466781342898}


{'model_name': 'decision_tree', 'mean_acc': 0.9362098138747885, 'mean_prec': 0.8262500781483462, 'mean_recall': 0.8241686004483887}


{'model_name': 'GaussianNB', 'mean_acc': 0.8315566835871404, 'mean_prec': 0.7137148685105816, 'mean_recall': 0.7390268698368387}


{'model_name': 'quadratic_discriminant', 'mean_acc': 0.8939932318104906, 'mean_prec': 0.8332115122554947, 'mean_recall': 0.812517521334533}


{'model_name': 'svm', 'mean_acc': 0.9818950930626058, 'mean_prec': 0.9702927664318479, 'mean_recall': 0.8700267609428209}




## RandomOverSampler

In [45]:
rnd_over = RandomOverSampler()
train_data_rnd_over_proc, train_target_rnd_over_proc = rnd_over.fit_resample(train_data_proc, train_target)
try_sample(train_data_rnd_over_proc, train_target_rnd_over_proc)

{'model_name': 'random_forest', 'mean_acc': 0.9304347826086956, 'mean_prec': 0.9301174654319397, 'mean_recall': 0.9304347826086956}


{'model_name': 'decision_tree', 'mean_acc': 0.8923291925465839, 'mean_prec': 0.8900562132272851, 'mean_recall': 0.8923291925465837}


{'model_name': 'GaussianNB', 'mean_acc': 0.6799378881987577, 'mean_prec': 0.6670656265046275, 'mean_recall': 0.6799378881987577}


{'model_name': 'quadratic_discriminant', 'mean_acc': 0.5304658385093168, 'mean_prec': 0.743573680288558, 'mean_recall': 0.5304658385093168}


{'model_name': 'svm', 'mean_acc': 0.9257453416149068, 'mean_prec': 0.925051327542471, 'mean_recall': 0.9257453416149068}




## BaggingClassifier

### Min Threshold

In [46]:
bins_count = train_target.value_counts()
for i in range(len(bins_count)):
    if bins_count[i] <= 500:
        bins_count[i] = 500

bin_sizes = bins_count.to_dict()

#### RandomForestClassifier SMOTE with threshold

In [47]:
smt_new = SMOTE(k_neighbors=4, sampling_strategy=bin_sizes)
train_data_smt_new, train_target_smt_new = smt_new.fit_resample(train_data_proc, train_target)
try_sample(train_data_smt_new, train_target_smt_new)

{'model_name': 'random_forest', 'mean_acc': 0.7828694817658349, 'mean_prec': 0.8650451742963301, 'mean_recall': 0.818016533614155}


{'model_name': 'decision_tree', 'mean_acc': 0.6624280230326296, 'mean_prec': 0.7226934054213032, 'mean_recall': 0.7241802768104111}


{'model_name': 'GaussianNB', 'mean_acc': 0.5228726807421625, 'mean_prec': 0.6017537753876085, 'mean_recall': 0.6591913106246556}


{'model_name': 'quadratic_discriminant', 'mean_acc': 0.4526551503518874, 'mean_prec': 0.5585288466013207, 'mean_recall': 0.5297962597672432}


{'model_name': 'svm', 'mean_acc': 0.8106206014075495, 'mean_prec': 0.8570399215210651, 'mean_recall': 0.8333812165283485}


