# Imbalance evaluation

## Imports

In [1]:
import os

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import compute_class_weight

from src.utils.const import DATA_DIR, SEED
from src.utils.util_models import fix_random

### Useful path to data

In [2]:
ROOT_DIR = os.path.join(os.getcwd(), '..')
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

### Repeatability

In [3]:
fix_random(SEED)

## Start to work

In [4]:
final = pd.read_parquet(os.path.join(PROCESSED_DIR, 'final.parquet'))

### Add rating_discrete feature

In [5]:
bins = 10
final = (final
         .assign(rating_discrete=pd.cut(final.loc[:, 'rating_mean'], bins=bins, labels=False))
         .astype({'rating_discrete': 'int32'})
         .drop(columns=['rating_mean']))
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13147 entries, 0 to 13146
Columns: 1153 entries, year to rating_discrete
dtypes: float32(1130), float64(1), int32(22)
memory usage: 58.0 MB


### Separate train/test

In [6]:
data = final.loc[:, final.columns != 'rating_discrete']
target = final['rating_discrete']
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2,
                                                                    stratify=final['rating_discrete'])

### Training function

In [7]:
def try_sample(model, train_data_inside, train_target_inside) -> None:
    # Define evaluation procedure (here we use Repeated Stratified K-Fold CV)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    # Evaluate model
    scoring = ['accuracy', 'precision_macro', 'recall_macro']
    scores = cross_validate(model, train_data_inside, train_target_inside, scoring=scoring, cv=cv, n_jobs=-1)
    # summarize performance
    print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
    print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
    print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))

### class_weights = 'balanced'

In [8]:
model_balanced = RandomForestClassifier(criterion='entropy', class_weight='balanced')
try_sample(model_balanced, train_data, train_target)

Mean Accuracy: 0.7546
Mean Precision: 0.6551
Mean Recall: 0.5271


### class_weights = 'balanced_subsample'

In [9]:
model_balanced_subsample = RandomForestClassifier(criterion='entropy', class_weight='balanced_subsample')
try_sample(model_balanced_subsample, train_data, train_target)

Mean Accuracy: 0.7565
Mean Precision: 0.6888
Mean Recall: 0.5504


### class_weights = dict

In [10]:
class_weight = compute_class_weight('balanced', classes=np.unique(target), y=target)
class_weight_dict = dict(enumerate(class_weight))

model_dict = RandomForestClassifier(criterion='entropy', class_weight=class_weight_dict)
try_sample(model_dict, train_data, train_target)

Mean Accuracy: 0.7547
Mean Precision: 0.6517
Mean Recall: 0.5211


### Transformations

In [7]:
features = [
    'year',
    'title_length',
    'runtime',
    'rating_count',
    'tag_count'
]

scaler = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('minmax', MinMaxScaler(), features)
    ])

norm = Normalizer(norm='l2')
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('norm', norm)
])

pipe.fit(train_data)
train_data_proc = pipe.transform(train_data)
test_data_proc = pipe.transform(test_data)

### SMOTE

In [12]:
smt = SMOTE(k_neighbors=4)
train_data_smt, train_target_smt = smt.fit_resample(train_data, train_target)
model_smt = RandomForestClassifier(criterion='entropy')
try_sample(model_smt, train_data_smt, train_target_smt)

Mean Accuracy: 0.9374
Mean Precision: 0.9368
Mean Recall: 0.9374


In [13]:
train_data_smt_proc, train_target_smt_proc = smt.fit_resample(train_data_proc, train_target)
model_smt_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_proc, train_data_smt_proc, train_target_smt_proc)

Mean Accuracy: 0.9386
Mean Precision: 0.9383
Mean Recall: 0.9386


### SMOTETomek

In [18]:
smt_tom = SMOTETomek(smote=SMOTE(k_neighbors=4), tomek=TomekLinks(sampling_strategy='majority'))
train_data_smt_tom, train_target_smt_tom = smt_tom.fit_resample(train_data, train_target)
model_smt_tom = DecisionTreeClassifier(criterion='entropy', max_depth=15)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)

losses = []
scores = []

for train_idx, val_idx in cv.split(train_data_smt_tom.to_numpy(), y=train_target_smt_tom.to_numpy()):
    X_train, X_val = train_data_smt_tom[train_idx, :], train_data_smt_tom[val_idx, :]
    y_train, y_val = train_target[train_idx], train_target[val_idx]

    model_smt_tom.fit(X_train, y_train)

    predicted_target = model_smt_tom.predict(X_val)
    loss = zero_one_loss(y_val, predicted_target)
    score = model_smt_tom.score(X_val, y_val)

    losses.append(loss)
    scores.append(score)

    print(f'Loss: {loss:.3f}')
    print(f'Accuracy: {score:.3f}')

print(f'Mean losses: {np.mean(losses)}')
print(f'Mean scores: {np.mean(scores)}')

predicted_target = model_smt_tom.predict(test_data_proc)
loss = zero_one_loss(test_target, predicted_target)
score = model_smt_tom.score(test_data_proc, test_target)
print(f'Loss TEST: {loss:.3f}')
print(f'Accuracy TEST: {score:.3f}')

[[2.00300000e+03 3.60000000e+01 0.00000000e+00 ... 1.70000009e-02
  7.37499967e-02 1.89999994e-02]
 [1.99300000e+03 1.70000000e+01 1.00000000e+00 ... 5.12499996e-02
  1.27250001e-01 1.89999994e-02]
 [2.01600000e+03 3.50000000e+01 1.00000000e+00 ... 1.02500003e-02
  1.36999995e-01 1.89999994e-02]
 ...
 [1.99781982e+03 2.80000000e+01 0.00000000e+00 ... 2.46404380e-01
  9.90540758e-02 2.61553973e-02]
 [2.00494226e+03 5.00000000e+01 0.00000000e+00 ... 3.05222511e-01
  7.68955573e-02 2.03819443e-02]
 [1.96693555e+03 2.30000000e+01 0.00000000e+00 ... 5.01719862e-02
  7.62721226e-02 2.08621714e-02]]


InvalidIndexError: (array([    1,     3,     4, ..., 32197, 32198, 32199]), slice(None, None, None))

In [15]:
train_data_smt_tom_proc, train_target_smt_tom_proc = smt_tom.fit_resample(train_data_proc, train_target)
model_smt_tom_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_tom_proc, train_data_smt_tom_proc, train_target_smt_tom_proc)

Mean Accuracy: 0.9397
Mean Precision: 0.9386
Mean Recall: 0.9389


### SMOTEENN

In [None]:
smt_enn = SMOTEENN(smote=SMOTE(k_neighbors=4), enn=EditedNearestNeighbours(n_neighbors=4))
train_data_smt_enn, train_target_smt_enn = smt_enn.fit_resample(train_data, train_target)
model_smt_enn = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_enn, train_data_smt_enn, train_target_smt_enn)

In [None]:
smt_enn = SMOTEENN(smote=SMOTE(k_neighbors=4), enn=EditedNearestNeighbours(n_neighbors=4))
train_data_smt_enn_proc, train_target_smt_enn_proc = smt_enn.fit_resample(train_data_proc, train_target)
model_smt_enn_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_enn_proc, train_data_smt_enn_proc, train_target_smt_enn_proc)

### RandomOverSampler

In [None]:
rnd_over = RandomOverSampler()
train_data_rnd_over, train_target_rnd_over = rnd_over.fit_resample(train_data, train_target)
model_rnd_over = RandomForestClassifier(criterion='entropy')
try_sample(model_rnd_over, train_data_rnd_over, train_target_rnd_over)

In [None]:
rnd_over = RandomOverSampler()
train_data_rnd_over_proc, train_target_rnd_over_proc = rnd_over.fit_resample(train_data_proc, train_target)
model_rnd_over_proc = RandomForestClassifier(criterion='entropy')
try_sample(model_rnd_over_proc, train_data_rnd_over_proc, train_target_rnd_over_proc)

### SMOTE with threshold

In [11]:
bins_count = train_target.value_counts()
for i in range(len(bins_count)):
    if bins_count[i] <= 500:
        bins_count[i] = 500

bin_sizes = bins_count.to_dict()

smt_new = SMOTE(k_neighbors=4, sampling_strategy=bin_sizes)
train_data_smt_new, train_target_smt_new = smt_new.fit_resample(train_data_proc, train_target)
model_smt_new = RandomForestClassifier(criterion='entropy')
try_sample(model_smt_new, train_data_smt_new, train_target_smt_new)

Mean Accuracy: 0.8006
Mean Precision: 0.8777
Mean Recall: 0.8378


In [12]:
smt_new = SMOTE(k_neighbors=4, sampling_strategy=bin_sizes)
train_data_smt_new_proc, train_target_smt_new_proc = smt_new.fit_resample(train_data_proc, train_target)
model_smt_new_bagging = BaggingClassifier(base_estimator=RandomForestClassifier(criterion='entropy'))
try_sample(model_smt_new_bagging, train_data_smt_new_proc, train_target_smt_new_proc)

Mean Accuracy: 0.7997
Mean Precision: 0.8808
Mean Recall: 0.8329
