# Setup

In [44]:
!pip install -q numpy pandas imbalanced-learn

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

# Original data

In [89]:
import numpy as np
import pandas as pd

# Build a synthetic dataset: four integer feature columns, each drawn right
# after reseeding the global NumPy RNG, plus an imbalanced three-class target.
seeds = [7, 42, 73, 101]
n_rows = 200

feature_columns = {}
for position, seed in enumerate(seeds, start=1):
    # Reseed before every column so each feature is reproducible on its own
    np.random.seed(seed)
    feature_columns[f"feature_{position}"] = np.random.randint(0, 10, n_rows)
data = pd.DataFrame(feature_columns)

# The target is drawn from the RNG state left behind by the last seed above;
# class 2 is the most likely (p=0.55) and class 0 the rarest (p=0.15).
data["target"] = np.random.choice([0, 1, 2], size=n_rows, p=[0.15, 0.30, 0.55])

# Separate the label from the predictors
y = data["target"]
X = data.drop(columns="target")

# The class distribution
class_counts = y.value_counts()
print(class_counts)

target
2    112
1     61
0     27
Name: count, dtype: int64


# Oversampling

In [93]:
from imblearn.over_sampling import RandomOverSampler

# Random over-sampler with a fixed seed so the resampling is reproducible
oversampler = RandomOverSampler(random_state=42)

# Fit on the full feature matrix / target and get the resampled pair back
oversampled = oversampler.fit_resample(X, y)
X_resampled, y_resampled = oversampled

# Class distribution after over-sampling
resampled_target = pd.Series(y_resampled)
resampled_class_counts = resampled_target.value_counts()
print(resampled_class_counts)

target
2    112
1    112
0    112
Name: count, dtype: int64


# Undersampling

In [94]:
from imblearn.under_sampling import RandomUnderSampler

# Random under-sampler with a fixed seed so the resampling is reproducible
undersampler = RandomUnderSampler(random_state=42)

# Fit on the full feature matrix / target and get the resampled pair back
undersampled = undersampler.fit_resample(X, y)
X_resampled, y_resampled = undersampled

# Class distribution after under-sampling
resampled_target = pd.Series(y_resampled)
resampled_class_counts = resampled_target.value_counts()
print(resampled_class_counts)

target
0    27
1    27
2    27
Name: count, dtype: int64


# SMOTE

In [98]:
from imblearn.over_sampling import SMOTE

# SMOTE sampler with a fixed seed so the resampling is reproducible
smote = SMOTE(random_state=42)

# Fit on the full feature matrix / target and get the resampled pair back
smote_resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = smote_resampled

# Class distribution after SMOTE
resampled_target = pd.Series(y_resampled)
resampled_class_counts = resampled_target.value_counts()
print(resampled_class_counts)

target
2    112
1    112
0    112
Name: count, dtype: int64


# Evaluation

In [118]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified on y — confirm that is intended
# given the class imbalance of the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [125]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier with its default hyperparameters
knn = KNeighborsClassifier()

# 5-fold cross-validation on the training split, scored with macro-averaged F1
scores = cross_val_score(knn, X_train, y_train, cv=5, scoring="f1_macro")

# Display the scores.
# Round on the array and convert with .tolist() so the list holds plain Python
# floats: a list of NumPy scalars prints as [np.float64(0.378), ...] on
# NumPy >= 2, whereas plain floats print as [0.378, ...] on every version.
print("Cross-Validation Scores:", scores.round(3).tolist())
print("Mean F1-score:", round(scores.mean(), 3))

Cross-Validation Scores: [0.378, 0.25, 0.214, 0.197, 0.444]
Mean F1-score: 0.296


# Ensemble