# Imbalanced Classification - Undersampling and Oversampling

## 0. Introduction

This notebook contains:
  1. Random Oversampling
  2. Random Undersampling
  3. Combined random oversampling and undersampling


## 1. Random Oversampling

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

In [6]:
# Synthetic binary problem: 10,000 samples with roughly 99% of them in the
# majority class and no label noise (flip_y=0). The fixed random_state keeps
# the generated dataset identical across runs.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=42)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
# Both the tree and the sampler draw random numbers; seed them so the reported
# scores can be reproduced after Restart Kernel -> Run All.
model = DecisionTreeClassifier(random_state=42)
over = RandomOverSampler(random_state=42)
# The imblearn Pipeline applies the sampler during fit only, so each held-out
# fold is scored on its original (un-resampled) class distribution.
pipeline = Pipeline(steps=[
    ('over', over),
    ('m', model)
])
# NOTE(review): for single-label classification, micro-averaged F1 is equal to
# accuracy, so this metric is dominated by the majority class. Consider
# scoring='f1' or 'f1_macro' if minority-class performance is the goal.
n_scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')
print(f"F1 Score: {np.mean(n_scores):.2f} ({np.std(n_scores):.2f})")

F1 Score: 0.99 (0.00)


## 2. Random Undersampling

In [3]:
from imblearn.under_sampling import RandomUnderSampler

In [7]:
# Synthetic binary problem: 10,000 samples with roughly 99% of them in the
# majority class and no label noise (flip_y=0). The fixed random_state keeps
# the generated dataset identical across runs.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=42)
# The sampler chooses which majority-class rows to drop at random, and the tree
# is stochastic as well; seed both so the reported scores are reproducible.
under = RandomUnderSampler(random_state=42)
model = DecisionTreeClassifier(random_state=42)
# The imblearn Pipeline applies the sampler during fit only, so each held-out
# fold is scored on its original (un-resampled) class distribution.
pipeline = Pipeline(steps=[
    ('u', under),
    ('m', model)
])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
# NOTE(review): for single-label classification, micro-averaged F1 is equal to
# accuracy, so this metric is dominated by the majority class. Consider
# scoring='f1' or 'f1_macro' if minority-class performance is the goal.
n_scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')
print(f"F1 Score: {np.mean(n_scores):.2f} ({np.std(n_scores):.2f})")

F1 Score: 0.94 (0.02)


## 3. Combining oversampling and undersampling

In [8]:
# Synthetic binary problem: 10,000 samples with roughly 99% of them in the
# majority class and no label noise (flip_y=0). The fixed random_state keeps
# the generated dataset identical across runs.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=42)
# A float sampling_strategy is the target minority/majority ratio after
# resampling: the pipeline first oversamples the minority class up to a 0.1
# ratio, then undersamples the majority class down to a 0.5 ratio.
# Both samplers and the tree are seeded so the reported scores are reproducible.
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
over = RandomOverSampler(sampling_strategy=0.1, random_state=42)
model = DecisionTreeClassifier(random_state=42)
# Step order matters: oversample first, undersample second, then fit the model.
# The imblearn Pipeline applies the samplers during fit only, so each held-out
# fold is scored on its original (un-resampled) class distribution.
pipeline = Pipeline(steps=[
    ('o', over),
    ('u', under),
    ('m', model)
])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
# NOTE(review): for single-label classification, micro-averaged F1 is equal to
# accuracy, so this metric is dominated by the majority class. Consider
# scoring='f1' or 'f1_macro' if minority-class performance is the goal.
n_scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')
print(f"F1 Score: {np.mean(n_scores):.2f} ({np.std(n_scores):.2f})")

F1 Score: 0.98 (0.01)
