In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

# Synthetic binary-classification data with a deliberately skewed label mix
# (class weights 0.85 / 0.15). The fixed seed makes the draw repeatable.
X, y = make_classification(
    n_samples=500,
    n_features=10,
    n_classes=2,
    weights=[0.85, 0.15],  # majority / minority class proportions
    random_state=42,
)

# ---- Stratified Sampling ----
# Hold out 20% of the rows. Stratifying on y keeps the class ratio in the
# train and test parts close to that of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Per-class sample counts for the full data and for each side of the split.
split_labels = {
    "Class distribution in full dataset:": y,
    "Class distribution in training set:": y_train,
    "Class distribution in test set:": y_test,
}
for caption, labels in split_labels.items():
    print(caption, np.bincount(labels))

# ---- k-Fold Cross-Validation ----
print("\n--- Regular K-Fold Cross-Validation ---")

# Five shuffled folds with a fixed seed; KFold does not stratify by class.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=1000)

# One accuracy value per fold, computed on the training portion only.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=kf,
)
mean_score = scores.mean()

print("Accuracy scores:", scores)
print("Mean accuracy:", mean_score)

# ---- Stratified k-Fold Cross-Validation ----
print("\n--- Stratified K-Fold Cross-Validation ---")

# Same fold count, shuffling and seed as the plain K-fold run, but using the
# stratified splitter so the two sets of scores can be compared.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reuses the `model` estimator defined for the plain K-fold run.
strat_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=skf,
)
strat_mean_score = strat_scores.mean()

print("Accuracy scores:", strat_scores)
print("Mean accuracy:", strat_mean_score)

Class distribution in full dataset: [423  77]
Class distribution in training set: [338  62]
Class distribution in test set: [85 15]

--- Regular K-Fold Cross-Validation ---
Accuracy scores: [0.925  0.8875 0.9    0.9375 0.975 ]
Mean accuracy: 0.925

--- Stratified K-Fold Cross-Validation ---
Accuracy scores: [0.9125 0.9375 0.95   0.9125 0.9125]
Mean accuracy: 0.925
