In [4]:
!pip install imbalanced-learn



In [5]:
# Handling Imbalanced Dataset using SMOTE

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Artificially create imbalance
X = X[y == 0]
y = y[y == 0]

X_majority = data.data[data.target == 1][:50]
y_majority = data.target[data.target == 1][:50]

X = np.vstack((X, X_majority))
y = np.hstack((y, y_majority))

print("Before SMOTE:", Counter(y))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("After SMOTE:", Counter(y_train_resampled))

# Train model
model = LogisticRegression(max_iter=500)
model.fit(X_train_resampled, y_train_resampled)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Before SMOTE: Counter({np.int64(0): 212, np.int64(1): 50})
After SMOTE: Counter({np.int64(0): 170, np.int64(1): 170})

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.85      1.00      0.92        11

    accuracy                           0.96        53
   macro avg       0.92      0.98      0.95        53
weighted avg       0.97      0.96      0.96        53



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
