 # Notebook 2: Handling Imbalance with SMOTE and BalancedRandomForest + HPO

## 1. Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier

## 2. Load Data

In [None]:
# Load both splits with identical parsing options. keep_default_na=False
# together with an empty na_values list turns off pandas' NA-string detection,
# so values are kept exactly as written in the CSV.
train, test = (
    pd.read_csv(csv_name, keep_default_na=False, na_values=[])
    for csv_name in ("imbtrain.csv", "imbtest.csv")
)

In [None]:
train.head(5)

In [None]:
train['Churn'].value_counts()

In [None]:
train['Churn'].value_counts(normalize=True)

In [None]:
test['Churn'].value_counts()

In [None]:
test['Churn'].value_counts(normalize=True)

## 3. Preprocessing

In [None]:
def preprocess(df):
    """Placeholder preprocessing step: currently hands ``df`` back untouched.

    The very same object that was passed in is returned (no copy is made).
    """
    # TODO: copy our preprocess function from part 1 and augment it any way you like
    processed = df
    return processed

In [None]:
# Run the same preprocessing over the train split first, then the test split.
post_train, post_test = (preprocess(split) for split in (train, test))

In [None]:
post_train.head(5)

In [None]:
# Separate the 'Churn' target column from the remaining feature columns,
# first for the training split and then for the test split.
X_train, y_train = post_train.drop(columns='Churn'), post_train['Churn']
X_test, y_test = post_test.drop(columns='Churn'), post_test['Churn']

In [None]:
X_train.head(5)

In [None]:
y_train.head(5)

#### Helper function for evaluation

In [None]:
from utils import evaluate_model

## 4.a. Random Forest

In [None]:
# TODO: Fit a Random Forest classifier over the imbalanced dataset and evaluate it
# clf = ?
# Baseline cell: the model here is trained on the original (imbalanced) data,
# so the name passed to evaluate_model must not mention oversampling.
evaluate_model(clf, X_test, y_test, "Random Forest")

## 4.b. Minority Class Oversampling + Random Forest

In [None]:
from collections import Counter

from imblearn.over_sampling import RandomOverSampler

# TODO: Oversample the minority class
# The placeholder below is kept commented out so that the cell still parses
# (and its imports still run); uncomment it and replace the `?` with your code.
# X_mcos, y_mcos = ?

# Show the class counts after resampling (requires y_mcos from the TODO above).
print("After Random Oversampling:", Counter(y_mcos))

# TODO: fit RandomForest over the new dataset and evaluate it

In [1]:
# TODO: Use sampling_strategy=0.3 to more gently oversample the minority class 

## 4.c. Majority Class Undersampling + Random Forest

In [None]:
# Counter is imported here as well so this cell does not depend on the
# oversampling cell having been executed first.
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
# TODO: Undersample the majority class
# X_mcus, y_mcus = ?
# Show the class counts after resampling (requires y_mcus from the TODO above).
print("After Random Undersampling:", Counter(y_mcus))
# TODO: fit RandomForest over the new dataset and evaluate it

In [None]:
# TODO: Use sampling_strategy=0.3 to more gently undersample the majority class

## 4.d. Majority + Minority Class Under/oversampling + Random Forest

In [None]:
# Counter is imported here as well so this cell can run on its own.
from collections import Counter

# TODO: Oversample and then undersample
# The placeholders below are kept commented out so that the cell still parses;
# uncomment them and replace each `?` with your code.
# X_temp, y_temp = ?
# X_combo, y_combo = ?

# Show the class counts after both resampling steps (requires y_combo).
print("After Combined Sampling:", Counter(y_combo))
# TODO: fit RandomForest over the new dataset and evaluate it

## 4.e. SMOTE + Regular Random Forest

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE oversampler; random_state=42 pins the seed so resampling is repeatable.
smote = SMOTE(random_state=42)
# TODO: Use SMOTE to get new X and y from training, with more minority-class cases!
# documentation: https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
# X_sm, y_sm = ?
# TODO: Fit a random forest classifier over the augmented training data
# clf_smote = ?
# Evaluate on the untouched test split (requires clf_smote from the TODO above).
evaluate_model(clf_smote, X_test, y_test, "SMOTE + Random Forest")

### 4.5. SMOTE + Regular Random Forest w/ HPO

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# TODO: Define a dict with hyperparameters to optimize/tune: Use n_estimators and max_depth.
# What is a good range for them? Consult the internet.
# rf_params = ?
# Use the GridSearchCV class to automatically fit and evaluate many RandomForest classifiers
# with different values for these hyperparameters, and also automatically find the best performing one.
# Best performing how? Notice the `scoring` argument!
# gs_rf = ?
# NOTE: gs_rf must be defined and fitted before this line; best_estimator_ only
# exists after gs_rf.fit(...) has run (with GridSearchCV's default refit=True).
evaluate_model(gs_rf.best_estimator_, X_test, y_test, "SMOTE + Random Forest (HPO)")

## 5. Balanced Random Forest (no SMOTE)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
# TODO: Find a BALANCED version of the random forest classifier in the `imblearn` python package.
# Import it. Create a classifier. Fit it on the REGULAR imbalanced dataset (NOT the one you augmented with SMOTE).
# Then, evaluate it.
# from imblearn.? import ?
# clf_balrf = ?
# Evaluate on the test split (requires clf_balrf from the TODO above).
evaluate_model(clf_balrf, X_test, y_test, "Balanced Random Forest Only")

### 5.5. Balanced Random Forest (no SMOTE) w/ HPO

In [None]:
# TODO: Use GridSearchCV to find the best hyperparameters for the balanced random forest.
# Pick 2 or 3 hyperparameters. You can use the same ones, or explore the class definition to choose others.
# brf_params = ?
# gs_brf = ?
# NOTE: gs_brf must be defined and fitted before this line; best_estimator_ only
# exists after gs_brf.fit(...) has run (with GridSearchCV's default refit=True).
evaluate_model(gs_brf.best_estimator_, X_test, y_test, "Balanced Random Forest Only (HPO)")

## 6. SMOTE + Balanced Random Forest

In [None]:
# TODO: Create a new balanced random forest classifier,
# and fit it on the SMOTE-augmented data. Then evaluate it.
# clf_comb = ?
# Evaluate on the test split (requires clf_comb from the TODO above).
evaluate_model(clf_comb, X_test, y_test, "SMOTE + Balanced Random Forest")