In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from rfoversample import RFOversampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder




import sys
sys.path.append("SupportFunctions")

from prepare_datasets import DatasetPreprocessor  

In [2]:
from sklearn.utils import resample

def introduce_imbalance(x, y, imbalance_ratio=0.2, random_state=42):
    """
    Downsample all non-majority classes to match the given imbalance ratio.

    Args:
        x (pd.DataFrame): Features.
        y (pd.Series): Labels.
        imbalance_ratio (float): Desired ratio of each minority class to the majority class.
        random_state (int): Random seed.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: The imbalanced x and y.
    """
    # Combine into one DataFrame
    df = pd.concat([x, y], axis=1)
    label_col = y.name

    # Find majority class and its count
    value_counts = y.value_counts()
    maj_label = value_counts.idxmax()
    maj_count = value_counts.max()

    frames = []

    for label, count in value_counts.items():
        class_df = df[df[label_col] == label]

        if label == maj_label:
            frames.append(class_df)  # Keep majority class as-is
        else:
            n_samples = max(1, int(imbalance_ratio * maj_count))
            sampled_df = resample(class_df, replace=True, n_samples=n_samples, random_state=random_state)
            frames.append(sampled_df)

    # Combine and shuffle
    result_df = pd.concat(frames).sample(frac=1, random_state=random_state).reset_index(drop=True)
    x_new = result_df.drop(columns=label_col)
    y_new = result_df[label_col]

    # print(f"Original class distribution: {value_counts.to_dict()}")
    # print(f"Imbalanced class distribution: {y_new.value_counts().to_dict()}")

    return x_new, y_new


In [3]:
def Compare_RF_F1scores(OG_x_train, OG_y_train, RF_x_train_upsampled, RF_y_train_upsampled, SM_x_train_upsampled, SM_y_train_upsampled, x_test, y_test):

    #train, fit, and predict original points
    original_RF = RandomForestClassifier(class_weight='balanced')
    original_RF.fit(OG_x_train, OG_y_train)
    y_pred_original = original_RF.predict(x_test)

    #train, fit, and predict points upsampled by Random Forest Upsampler
    RF_upsampled_RF = RandomForestClassifier()
    RF_upsampled_RF.fit(RF_x_train_upsampled, RF_y_train_upsampled)
    y_pred_RF_upsampled = RF_upsampled_RF.predict(x_test)

    #train, fit, and predict points upsampled by SMOTE
    SM_upsampled_RF = RandomForestClassifier()
    SM_upsampled_RF.fit(SM_x_train_upsampled, SM_y_train_upsampled)
    y_pred_SM_upsampled = SM_upsampled_RF.predict(x_test)

    #append f1 scores
    scores_OG = f1_score(y_test, y_pred_original, average='binary')
    scores_RF_upsampled = f1_score(y_test, y_pred_RF_upsampled, average='binary')
    scores_SM_upsampled = f1_score(y_test, y_pred_SM_upsampled, average='binary')

    return scores_OG, scores_RF_upsampled, scores_SM_upsampled

    

In [14]:
def run(n, data, dataset_name, target, ratio, categorical=False, encoded=False, cat_col=None):
    OG_scores = np.zeros(n)
    RF_upsampled_scores = np.zeros(n)
    SM_upsampled_scores = np.zeros(n)
    for i in range(n):
        preprocessor = DatasetPreprocessor(data, target_column=target)
        x_train, y_train, x_test, y_test = (preprocessor.x_train, preprocessor.y_train,
                                                    preprocessor.x_test, preprocessor.y_test)

        x_train_imbal, y_train_imbal = introduce_imbalance(x_train, y_train, imbalance_ratio=ratio)

        Oversampler = RFOversampler(x_train=x_train_imbal, y_train=y_train_imbal, contains_categoricals=categorical, encoded=encoded, cat_cols=cat_col)
        RF_upsampled_x_train, RF_upsampled_y_train = Oversampler.fit()

        smote = SMOTE(random_state=42)
        SM_upsampled_x_train, SM_upsampled_y_train = smote.fit_resample(x_train_imbal, y_train_imbal)
    
        OG_score, RF_upsampled_score, SM_upsampled_score = Compare_RF_F1scores( 
        x_train_imbal, 
        y_train_imbal, 
        RF_upsampled_x_train, 
        RF_upsampled_y_train, 
        SM_upsampled_x_train, 
        SM_upsampled_y_train, 
        x_test, 
        y_test
        )
        OG_scores[i] = OG_score
        RF_upsampled_scores[i] = RF_upsampled_score
        SM_upsampled_scores[i] = SM_upsampled_score
    
    print(f"No Upsampling avg f1 score: {np.mean(OG_scores)}")
    print(f"RF avg f1 score: {np.mean(RF_upsampled_scores)}")
    print(f"SMOTE avg f1 score {np.mean(SM_upsampled_scores)}")
    print(np.mean(RF_upsampled_scores) - np.mean(SM_upsampled_scores))
    # fig, ax = plt.subplots(1,3, figsize=(10,5), sharey=True)

    # ax[0].plot(range(1,n+1), OG_scores)
    # ax[0].set(title='Original Data')
    # ax[1].plot(range(1,n+1), RF_upsampled_scores)
    # ax[1].set(title='RF upsampled Data')
    # ax[2].plot(range(1,n+1), SM_upsampled_scores)
    # ax[2].set(title='SMOTE upsampled Data')
    # fig.suptitle(f'F1 Scores - {dataset_name} ({ratio} imbalance ratio)')

    

In [15]:
artificial_tree_data = pd.read_csv("./datasets/artificial_tree.csv")
glass_data = pd.read_csv("./datasets/glass.csv")
opt_digits_data = pd.read_csv("./datasets/optdigits.csv")
diabetes_data = pd.read_csv("./datasets/diabetes.csv")
hill_valley_data = pd.read_csv("./datasets/hill_valley.csv")
titanic_data = pd.read_csv("./datasets/titanic.csv")
flare1_data = pd.read_csv("./datasets/flare1.csv")
heart_failure_data = pd.read_csv('./datasets/heart_failure.csv')
hepatitis_data = pd.read_csv('./datasets/hepatitis.csv')
ionsphere_data = pd.read_csv('./datasets/ionosphere.csv')
le = LabelEncoder()
ionsphere_data['class'] = le.fit_transform(ionsphere_data['class'])
parkinsons_data = pd.read_csv('./datasets/parkinsons.csv')
seeds_data = pd.read_csv('./datasets/seeds.csv')
sonar_data = pd.read_csv('./datasets/sonar.csv')
le1 = LabelEncoder()
sonar_data['class'] = le1.fit_transform(sonar_data['class'])


ARTIFICIAL TREE TEST COMPARISON

In [31]:
run(1, artificial_tree_data, "Artificial Tree", "Class", 0.2, categorical=False, encoded=False, cat_col=None)

ValueError: Found input variables with inconsistent numbers of samples: [2593, 3537]

DIABETES EXAMPLE

In [30]:
run(20, diabetes_data, "diabetes", "Outcome", 0.1, categorical=False, encoded=False, cat_col=None)

No Upsampling avg f1 score: 0.08349556745086009
RF avg f1 score: 0.43356930897558615
SMOTE avg f1 score 0.41300113385352877
0.020568175122057386


In [8]:
colsss = [str(i) for i in range(2, 14)]
# run(20, hepatitis_data, "hepatitis", "class", 0.1, categorical=True, encoded=False, cat_col=colsss)

In [9]:
run(20, ionsphere_data, "ionosphere", "class", 0.1, categorical=True, encoded=False, cat_col=['V1'])

0.01955695336978802


In [10]:
run(20, heart_failure_data, "heart failure", "DEATH_EVENT", 0.1, categorical=True, encoded=False, cat_col=['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'])

-0.002796784175806033


HILL VALLEY EXAMPLE

In [11]:
run(20, hill_valley_data, "Hill Valley", "class", 0.1, categorical=False, encoded=False, cat_col=None)

0.022452152866913655


TITANIC EXAMPLE

In [28]:
encoded_titanic = pd.get_dummies(titanic_data, columns=['Pclass', 'Sex', 'Embarked'], dtype=int)
run(10, encoded_titanic, "Titanic", "Survived", 0.3, categorical=True, encoded=True, cat_col=['Pclass', 'Sex', 'Embarked'])

No Upsampling avg f1 score: 0.635473868803315
RF avg f1 score: 0.65859129513471
SMOTE avg f1 score 0.6532718730963325
0.0053194220383775415


In [26]:
run(10, titanic_data, "Titanic", "Survived", 0.3, categorical=True, encoded=False, cat_col=['Pclass', 'Sex', 'Embarked'])

No Upsampling avg f1 score: 0.6222338080206757
RF avg f1 score: 0.661025180016739
SMOTE avg f1 score 0.66568965679755
-0.004664476780810944


In [29]:
run(10, sonar_data, "sonar", "class", 0.3, categorical=False, encoded=False, cat_col=None)

No Upsampling avg f1 score: 0.5393998857156752
RF avg f1 score: 0.6623434572612111
SMOTE avg f1 score 0.6831210001317771
-0.02077754287056599
