In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import seaborn as sns
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import EasyEnsembleClassifier



In [4]:
class GenericMethodHandler:
    def __init__(self, dataset, target_column, test_size = 0.3, imbalance_ratio = 0.2, random_state=42):
        
        self.dataset = dataset
        self.target_column = target_column
        self.test_size = test_size
        self.imbalance_ratio = imbalance_ratio
        self.random_state = random_state
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self._prepare_data()
    
    def _prepare_data(self):

        # Split into features (X) and target (y)
        x = self.dataset.drop(columns=[self.target_column])
        y = self.dataset[self.target_column]

        # Fill missing values for numerical columns
        for col in x.select_dtypes(include=["float", "int"]).columns:
            x[col] = x[col].fillna(x[col].median())

        # Fill missing values for categorical columns
        for col in x.select_dtypes(include=["object", "category"]).columns:
            x[col] = x[col].fillna(x[col].mode()[0])

        # Encode categorical variables using one-hot encoding
        x = pd.get_dummies(x, drop_first=True)

        # Split into training and test sets
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=self.test_size, random_state = self.random_state
        )

        # Introduce class imbalance in the training set
        self.x_train, self.y_train = self._introduce_imbalance(x_train, y_train)
        self.x_test, self.y_test = x_test, y_test

    def _introduce_imbalance(self, x_train, y_train):

        train_df = pd.concat([x_train, y_train], axis=1)
        majority_class = y_train.value_counts().idxmax()  # Identify the majority class
        majority_samples = train_df[train_df[self.target_column] == majority_class]
        minority_samples = train_df[train_df[self.target_column] != majority_class]

        # Reduce the majority class
        reduced_majority = majority_samples.sample(
            frac=self.imbalance_ratio, random_state = self.random_state
        )

        # Combine reduced majority and minority samples
        imbalanced_train_df = pd.concat([reduced_majority, minority_samples])
        return imbalanced_train_df.drop(columns=[self.target_column]), imbalanced_train_df[self.target_column]

    def apply_smote(self):

        smote = SMOTE(random_state = self.random_state)
        self.x_train, self.y_train = smote.fit_resample(self.x_train, self.y_train)

    def apply_adasyn(self):

        adasyn = ADASYN(random_state = self.random_state)
        self.x_train, self.y_train = adasyn.fit_resample(self.x_train, self.y_train)
    
    def apply_random_undersampling(self):

        rus = RandomUnderSampler(random_state = self.random_state)
        self.x_train, self.y_train = rus.fit_resample(self.x_train, self.y_train)

    def train_and_evaluate_generalized(self, method, max_depth = None, n_estimators = 10):

        if method == "none":
            model = RandomForestClassifier(max_depth=max_depth, random_state = self.random_state)
            model.fit(self.x_train, self.y_train)
            predictions = model.predict(self.x_test)
            print("\n--- No Method Results ---")
        
        elif method == "smote":
            self.apply_smote()
            model = RandomForestClassifier(max_depth=max_depth, random_state = self.random_state)
            model.fit(self.x_train, self.y_train)
            predictions = model.predict(self.x_test)
            print("\n--- Smote Method Results ---")
        
        elif method == "class_weights":
            model = RandomForestClassifier(max_depth=max_depth, class_weight="balanced", random_state = self.random_state)
            model.fit(self.x_train, self.y_train)
            predictions = model.predict(self.x_test)
            print("\n--- Class Weights Method Results ---")

        elif method == "adasyn":
            self.apply_adasyn()
            model = RandomForestClassifier(max_depth=max_depth, random_state = self.random_state)
            model.fit(self.x_train, self.y_train)
            predictions = model.predict(self.x_test)
            print("\n--- Adasyn Method Results ---")
        
        elif method == "random_undersampling":
            self.apply_random_undersampling()
            model = RandomForestClassifier(max_depth=max_depth, random_state = self.random_state)
            model.fit(self.x_train, self.y_train)
            predictions = model.predict(self.x_test)
            print("\n--- Random Undersampling Method Results ---")

        elif method == "easy_ensemble":
            model = EasyEnsembleClassifier(n_estimators=n_estimators, random_state = self.random_state)
            model.fit(self.x_train, self.y_train)
            predictions = model.predict(self.x_test)
            print("\n--- Easy Ensemble Method Results ---")
        
        else:
            raise ValueError("Invalid Method")

        print("Accuracy: ", accuracy_score(self.y_test, predictions))
        print("Classification Report: \n", classification_report(self.y_test, predictions))
        print("Confusion Matrix: \n", confusion_matrix(self.y_test, predictions))

In [5]:
# Load Datasets
titanic_data = pd.read_csv(r"C:\Users\potat\OneDrive\Desktop\titanic.csv")
park_data = pd.read_csv(r"C:\Users\potat\OneDrive\Desktop\parkinsons.csv")

In [6]:
handler = GenericMethodHandler(
    dataset = titanic_data,
    target_column="Survived",
    test_size=0.3,
    imbalance_ratio=0.2
)

In [7]:
handler.train_and_evaluate_generalized(method="none")
handler.train_and_evaluate_generalized(method="smote")
handler.train_and_evaluate_generalized(method="class_weights")
handler.train_and_evaluate_generalized(method="adasyn")
handler.train_and_evaluate_generalized(method="random_undersampling")
handler.train_and_evaluate_generalized(method="easy_ensemble", n_estimators=20)


--- No Method Results ---
Accuracy:  0.6869158878504673
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.57      0.68       122
           1       0.60      0.84      0.70        92

    accuracy                           0.69       214
   macro avg       0.71      0.71      0.69       214
weighted avg       0.73      0.69      0.69       214

Confusion Matrix: 
 [[70 52]
 [15 77]]

--- Smote Method Results ---
Accuracy:  0.7523364485981309
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.70      0.76       122
           1       0.67      0.83      0.74        92

    accuracy                           0.75       214
   macro avg       0.76      0.76      0.75       214
weighted avg       0.77      0.75      0.75       214

Confusion Matrix: 
 [[85 37]
 [16 76]]

--- Class Weights Method Results ---
Accuracy:  0.7523364485981309
Classification Report: 
         