In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
import joblib
import random
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, KMeansSMOTE, SVMSMOTE, BorderlineSMOTE
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import xgboost
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import matthews_corrcoef, average_precision_score, roc_auc_score
warnings.filterwarnings('ignore')

In [2]:
# Restore the pre-split training and testing partitions; each pickle holds a
# (features, labels) pair. joblib.load unpickles its input, so these files
# should only ever come from a trusted source.
(X_train, y_train), (X_test, y_test) = (
    joblib.load(pickle_path) for pickle_path in ('train_data.pkl', 'test_data.pkl')
)

In [3]:
def apply_sampling(X_train, y_train, method, strategy):
    """Oversample the training set with the requested resampling method.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels. The 'IWGMM' branch treats label 1 as the minority
        (positive) class and label 0 as the majority (negative) class.
    method : str
        One of 'ROS', 'SMOTE', 'ADASYN', 'KMeansSMOTE', 'SVMSMOTE',
        'BorderlineSMOTE', 'SMOTEENN' or 'IWGMM'.
    strategy : float
        Forwarded as ``sampling_strategy`` to the imbalanced-learn samplers.
        For 'IWGMM' the target number of positive samples is
        ``int(n_negative * strategy)``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``, shuffled with a fixed seed of 42.

    Raises
    ------
    ValueError
        If ``method`` is not supported, or if 'IWGMM' is asked for a strategy
        that does not require any new positive samples. Errors raised by the
        underlying samplers propagate unchanged.
    """
    seed = 42

    if method == 'IWGMM':
        # Inversely Weighted Gaussian Mixture Model: fit a mixture on the
        # positive class, invert the component weights so that sparse
        # components are sampled more often, then draw synthetic positives.
        n_components = 6

        # Work on plain arrays so boolean masks index rows positionally.
        features = np.asarray(X_train)
        labels = np.asarray(y_train)
        positive_mask = labels == 1
        negative_mask = labels == 0

        n_positive = int(positive_mask.sum())
        n_negative = int(negative_mask.sum())
        target_positive = int(n_negative * strategy)
        n_new = target_positive - n_positive

        # Fail early with a clear message instead of fitting the mixture and
        # then asking it for zero samples (GaussianMixture.sample rejects
        # n_samples < 1). ValueError keeps the exception type callers expect.
        if n_new <= 0:
            raise ValueError(
                f"IWGMM cannot oversample: strategy {strategy} targets "
                f"{target_positive} positive samples but {n_positive} are already present"
            )

        gmm = GaussianMixture(n_components=n_components, random_state=seed)
        gmm.fit(features[positive_mask])

        # Replace the fitted mixing weights with their normalised inverses.
        inverse_weights = 1 / gmm.weights_
        inverse_weights /= np.sum(inverse_weights)
        gmm.weights_ = inverse_weights

        # Reseeding the global NumPy RNG is kept from the original code for
        # backward compatibility; the mixture itself is seeded via random_state.
        np.random.seed(seed)
        synthetic_positive = gmm.sample(n_new)[0]

        X_positive = np.vstack((features[positive_mask], synthetic_positive))
        # Keep the label dtype of y_train rather than promoting it to float64.
        y_positive = np.ones(X_positive.shape[0], dtype=labels.dtype)
        X_resampled = np.vstack((features[negative_mask], X_positive))
        y_resampled = np.hstack((labels[negative_mask], y_positive))

        X_resampled, y_resampled = shuffle(X_resampled, y_resampled, random_state=seed)
        return X_resampled, y_resampled

    # Factories rather than instances: only the requested sampler is built.
    sampler_factories = {
        'ROS': lambda: RandomOverSampler(sampling_strategy=strategy, random_state=seed),
        'SMOTE': lambda: SMOTE(sampling_strategy=strategy, random_state=seed),
        'ADASYN': lambda: ADASYN(sampling_strategy=strategy, random_state=seed),
        'KMeansSMOTE': lambda: KMeansSMOTE(sampling_strategy=strategy, random_state=seed, cluster_balance_threshold=-0.5),
        'SVMSMOTE': lambda: SVMSMOTE(sampling_strategy=strategy, random_state=seed),
        'BorderlineSMOTE': lambda: BorderlineSMOTE(sampling_strategy=strategy, random_state=seed),
        'SMOTEENN': lambda: SMOTEENN(sampling_strategy=strategy, random_state=seed),
    }

    if method not in sampler_factories:
        raise ValueError("Unsupported sampling method")

    sampler = sampler_factories[method]()
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    X_resampled, y_resampled = shuffle(X_resampled, y_resampled, random_state=seed)
    return X_resampled, y_resampled

def evaluate_model_with_sampling(X_train, y_train, X_test, y_test, model_name, sampling_method):
    """Evaluate one classifier under one resampling method across a ratio grid.

    For every sampling strategy in the grid the training data is resampled
    with ``apply_sampling``, the classifier is refitted, and it is scored on
    the untouched test set.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels passed to ``apply_sampling``.
    X_test, y_test : array-like
        Held-out features and labels used for scoring.
    model_name : str
        One of 'AdaBoost', 'RF', 'GBDT', 'XGBoost', 'CatBoost', 'LightGBM'.
    sampling_method : str
        Resampling method name understood by ``apply_sampling``.

    Returns
    -------
    pandas.DataFrame
        One row per strategy that completed without a ValueError, with the
        columns 'resampling', 'mmr', 'eml', 'gmean', 'mcc', 'ap' and 'auc'.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the supported classifiers.
    """
    # Factories rather than instances: only the requested estimator is built.
    model_factories = {
        'AdaBoost': lambda: AdaBoostClassifier(random_state=1),
        'RF': lambda: RandomForestClassifier(random_state=1),
        'GBDT': lambda: GradientBoostingClassifier(random_state=1),
        'XGBoost': lambda: xgboost.XGBClassifier(random_state=1),
        'CatBoost': lambda: CatBoostClassifier(random_state=1, verbose=0),
        'LightGBM': lambda: LGBMClassifier(random_state=1),
    }

    clf = model_factories[model_name]()
    results = []

    # Minority-to-majority ratios 0.05, 0.06, ..., 1.00 (96 values).
    # linspace + round is used instead of arange with a float step so that
    # every value is a clean two-decimal number and the endpoint is exactly
    # 1.00; a float-step arange carries rounding noise into the grid and
    # does not guarantee the endpoint.
    strategy_list = np.round(np.linspace(0.05, 1.00, 96), 2)

    for strategy in strategy_list:
        try:
            X_resampled, y_resampled = apply_sampling(X_train, y_train, sampling_method, strategy)

            clf.fit(X_resampled, y_resampled)
            p_predict = clf.predict(X_test)
            p_predict_proba = clf.predict_proba(X_test)

            result = {
                'resampling': sampling_method,
                'mmr': strategy,
                'eml': model_name,
                'gmean': geometric_mean_score(y_test, p_predict),
                'mcc': matthews_corrcoef(y_test, p_predict),
                # Threshold-free metrics use the probability of the positive class.
                'ap': average_precision_score(y_test, p_predict_proba[:, 1]),
                'auc': roc_auc_score(y_test, p_predict_proba[:, 1]),
            }
            results.append(result)

        except ValueError as e:
            # Best effort: a strategy that cannot be evaluated (for example a
            # ratio the sampler rejects) is reported and skipped.
            print(f"Unable to perform resampling with strategy {strategy}: {e}")
            continue

    return pd.DataFrame(results)

In [4]:
# Benchmark every (resampling technique, ensemble learner) pairing.
sampling_methods = [
    'ROS',
    'SMOTE',
    'ADASYN',
    'KMeansSMOTE',
    'SVMSMOTE',
    'BorderlineSMOTE',
    'SMOTEENN',
    'IWGMM',
]
models = [
    'AdaBoost',
    'RF',
    'GBDT',
    'XGBoost',
    'CatBoost',
    'LightGBM',
]

# One DataFrame of scores is collected per pairing.
all_results = []

for sampling_method in sampling_methods:
    for model in models:
        evaluation_scores = evaluate_model_with_sampling(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            model_name=model,
            sampling_method=sampling_method,
        )
        all_results.append(evaluation_scores)

# Stack the per-pairing score tables and persist them as a single CSV file.
df_all_results = pd.concat(all_results)
df_all_results.to_csv('all_evaluations.csv', index=False)