In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/after-cwgan/taiwan_balanced.csv
/kaggle/input/after-cwgan/HCDR_test_cwgan.csv
/kaggle/input/after-cwgan/HCDR_train_cwgan.csv
/kaggle/input/after-cwgan/taiwan_test.csv
/kaggle/input/after-cwgan/hmeq_balanced.csv
/kaggle/input/after-cwgan/german_test.csv
/kaggle/input/after-cwgan/german_balanced.csv
/kaggle/input/after-cwgan/hmeq_test.csv
/kaggle/input/cwgan/pytorch/default/4/helpers.py
/kaggle/input/cwgan/pytorch/default/4/models.py
/kaggle/input/cwgan/pytorch/default/4/dataloader.py
/kaggle/input/dataset/FinalDatasetHomeCreditDefaultRisk.csv
/kaggle/input/dataset/HCDR_test_cwgan.csv
/kaggle/input/dataset/cleaned_hmeq.csv
/kaggle/input/dataset/UCI_Credit_Card.csv
/kaggle/input/dataset/HCDR_train_cwgan.csv
/kaggle/input/dataset/german.csv


We choose these models:\
RandomForestClassifier \
XGBoost\
CatBoost\
LightGBM\
StackingClassifier\

SMOTE (oversampling)\
ClusterCentroids (undersampling)\
SMOTEENN (hybrid: OS and US)\
cWGAN (NN)\

Balanced Random Forest (ensemble)\
Class Weights

In [2]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    make_scorer
)
from sklearn.model_selection import cross_validate, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns


class ModelEvaluatorWithLibrary:
    """Fit a collection of binary classifiers and score them on a held-out test set.

    Attributes:
        models: Mapping of display name -> estimator exposing ``fit`` and
            ``predict_proba`` (or at least ``predict``).
        threshold: Cut-off applied to the positive-class score to obtain hard
            0/1 predictions on the test set.
        cv_splits: Number of folds used by :meth:`evaluate_model_with_cv`.
    """

    def __init__(self, models, threshold=0.5, cv_splits=5):
        self.models = models
        self.threshold = threshold
        self.cv_splits = cv_splits

    def evaluate_model_with_cv(self, X, y, model, scoring_metrics):
        """
        Perform stratified k-fold cross-validation using the provided metrics.

        Args:
            X: Feature matrix.
            y: Target labels.
            model: Estimator to cross-validate.
            scoring_metrics: Mapping of result name -> scorer passed to
                ``cross_validate``.

        Returns:
            dict mapping each key of ``scoring_metrics`` to its mean score
            across the folds.

        Note:
            ``self.threshold`` is not used here; the scorers decide how
            predictions are produced.
        """
        cv = StratifiedKFold(n_splits=self.cv_splits, shuffle=True, random_state=42)
        cv_results = cross_validate(
            model, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False
        )

        # Mean scores for each metric
        model_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in scoring_metrics}
        return model_scores

    def evaluate_model_on_test(self, X_train, y_train, X_test, y_test, model):
        """
        Train the model and evaluate it on the test set.

        The model is fitted in place on the training data. When it has no
        ``predict_proba`` method, the output of ``predict`` is used as the
        score instead.

        Returns:
            tuple ``(scores, y_pred)`` where ``scores`` holds ``'accuracy'``,
            ``'f1'`` and ``'auc_roc'`` and ``y_pred`` are the thresholded 0/1
            predictions.
        """
        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else model.predict(X_test)
        y_pred = (y_prob >= self.threshold).astype(int)

        scores = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            # AUC is computed on the continuous scores, not on the thresholded labels.
            'auc_roc': roc_auc_score(y_test, y_prob),
        }

        return scores, y_pred

    def score_models(self, X_train, y_train, X_test, y_test, include_cv=False):
        """
        Evaluate all models on the test set and, optionally, with cross-validation.

        Args:
            X_train, y_train: Training features and labels.
            X_test, y_test: Held-out features and labels.
            include_cv: When True, also cross-validate each model on the
                training data and report the mean fold scores under the
                ``'cross_val'`` key. Defaults to False so that the expensive
                cross-validation is not run when its results are not reported.

        Returns:
            dict mapping model name -> ``{'test': {...}}``, plus a
            ``'cross_val'`` entry per model when ``include_cv`` is True.
        """
        scoring_metrics = {
            'f1': 'f1',
            'accuracy': 'accuracy',
            'auc_roc': 'roc_auc'
        }

        scores = {}

        for name, model in self.models.items():
            print(f"Evaluating model: {name}")

            model_result = {}

            # Cross-validation is costly (cv_splits extra fits per model), so it
            # only runs when the caller actually wants the numbers.
            if include_cv:
                model_result['cross_val'] = self.evaluate_model_with_cv(
                    X_train, y_train, model, scoring_metrics
                )

            # Test set evaluation
            test_scores, _y_pred = self.evaluate_model_on_test(X_train, y_train, X_test, y_test, model)
            model_result['test'] = test_scores

            scores[name] = model_result

        return scores

    def plot_confusion_matrix(self, y_test, y_pred, labels):
        """
        Plot the confusion matrix as an annotated heatmap.

        Args:
            y_test: True labels.
            y_pred: Predicted labels.
            labels: Class values in display order. They fix the row/column
                order of the matrix and are also used as tick labels, so the
                ticks always line up with the cells even if a class is absent
                from ``y_test``/``y_pred``.
        """
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        plt.figure(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.show()

In [3]:
# Seed passed as `random_state` to the tree-based estimators built in the model-definition cell.
RANDOM_STATE = 42
# Decision cut-off; presumably meant to correspond to the evaluator's `threshold` argument
# (which has the same 0.5 default) — confirm how it is wired in.
threshold = 0.5

In [4]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Candidate classifiers keyed by display name. Each gets its own estimator
# instance; the stacking ensemble builds separate copies of the four base
# learners and combines them with a logistic-regression meta-learner.
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBClassifier': XGBClassifier(random_state=RANDOM_STATE),
    'CatBoostClassifier': CatBoostClassifier(verbose=False, random_state=RANDOM_STATE),
    'LightGBMClassifier': LGBMClassifier(
        random_state=RANDOM_STATE,
        verbose=-1
    ),
    'StackingClassifier': StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE)),
            ('catboost', CatBoostClassifier(verbose=False, random_state=RANDOM_STATE)),
            ('lgbm', LGBMClassifier(
                random_state=RANDOM_STATE,
                verbose=-1
            ))
        ],
        final_estimator=LogisticRegression(),
        cv=5
    )
}

# Initialize evaluator. Pass the notebook-level `threshold` explicitly so that
# editing it in the configuration cell actually changes the decision cut-off
# (previously the evaluator silently used its own default).
evaluator = ModelEvaluatorWithLibrary(models=models, threshold=threshold)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

# Explicit training-CSV -> test-CSV mapping. A dict lookup fails loudly with a
# KeyError for an unmapped training file, instead of silently reusing the test
# path left over from the previous loop iteration.
TEST_PATH_BY_TRAIN_PATH = {
    '/kaggle/input/after-cwgan/german_balanced.csv': '/kaggle/input/after-cwgan/german_test.csv',
    '/kaggle/input/after-cwgan/hmeq_balanced.csv': '/kaggle/input/after-cwgan/hmeq_test.csv',
    '/kaggle/input/after-cwgan/taiwan_balanced.csv': '/kaggle/input/after-cwgan/taiwan_test.csv',
    '/kaggle/input/after-cwgan/HCDR_train_cwgan.csv': '/kaggle/input/after-cwgan/HCDR_test_cwgan.csv',
}
paths = list(TEST_PATH_BY_TRAIN_PATH)

# Only 'no' is active; 'smote', 'cc', 'smoteenn' and 'smotetomek' were listed
# as disabled alternatives. The value is currently only printed in the run
# header — no resampling is applied in this cell.
balmethods = ['no']

for path in paths:
    pathtest = TEST_PATH_BY_TRAIN_PATH[path]

    for balmethod in balmethods:
        print('-'*100)
        print('-'*100)
        print(f'DATASET: {path}')
        print(f'BALANCING METHOD: {balmethod}')
        print('')

        train_df = pd.read_csv(path)
        test_df = pd.read_csv(pathtest)

        # Train and test sets come from separate files; both carry the label
        # in a column named 'target'.
        X_train = train_df.drop('target', axis = 1)
        X_test = test_df.drop('target', axis = 1)
        y_train = train_df['target']
        y_test = test_df['target']

        scores = evaluator.score_models(X_train, y_train, X_test, y_test)
        print(scores)

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
DATASET: /kaggle/input/after-cwgan/german_balanced.csv
BALANCING METHOD: no

Evaluating model: RandomForestClassifier
Evaluating model: XGBClassifier
Evaluating model: CatBoostClassifier
Evaluating model: LightGBMClassifier
Evaluating model: StackingClassifier
{'RandomForestClassifier': {'test': {'accuracy': 0.74, 'f1': 0.43478260869565216, 'auc_roc': 0.764702380952381}}, 'XGBClassifier': {'test': {'accuracy': 0.78, 'f1': 0.5510204081632654, 'auc_roc': 0.7578571428571429}}, 'CatBoostClassifier': {'test': {'accuracy': 0.76, 'f1': 0.5294117647058824, 'auc_roc': 0.7773809523809524}}, 'LightGBMClassifier': {'test': {'accuracy': 0.77, 'f1': 0.5490196078431373, 'auc_roc': 0.7682142857142857}}, 'StackingClassifier': {'test': {'accuracy': 0.75, 'f1': 0.5283018867924527, 'auc_roc': 0.779880952380