In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in files_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cwgan/pytorch/default/1/cWGAN-based oversampling tutorial.ipynb
/kaggle/input/cwgan/pytorch/default/1/helpers.py
/kaggle/input/cwgan/pytorch/default/1/models.py
/kaggle/input/cwgan/pytorch/default/1/dataloader.py
/kaggle/input/dataset/FinalDatasetHomeCreditDefaultRisk.csv
/kaggle/input/dataset/cleaned_hmeq.csv
/kaggle/input/dataset/UCI_Credit_Card.csv
/kaggle/input/dataset/german.csv
/kaggle/input/cwganagain/pytorch/default/1/helpers.py
/kaggle/input/cwganagain/pytorch/default/1/models.py
/kaggle/input/cwganagain/pytorch/default/1/dataloader.py


In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline

# Resampling pipeline built with imblearn's make_pipeline.
# SMOTE is seeded so the synthetic minority samples are reproducible across runs.
# NOTE: despite the variable name, the TomekLinks cleaning step is currently
# disabled; add `TomekLinks()` as a second step to re-enable it.
smote_tomek_pipeline = make_pipeline(
    SMOTE(random_state=42),  # Oversample the minority class
)

In [3]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    make_scorer
)
from sklearn.model_selection import cross_validate, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns


class ModelEvaluatorWithLibrary:
    """Fit a set of binary classifiers and score them on a hold-out set.

    Attributes:
        models: Mapping of display name -> unfitted estimator exposing ``fit``
            and either ``predict_proba`` or ``predict``.
        threshold: Cut-off applied to the positive-class score to obtain hard
            labels on the hold-out set.
        cv_splits: Number of stratified folds used by cross-validation.
    """

    def __init__(self, models, threshold=0.5, cv_splits=5):
        self.models = models
        self.threshold = threshold
        self.cv_splits = cv_splits

    def evaluate_model_with_cv(self, X, y, model, scoring_metrics):
        """
        Cross-validate ``model`` and return the mean of each requested metric.

        Args:
            X, y: Training features and labels.
            model: Estimator to evaluate (passed to ``cross_validate``).
            scoring_metrics: Mapping of result key -> scorer name/callable.

        Returns:
            dict mapping each key of ``scoring_metrics`` to its mean test-fold score.

        Note:
            The scorers are evaluated by ``cross_validate`` itself, so
            ``self.threshold`` is NOT applied to these metrics.
        """
        # Fixed seed so every model is evaluated on the same folds.
        cv = StratifiedKFold(n_splits=self.cv_splits, shuffle=True, random_state=42)
        cv_results = cross_validate(
            model, X, y, cv=cv, scoring=scoring_metrics, return_train_score=False
        )

        # Mean scores for each metric
        return {metric: cv_results[f'test_{metric}'].mean() for metric in scoring_metrics}

    def evaluate_model_on_test(self, X_train, y_train, X_test, y_test, model):
        """
        Fit ``model`` on the training data and score it on the test set.

        Returns:
            (scores, y_pred) where ``scores`` has keys ``'accuracy'``, ``'f1'``
            and ``'auc_roc'`` and ``y_pred`` holds the thresholded 0/1 labels.
        """
        model.fit(X_train, y_train)

        if hasattr(model, 'predict_proba'):
            # Column 1 is the score of the second (positive) class.
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            # No probabilities available: the hard predictions double as scores,
            # so AUC is computed on labels rather than on a continuous score.
            y_prob = model.predict(X_test)
        y_pred = (y_prob >= self.threshold).astype(int)

        scores = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'auc_roc': roc_auc_score(y_test, y_prob),
        }

        return scores, y_pred

    def score_models(self, X_train, y_train, X_test, y_test, include_cv=False):
        """
        Evaluate every model in ``self.models`` on the test set.

        Args:
            X_train, y_train: Data used to fit each model (and for CV if enabled).
            X_test, y_test: Hold-out data used for the reported test metrics.
            include_cv: When True, additionally run stratified cross-validation
                on the training data and store the mean scores under
                ``'cross_val'``. Defaults to False because the cross-validation
                results were previously computed and then thrown away, which
                multiplied the run time without affecting the returned scores.

        Returns:
            dict: ``{model_name: {'test': {...}}}`` and, when ``include_cv`` is
            True, an extra ``'cross_val'`` entry per model.
        """
        scoring_metrics = {
            'f1': 'f1',
            'accuracy': 'accuracy',
            'auc_roc': 'roc_auc'
        }

        scores = {}

        for name, model in self.models.items():
            print(f"Evaluating model: {name}")

            model_scores = {}

            if include_cv:
                # cross_validate works on clones, so it does not affect the
                # hold-out fit performed below.
                model_scores['cross_val'] = self.evaluate_model_with_cv(
                    X_train, y_train, model, scoring_metrics
                )

            # Test set evaluation (fits `model` in place on the full training data)
            test_scores, _ = self.evaluate_model_on_test(X_train, y_train, X_test, y_test, model)
            model_scores['test'] = test_scores

            scores[name] = model_scores

        return scores

    def plot_confusion_matrix(self, y_test, y_pred, labels):
        """
        Plot the confusion matrix as an annotated heatmap.

        ``labels`` fixes both the row/column order of the matrix and the tick
        labels, so the two stay aligned even if a class is missing from
        ``y_test`` or ``y_pred``.
        """
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        plt.figure(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.show()


In [4]:
# Seed handed to the estimators as `random_state`.
RANDOM_STATE: int = 42
# Probability cut-off for the positive class (same value as the evaluator's default).
threshold: float = 0.5

In [5]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.ensemble import (
    RandomForestClassifier,
    StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

# Per-dataset settings: path -> (target column, identifier column to drop or None).
dataset_settings = {
    '/kaggle/input/dataset/german.csv': ('Status_loan', None),
    '/kaggle/input/dataset/cleaned_hmeq.csv': ('BAD', None),
    '/kaggle/input/dataset/UCI_Credit_Card.csv': ('default.payment.next.month', 'ID'),
    '/kaggle/input/dataset/FinalDatasetHomeCreditDefaultRisk.csv': ('TARGET', 'SK_ID_CURR'),
}
# Only this dataset is subsampled before modelling (see the loop body).
SUBSAMPLED_DATASET = '/kaggle/input/dataset/FinalDatasetHomeCreditDefaultRisk.csv'

paths = list(dataset_settings)


def _make_base_estimators(class_weights, scale_pos_weight):
    """Return fresh, unfitted cost-sensitive base learners keyed by short name.

    Called once for the stand-alone models and once for the stacking ensemble
    so the two sets never share estimator instances.
    """
    return {
        'rf': RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced"),
        'xgb': XGBClassifier(random_state=RANDOM_STATE, scale_pos_weight=scale_pos_weight),
        'catboost': CatBoostClassifier(verbose=False, random_state=RANDOM_STATE, class_weights=class_weights),
        'lgbm': LGBMClassifier(random_state=RANDOM_STATE, class_weight="balanced", verbose=-1),
    }


for path in paths:
    print('-'*100)
    print('-'*100)
    print(f'DATASET: {path}')
    print('')
    df = pd.read_csv(path)

    # Infinite values are mapped to 0 (not NaN), so the imputers below never see them.
    df = df.replace([np.inf, -np.inf], 0)

    # Looking the settings up per path means an unknown path fails loudly with
    # a KeyError instead of silently reusing the previous dataset's target.
    target_col, id_col = dataset_settings[path]
    if id_col is not None:
        df = df.drop(id_col, axis=1)

    if path == SUBSAMPLED_DATASET:
        # Keep a stratified 40% of the rows and discard the rest to work on a
        # smaller set (the original note mentions ~100000 of ~200000 samples
        # -- TODO confirm against the actual file size).
        df = train_test_split(df, test_size=0.6, random_state=2024, stratify=df[target_col])[0]

    # Define columns dynamically based on data types
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Remove target column from cat_cols or num_cols if present
    if target_col in cat_cols:
        cat_cols.remove(target_col)
    elif target_col in num_cols:
        num_cols.remove(target_col)

    # NOTE: astype(str) turns missing values into the literal string 'nan', so
    # missing categoricals end up as their own category rather than being imputed.
    for col in cat_cols:
        df[col] = df[col].astype(str)

    X = df.loc[:, num_cols + cat_cols]
    y = df[target_col]

    # Target name followed by the class-0 and class-1 counts.
    print(target_col)
    print(y[y==0].count())
    print(y[y==1].count())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024, stratify=y)

    # Numerical columns: mean imputation followed by scaling to [0, 1].
    num_prep = make_pipeline(SimpleImputer(strategy='mean'),
                             MinMaxScaler())

    # Categorical columns: mode imputation followed by one-hot encoding.
    # handle_unknown='ignore' encodes categories that appear only in the test
    # split as all-zero rows instead of raising during transform().
    cat_prep = make_pipeline(SimpleImputer(strategy='most_frequent'),
                             OneHotEncoder(handle_unknown='ignore'))

    # Combine both transformers into a ColumnTransformer
    prep = ColumnTransformer([
        ('num', num_prep, num_cols),
        ('cat', cat_prep, cat_cols)
        ],
        remainder='drop')

    # Fit the preprocessing on the training split only, then apply it to both splits.
    X_train_trans = prep.fit_transform(X_train)
    X_test_trans = prep.transform(X_test)

    # Oversampling is currently disabled; imbalance is handled via class weights below.
    # To re-enable:
    #X_train_trans, y_train = smote_tomek_pipeline.fit_resample(X_train_trans, y_train)

    # With resampling disabled these are the shapes of the original training
    # split (the "Balanced" wording in the messages is kept as-is).
    print(f"Balanced X_train shape: {X_train_trans.shape}")
    print(f"Balanced y_train shape: {y_train.shape}")

    # Rebind the working names to the transformed training data, as before.
    X_train = X_train_trans
    X_test = X_test_trans
    X = X_train
    y = y_train

    # 'balanced' weights, keyed by the actual class labels rather than by position.
    classes = np.unique(y_train)
    class_weight_values = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y_train
        )
    class_weights = dict(zip(classes.tolist(), class_weight_values.tolist()))

    # Ratio of balanced weights w1 / w0 equals n_negative / n_positive
    # (assumes the target is encoded as 0 = negative, 1 = positive).
    scale_pos_weight = class_weights[1] / class_weights[0]

    standalone = _make_base_estimators(class_weights, scale_pos_weight)

    # Updated models
    models = {
        'RandomForestClassifier': standalone['rf'],
        'XGBClassifier': standalone['xgb'],
        'CatBoostClassifier': standalone['catboost'],
        'LightGBMClassifier': standalone['lgbm'],
        'StackingClassifier': StackingClassifier(
            estimators=list(_make_base_estimators(class_weights, scale_pos_weight).items()),
            final_estimator=LogisticRegression(class_weight="balanced"),
            cv=5
        ),
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(
            random_state=RANDOM_STATE, sampling_strategy='not minority', replacement=True, bootstrap=False
        )
    }

    # Initialize evaluator with the notebook-level decision threshold
    evaluator = ModelEvaluatorWithLibrary(models=models, threshold=threshold)

    scores = evaluator.score_models(X_train, y_train, X_test, y_test)
    print(scores)

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
DATASET: /kaggle/input/dataset/german.csv

Status_loan
700
300
Balanced X_train shape: (800, 61)
Balanced y_train shape: (800,)
Evaluating model: RandomForestClassifier
Evaluating model: XGBClassifier
Evaluating model: CatBoostClassifier
Evaluating model: LightGBMClassifier
Evaluating model: StackingClassifier
Evaluating model: BalancedRandomForestClassifier
{'RandomForestClassifier': {'test': {'accuracy': 0.78, 'f1': 0.5510204081632654, 'auc_roc': 0.7738690476190477}}, 'XGBClassifier': {'test': {'accuracy': 0.75, 'f1': 0.576271186440678, 'auc_roc': 0.7825}}, 'CatBoostClassifier': {'test': {'accuracy': 0.77, 'f1': 0.640625, 'auc_roc': 0.8141666666666667}}, 'LightGBMClassifier': {'test': {'accuracy': 0.73, 'f1': 0.55, 'auc_roc': 0.7602380952380953}}, 'StackingClassifier': {'test': {'accur