In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, average_precision_score, f1_score, precision_score, recall_score

In [3]:
df = pd.read_csv('../data/01_creditcard.csv')
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [4]:
# Preprocessing
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])
# df = df.drop(['Time'], axis=1)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [5]:
# Split the data into features and target using time-aware split
df = df.sort_values(by='Time').reset_index(drop=True)
df = df.drop(['Time'], axis=1)

train, temp = train_test_split(df, test_size=0.4, shuffle=False)
val, test = train_test_split(temp, test_size=0.5, shuffle=False)

X_train, y_train = train.drop('Class', axis=1), train['Class']
X_val, y_val = val.drop('Class', axis=1), val['Class']
X_test, y_test = test.drop('Class', axis=1), test['Class']

In [6]:
# Handle imbalanced data
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.combine import SMOTEENN

random_state = 42

print("Class distribution before handling imbalanced data:")
print(y_train.value_counts())
print("\nClass proportion before handling imbalanced data:")
print(y_train.value_counts(normalize=True))

# 1. Regular SMOTE
sm = SMOTE(random_state=random_state)
resampled_data = sm.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled_data

# Print class proportion after SMOTE
print("Class distribution after SMOTE:")
print(y_train_smote.value_counts())
print("\nClass proportion after SMOTE:")
print(y_train_smote.value_counts(normalize=True))

# 2. Borderline SMOTE
blsmote = BorderlineSMOTE(random_state=random_state)
resampled_data_bl = blsmote.fit_resample(X_train, y_train)
X_train_blsmote, y_train_blsmote = resampled_data_bl

print("Class distribution after Borderline SMOTE:")
print(y_train_blsmote.value_counts())
print("\nClass proportion after Borderline SMOTE:")
print(y_train_blsmote.value_counts(normalize=True))

# 3. SMOTEENN
smote_enn = SMOTEENN(random_state=random_state)
resampled_data_se = smote_enn.fit_resample(X_train, y_train)
X_train_smoteenn, y_train_smoteenn = resampled_data_se

print("Class distribution after SMOTEENN:")
print(y_train_smoteenn.value_counts())
print("\nClass proportion after SMOTEENN:")
print(y_train_smoteenn.value_counts(normalize=True))

Class distribution before handling imbalanced data:
Class
0    170524
1       360
Name: count, dtype: int64

Class proportion before handling imbalanced data:
Class
0    0.997893
1    0.002107
Name: proportion, dtype: float64
Class distribution after SMOTE:
Class
0    170524
1    170524
Name: count, dtype: int64

Class proportion after SMOTE:
Class
0    0.5
1    0.5
Name: proportion, dtype: float64
Class distribution after Borderline SMOTE:
Class
0    170524
1    170524
Name: count, dtype: int64

Class proportion after Borderline SMOTE:
Class
0    0.5
1    0.5
Name: proportion, dtype: float64
Class distribution after SMOTEENN:
Class
1    170524
0    170214
Name: count, dtype: int64

Class proportion after SMOTEENN:
Class
1    0.500455
0    0.499545
Name: proportion, dtype: float64


In [16]:
# Implement GAN and CTGAN for data synthesis
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import warnings
warnings.filterwarnings('ignore')


# Define PyTorch GAN components
class Generator(nn.Module):
    def __init__(self, noise_dim, data_dim, hidden_dim=128):
        super(Generator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(noise_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim * 2),  # 128 -> 256
            nn.BatchNorm1d(hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim),  # 256 -> 128
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, data_dim),    # 128 -> data_dim
            nn.Tanh()
        )

    def forward(self, noise):
        return self.network(noise)

class Discriminator(nn.Module):
    def __init__(self, data_dim, hidden_dim=128):
        super(Discriminator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(data_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, 1),
            nn.Sigmoid()
        )

    def forward(self, data):
        return self.network(data)

class GANOversampler:
    def __init__(self, data_dim, noise_dim=100, hidden_dim=128, lr=0.0002, device='mps'):
        self.device = device
        self.noise_dim = noise_dim
        self.data_dim = data_dim

        self.generator = Generator(noise_dim, data_dim, hidden_dim).to(device)
        self.discriminator = Discriminator(data_dim, hidden_dim).to(device)

        self.gen_optimizer = optim.Adam(self.generator.parameters(), lr=lr, betas=(0.5, 0.999))
        self.disc_optimizer = optim.Adam(self.discriminator.parameters(), lr=lr, betas=(0.5, 0.999))

        self.criterion = nn.BCELoss()
        self.scaler = StandardScaler()

    def train(self, minority_data, epochs=500, batch_size=64, print_interval=100):
        minority_data_scaled = self.scaler.fit_transform(minority_data)
        minority_tensor = torch.FloatTensor(minority_data_scaled).to(self.device)
        dataloader = DataLoader(TensorDataset(minority_tensor), batch_size=batch_size, shuffle=True)

        gen_losses = []
        disc_losses = []

        for epoch in range(epochs):
            epoch_gen_loss = 0.0
            epoch_disc_loss = 0.0

            for batch_idx, (real_data,) in enumerate(dataloader):
                current_batch_size = real_data.size(0)

                # Train Discriminator
                self.disc_optimizer.zero_grad()

                real_labels = torch.ones(current_batch_size, 1).to(self.device)
                real_output = self.discriminator(real_data)
                real_loss = self.criterion(real_output, real_labels)

                noise = torch.randn(current_batch_size, self.noise_dim).to(self.device)
                fake_data = self.generator(noise)
                fake_labels = torch.zeros(current_batch_size, 1).to(self.device)
                fake_output = self.discriminator(fake_data.detach())
                fake_loss = self.criterion(fake_output, fake_labels)

                disc_loss = real_loss + fake_loss
                disc_loss.backward()
                self.disc_optimizer.step()

                # Train Generator
                self.gen_optimizer.zero_grad()
                noise = torch.randn(current_batch_size, self.noise_dim).to(self.device)
                fake_data = self.generator(noise)
                fake_output = self.discriminator(fake_data)
                gen_loss = self.criterion(fake_output, real_labels)

                gen_loss.backward()
                self.gen_optimizer.step()

                epoch_gen_loss += gen_loss.item()
                epoch_disc_loss += disc_loss.item()
            
            gen_losses.append(epoch_gen_loss / len(dataloader))
            disc_losses.append(epoch_disc_loss / len(dataloader))

            if (epoch + 1) % print_interval == 0:
                print(f"Epoch [{epoch + 1}/{epochs}], Gen Loss: {gen_losses[-1]:.4f}, Disc Loss: {disc_losses[-1]:.4f}")
    
        return gen_losses, disc_losses
        
    def generate_samples(self, n_samples):
        self.generator.eval()
        with torch.no_grad():
            noise = torch.randn(n_samples, self.noise_dim).to(self.device)
            synthetic_data = self.generator(noise)

        # Inverse transform the data to original scale
        synthetic_data_np = synthetic_data.cpu().numpy()
        synthetic_data_inv = self.scaler.inverse_transform(synthetic_data_np)

        return synthetic_data_inv

        
# Pytorch GAN-based oversampling function
def oversample_with_pytorch_gan(X_train, y_train, target_class=1, oversample_ratio=1.0, epochs=500, batch_size=64):
    """
    Oversample minority class using Pytorch GAN.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        target_class (int): The minority class label to oversample.
        oversample_ratio (float): The ratio of oversampling. E.g., 1.0 means double the minority class.
        epochs (int): Number of training epochs for GAN.
        batch_size (int): Batch size for GAN training.
    Returns:
        X_balanced (pd.DataFrame): Balanced training features.
        y_balanced (pd.Series): Balanced training labels.
        gen_losses (list): Generator losses during training.
        disc_losses (list): Discriminator losses during training.
    """
    # Convert to numpy arrays if needed
    X_np = X_train.values if isinstance(X_train, pd.DataFrame) else np.array(X_train)
    y_np = y_train.values if isinstance(y_train, pd.Series) else np.array(y_train)

    minority_mask = y_np == target_class
    majority_mask = y_np != target_class

    minority_data = X_np[minority_mask]
    n_majority = np.sum(majority_mask)
    n_minority = len(minority_data)

    print("Original class distribution:")
    print(f"Majority class (0): {n_majority} samples")
    print(f"Minority class (1): {n_minority} samples")

    # Calculate number of samples to generate
    n_generate = int(n_majority * oversample_ratio) - n_minority

    if n_generate <= 0:
        print("No oversampling needed")
        return X_train, y_train, [], []
    
    # Setup device
    device = 'mps' if torch.backends.mps.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Initialize and train GAN oversampler
    gan_oversampler = GANOversampler(
        data_dim=minority_data.shape[1],
        noise_dim=100,
        hidden_dim=128,
        lr=0.0002,
        device=device
    )

    print(f"Training PyTorch GAN for {epochs} epochs...")
    gen_losses, disc_losses = gan_oversampler.train(
        minority_data, 
        epochs=epochs, 
        batch_size=batch_size, 
        print_interval=100
    )
    
    # Generate synthetic samples
    print(f"Generating {n_generate} synthetic samples...")
    synthetic_data = gan_oversampler.generate_samples(n_generate)
    synthetic_labels = np.full(n_generate, target_class)

    # Combine original and synthetic data
    X_balanced = np.vstack([X_np, synthetic_data])
    y_balanced = np.concatenate([np.asarray(y_np), synthetic_labels])

    print(f"\nFinal class distribution after Pytorch GAN oversampling:")
    print(f"Majority class (0): {np.sum(y_balanced == 0)} samples")
    print(f"Minority class (1): {np.sum(y_balanced == 1)} samples")

    return X_balanced, y_balanced, gen_losses, disc_losses

# CTGAN-based oversampling function using Pytorch CTGAN
class CTGANGenerator(nn.Module):
    def __init__(self, noise_dim, data_dim, hidden_dim=128):  # FIX: swap order
        super(CTGANGenerator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(noise_dim, hidden_dim),  # Now correct: 100 -> 128
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, data_dim),  # 128 -> 29
            nn.Tanh()
        )

    def forward(self, noise):
        return self.network(noise)
    
class CTGANDiscriminator(nn.Module):
    def __init__(self, data_dim, hidden_dim=128):
        super(CTGANDiscriminator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(data_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, 1),
        )
    
    def forward(self, data):
        return self.network(data)
    
class SimplifiedCTGAN:
    def __init__(self, data_dim, noise_dim=100, hidden_dim=128, lr=2e-4, device='mps'):
        self.device = device
        self.noise_dim = noise_dim
        self.data_dim = data_dim

        self.generator = CTGANGenerator(noise_dim, data_dim, hidden_dim).to(device)
        self.discriminator = CTGANDiscriminator(data_dim, hidden_dim).to(device)

        self.gen_optimizer = optim.Adam(self.generator.parameters(), lr=lr, betas=(0.5, 0.9))
        self.disc_optimizer = optim.Adam(self.discriminator.parameters(), lr=lr, betas=(0.5, 0.9))

        self.scaler = StandardScaler()

    def compute_gradient_penalty(self, real_data, fake_data, lambda_gp=10):
        batch_size = real_data.size(0)
        alpha = torch.rand(batch_size, 1).to(self.device)
        alpha = alpha.expand_as(real_data)

        interpolates = alpha * real_data + (1 - alpha) * fake_data
        interpolates.requires_grad_(True)

        disc_interpolates = self.discriminator(interpolates)

        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones_like(disc_interpolates),
            create_graph=True,
            retain_graph=True,
        )[0]
        
        gradients = gradients.view(batch_size, -1)
        gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * lambda_gp
        
        return gradient_penalty
    
    def train(self, minority_data, epochs=500, batch_size=64, print_interval=100, n_critic=5):
        # Scale data
        minority_data_scaled = self.scaler.fit_transform(minority_data)
        minority_tensor = torch.FloatTensor(minority_data_scaled).to(self.device)
        dataloader = DataLoader(TensorDataset(minority_tensor), batch_size=batch_size, shuffle=True)

        gen_losses = []
        disc_losses = []

        print("="*60)
        for epoch in range(epochs):
            epoch_gen_loss = 0.0
            epoch_disc_loss = 0.0

            for batch_idx, (real_data,) in enumerate(dataloader):
                current_batch_size = real_data.size(0)

                # Train discriminator
                for _ in range(n_critic):
                    self.disc_optimizer.zero_grad()

                    # Real data
                    real_validity = self.discriminator(real_data)

                    noise = torch.randn(current_batch_size, self.noise_dim).to(self.device)
                    fake_data = self.generator(noise).detach()
                    fake_validity = self.discriminator(fake_data)

                    # Gradient penalty
                    gp = self.compute_gradient_penalty(real_data, fake_data)

                    # Wasserstein loss with gradient penalty
                    disc_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + gp
                    disc_loss.backward()
                    self.disc_optimizer.step()

                    epoch_disc_loss += disc_loss.item()

                # train generator
                self.gen_optimizer.zero_grad()

                noise = torch.randn(current_batch_size, self.noise_dim).to(self.device)
                fake_data = self.generator(noise)
                fake_validity = self.discriminator(fake_data)

                gen_loss = -torch.mean(fake_validity)
                gen_loss.backward()
                self.gen_optimizer.step()

                epoch_gen_loss += gen_loss.item()
            
            # Store average losses
            gen_losses.append(epoch_gen_loss / len(dataloader))
            disc_losses.append(epoch_disc_loss / (len(dataloader) * n_critic))

            # Print progress every print_interval epochs
            if (epoch + 1) % print_interval == 0:
                print(f"Epoch [{epoch + 1}/{epochs}], Gen Loss: {gen_losses[-1]:.4f}, Disc Loss: {disc_losses[-1]:.4f}")

        print("="*60)

        return gen_losses, disc_losses
    
    def generate_samples(self, n_samples):
        self.generator.eval()
        with torch.no_grad():
            noise = torch.randn(n_samples, self.noise_dim).to(self.device)
            synthetic_data = self.generator(noise)

        # Inverse transform the data to original scale
        synthetic_data_np = synthetic_data.cpu().numpy()
        synthetic_data_inv = self.scaler.inverse_transform(synthetic_data_np)

        return synthetic_data_inv
    
# Simplified CTGAN-based oversampling function
def oversample_with_ctgan(X_train, y_train, target_class=1, oversample_ratio=1.0, epochs=500, batch_size=64):
    """
    Oversample minority class using simplified CTGAN.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        target_class (int): The minority class label to oversample.
        oversample_ratio (float): The ratio of oversampling. E.g., 1.0 means double the minority class.
        epochs (int): Number of training epochs for CTGAN.
        batch_size (int): Batch size for CTGAN training.
    Returns:
        X_balanced (pd.DataFrame): Balanced training features.
        y_balanced (pd.Series): Balanced training labels.
        gen_losses (list): Generator losses during training.
        disc_losses (list): Discriminator losses during training.
    """
    # Convert to numpy arrays if needed
    X_np = X_train.values if isinstance(X_train, pd.DataFrame) else np.array(X_train)
    y_np = y_train.values if isinstance(y_train, pd.Series) else np.array(y_train)

    minority_mask = y_np == target_class
    majority_mask = y_np != target_class

    minority_data = X_np[minority_mask]
    n_majority = np.sum(majority_mask)
    n_minority = len(minority_data)

    print("Original class distribution:")
    print(f"Majority class (0): {n_majority} samples")
    print(f"Minority class (1): {n_minority} samples")

    # Calculate number of samples to generate
    n_generate = int(n_majority * oversample_ratio) - n_minority

    if n_generate <= 0:
        print("No oversampling needed")
        return X_train, y_train, [], []
    
    # Setup device
    device = 'mps' if torch.backends.mps.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Initialize and train CTGAN oversampler
    ctgan_oversampler = SimplifiedCTGAN(
        data_dim=minority_data.shape[1],
        noise_dim=100,
        hidden_dim=128,
        lr=0.0002,
        device=device
    )

    print(f"Training Simplified CTGAN for {epochs} epochs...")
    gen_losses, disc_losses = ctgan_oversampler.train(
        minority_data, 
        epochs=epochs, 
        batch_size=batch_size, 
        print_interval=100
    )

    # Generate synthetic samples
    print(f"Generating {n_generate} synthetic samples...")
    synthetic_samples = ctgan_oversampler.generate_samples(n_generate)
    synthetic_labels = np.full(n_generate, target_class, dtype=np.int64)

    # Combine data
    X_balanced = np.vstack([X_np, synthetic_samples])
    y_balanced = np.concatenate([np.asarray(y_np), synthetic_labels])

    print(f"\nFinal class distribution after CTGAN oversampling:")
    print(f"Majority class (0): {np.sum(y_balanced == 0)} samples")
    print(f"Minority class (1): {np.sum(y_balanced == 1)} samples")

    return X_balanced, y_balanced, gen_losses, disc_losses


### XGB Baseline Implementation

XGB on original data.

In [8]:
# Install OpenMP runtime for XGBoost on Mac
import subprocess
# import sys

try:
    # Try to install libomp using brew
    subprocess.run(['brew', 'install', 'libomp'], check=True, capture_output=True)
    print("OpenMP runtime installed successfully")
except subprocess.CalledProcessError:
    print("Failed to install OpenMP runtime. Please run 'brew install libomp' manually in terminal")
except FileNotFoundError:
    print("Homebrew not found. Please install Homebrew first or run 'brew install libomp' manually")

# Train a model
from xgboost import XGBClassifier

params = {
    'n_estimators': 300,
    'max_depth': 5,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': (len(y_train)/y_train.sum()),
}

xgb_model = XGBClassifier(
    **params,
    use_label_encoder=False
)

xgb_model.fit(
    X_train,
    y_train
)

OpenMP runtime installed successfully


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


### XGB on resampled data by SMOTE, BorderlineSMOTE, and SMOTENN

In [9]:
# Baseline 
# Train XGB on resampled data by SMOTE
xgb_smote = XGBClassifier(
    **params,
    use_label_encoder=False
)

xgb_smote.fit(
    X_train_smote,
    y_train_smote
)

# Train XGB on resampled data by Borderline SMOTE
xgb_blsmote = XGBClassifier(
    **params,
    use_label_encoder=False
)

xgb_blsmote.fit(
    X_train_blsmote,
    y_train_blsmote
)

# Train XGB on resampled data by SMOTEENN
xgb_smoteenn = XGBClassifier(
    **params,
    use_label_encoder=False
)

xgb_smoteenn.fit(
    X_train_smoteenn,
    y_train_smoteenn
)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


### XGB on resample data by Pytorch GAN

In [10]:
# Train XGB on resampled data by Pytorch GAN
X_train_gan, y_train_gan, gen_losses, disc_losses = oversample_with_pytorch_gan(
    X_train,
    y_train,
    target_class=1,
    oversample_ratio=1.0,
    epochs=500,
    batch_size=64
)
xgb_gan = XGBClassifier(
    **params,
    use_label_encoder=False
)
xgb_gan.fit(
    X_train_gan,
    y_train_gan
)

Original class distribution:
Majority class (0): 170524 samples
Minority class (1): 360 samples
Using device: mps
Training PyTorch GAN for 500 epochs...
Epoch [100/500], Gen Loss: 0.8100, Disc Loss: 1.1973
Epoch [200/500], Gen Loss: 0.9015, Disc Loss: 1.1577
Epoch [300/500], Gen Loss: 0.9829, Disc Loss: 1.1132
Epoch [400/500], Gen Loss: 0.9592, Disc Loss: 1.1144
Epoch [500/500], Gen Loss: 1.0218, Disc Loss: 1.1006
Generating 170164 synthetic samples...

Final class distribution after Pytorch GAN oversampling:
Majority class (0): 170524 samples
Minority class (1): 170524 samples


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


### XGB on Resampled Data by CTGAN (Pytorch)

In [17]:
# Train XGB on resampled data by CTGAN
X_train_ctgan, y_train_ctgan, gen_losses_ctgan, disc_losses_ctgan = oversample_with_ctgan(
    X_train,
    y_train,
    target_class=1,
    oversample_ratio=1.0,
    epochs=500,
    batch_size=64
)

xgb_ctgan = XGBClassifier(
    **params,
    use_label_encoder=False
)

xgb_ctgan.fit(
    X_train_ctgan,
    y_train_ctgan
)

Original class distribution:
Majority class (0): 170524 samples
Minority class (1): 360 samples
Using device: mps
Training Simplified CTGAN for 500 epochs...
Epoch [100/500], Gen Loss: -1.1678, Disc Loss: -2.7735
Epoch [200/500], Gen Loss: -1.1202, Disc Loss: -1.6862
Epoch [300/500], Gen Loss: -1.2915, Disc Loss: -0.8306
Epoch [400/500], Gen Loss: -0.8275, Disc Loss: -0.7478
Epoch [500/500], Gen Loss: -1.8303, Disc Loss: -1.6310
Generating 170164 synthetic samples...

Final class distribution after CTGAN oversampling:
Majority class (0): 170524 samples
Minority class (1): 170524 samples


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


### Calibrate and Evaluate Baseline Model

In [19]:
from sklearn.calibration import CalibratedClassifierCV

cal_xgb = CalibratedClassifierCV(xgb_model, method='sigmoid')
cal_xgb.fit(X_val, y_val)

# Evaluation baseline model

# PR-AUC
probs = xgb_model.predict_proba(X_test)[:, 1]
ap = average_precision_score(y_test, probs)
print(f"PR-AUC: {ap}")

# Precision, Recall, F1-Score at threshold 0.5
prec, rec, th = precision_recall_curve(y_test, probs)
# print(f"Precision-Recall curve points: {list(zip(prec, rec, th))}")

# Recall@Precision >= 90%
precision_threshold = 0.9
valid_points = [(r, p) for p, r in zip(prec, rec) if p >= precision_threshold]
rec_at_prec_90 = max([r for (r, p) in valid_points])
# print(f"Recall at Precision >= {precision_threshold*100}%: {rec_at_prec_90}")

# Standard metrics at threshold 0.5
pred = (probs >= 0.5).astype(int)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
print(f"F1-Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Evaluation calibrated model
probs_cal = cal_xgb.predict_proba(X_test)[:, 1]
ap_cal = average_precision_score(y_test, probs_cal)
print(f"Calibrated PR-AUC: {ap_cal}")

# Precision, Recall, F1-Score
prec_cal, rec_cal, th_cal = precision_recall_curve(y_test, probs_cal)
# print(f"Calibrated Precision-Recall curve points: {list(zip(prec_cal, rec_cal, th_cal))}")

# Recall@Precision >= 90%
precision_threshold = 0.9
valid_points_cal = [(r, p) for p, r in zip(prec_cal, rec_cal) if p >= precision_threshold]
rec_at_prec_90_cal = max([r for (r, p) in valid_points_cal])
# print(f"Calibrated Recall at Precision >= {precision_threshold*100}%: {rec_at_prec_90_cal}")

# Standard metrics at threshold 0.5
pred_cal = (probs_cal >= 0.5).astype(int)
f1_cal = f1_score(y_test, pred_cal)
precision_cal = precision_score(y_test, pred_cal)
recall_cal = recall_score(y_test, pred_cal)
print(f"Calibrated F1-Score: {f1_cal}")
print(f"Calibrated Precision: {precision_cal}")
print(f"Calibrated Recall: {recall_cal}")

PR-AUC: 0.7874331602718075
F1-Score: 0.7916666666666666
Precision: 0.8260869565217391
Recall: 0.76
Calibrated PR-AUC: 0.7858265261616966
Calibrated F1-Score: 0.8396946564885496
Calibrated Precision: 0.9821428571428571
Calibrated Recall: 0.7333333333333333


In [20]:
# Evaluate XGB on resampled data by SMOTE, Borderline SMOTE, SMOTEENN, Pytorch GAN, CTGAN

def evaluate_model(model, X_test, y_test, model_name="Model"):
    print(f"Evaluating {model_name}...")
    probs = model.predict_proba(X_test)[:, 1]
    ap = average_precision_score(y_test, probs)
    print(f"PR-AUC: {ap}")

    prec, rec, th = precision_recall_curve(y_test, probs)

    # Recall@Precision >= 90%
    precision_threshold = 0.9
    valid_points = [(r, p) for p, r in zip(prec, rec) if p >= precision_threshold]
    rec_at_prec_90 = max([r for (r, p) in valid_points])
    print(f"Recall at Precision >= {precision_threshold*100}%: {rec_at_prec_90}")

    # Standard metrics at threshold 0.5
    pred = (probs >= 0.5).astype(int)
    f1 = f1_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print(f"F1-Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print("-"*40)

evaluate_model(xgb_model, X_test, y_test, model_name="Baseline XGB")
evaluate_model(cal_xgb, X_test, y_test, model_name="Calibrated XGB")
evaluate_model(xgb_smote, X_test, y_test, model_name="XGB with SMOTE")
evaluate_model(xgb_blsmote, X_test, y_test, model_name="XGB with Borderline SMOTE")
evaluate_model(xgb_smoteenn, X_test, y_test, model_name="XGB with SMOTEENN")
evaluate_model(xgb_gan, X_test, y_test, model_name="XGB with Pytorch GAN Oversampling")
evaluate_model(xgb_ctgan, X_test, y_test, model_name="XGB with CTGAN Oversampling")

Evaluating Baseline XGB...
PR-AUC: 0.7874331602718075
Recall at Precision >= 90.0%: 0.7333333333333333
F1-Score: 0.7916666666666666
Precision: 0.8260869565217391
Recall: 0.76
----------------------------------------
Evaluating Calibrated XGB...
PR-AUC: 0.7858265261616966
Recall at Precision >= 90.0%: 0.7333333333333333
F1-Score: 0.8396946564885496
Precision: 0.9821428571428571
Recall: 0.7333333333333333
----------------------------------------
Evaluating XGB with SMOTE...
PR-AUC: 0.7644618669688699
Recall at Precision >= 90.0%: 0.7333333333333333
F1-Score: 0.1631578947368421
Precision: 0.0905109489051095
Recall: 0.8266666666666667
----------------------------------------
Evaluating XGB with Borderline SMOTE...
PR-AUC: 0.7788447245840217
Recall at Precision >= 90.0%: 0.76
F1-Score: 0.6413043478260869
Precision: 0.5412844036697247
Recall: 0.7866666666666666
----------------------------------------
Evaluating XGB with SMOTEENN...
PR-AUC: 0.7689247938069732
Recall at Precision >= 90.0%: 0.

In [21]:
def evaluate_models_to_dataframe(models_dict, X_test, y_test):
    """
    Evaluate multiple models and return results in a DataFrame.
    
    Args:
        models_dict (dict): Dictionary with model names as keys and model objects as values
        X_test: Test features
        y_test: Test labels
    
    Returns:
        pd.DataFrame: DataFrame containing all evaluation metrics for each model
    """
    results = []
    
    for model_name, model in models_dict.items():
        # Get predictions
        probs = model.predict_proba(X_test)[:, 1]
        pred = (probs >= 0.5).astype(int)
        
        # Calculate PR-AUC
        ap = average_precision_score(y_test, probs)
        
        # Calculate Precision-Recall curve
        prec, rec, th = precision_recall_curve(y_test, probs)
        
        # Calculate Recall@Precision >= 90%
        precision_threshold = 0.9
        valid_points = [(r, p) for p, r in zip(prec, rec) if p >= precision_threshold]
        rec_at_prec_90 = max([r for (r, p) in valid_points]) if valid_points else 0.0
        
        # Calculate standard metrics at threshold 0.5
        f1 = f1_score(y_test, pred)
        precision = precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        
        # Store results
        results.append({
            'Model': model_name,
            'PR-AUC': ap,
            'Recall@Precision>=90%': rec_at_prec_90,
            'F1-Score': f1,
            'Precision': precision,
            'Recall': recall
        })
    
    # Create DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

# Create dictionary of models
models_dict = {
    'Baseline XGB': xgb_model,
    'Calibrated XGB': cal_xgb,
    'XGB with SMOTE': xgb_smote,
    'XGB with Borderline SMOTE': xgb_blsmote,
    'XGB with SMOTEENN': xgb_smoteenn,
    'XGB with Pytorch GAN': xgb_gan,
    'XGB with CTGAN': xgb_ctgan
}

# Evaluate all models and get results as DataFrame
results_df = evaluate_models_to_dataframe(models_dict, X_test, y_test)
print(results_df)

                       Model    PR-AUC  Recall@Precision>=90%  F1-Score  \
0               Baseline XGB  0.787433               0.733333  0.791667   
1             Calibrated XGB  0.785827               0.733333  0.839695   
2             XGB with SMOTE  0.764462               0.733333  0.163158   
3  XGB with Borderline SMOTE  0.778845               0.760000  0.641304   
4          XGB with SMOTEENN  0.768925               0.746667  0.153266   
5       XGB with Pytorch GAN  0.756042               0.693333  0.753247   
6             XGB with CTGAN  0.777198               0.680000  0.770270   

   Precision    Recall  
0   0.826087  0.760000  
1   0.982143  0.733333  
2   0.090511  0.826667  
3   0.541284  0.786667  
4   0.084605  0.813333  
5   0.734177  0.773333  
6   0.780822  0.760000  
