# Financial fraud detection Transformer model

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from bayes_opt import BayesianOptimization
from tqdm import tqdm

# Data Preprocessing & Feature Engineering

In [12]:
# Load dataset
df = pd.read_csv('fraudTest.csv')

# Fill missing values
df.fillna(method='ffill', inplace=True)

# Outlier detection and replacement
df['amt'] = np.where(df['amt'] > df['amt'].quantile(0.95), df['amt'].quantile(0.95), df['amt'])
df['amt'] = np.where(df['amt'] < df['amt'].quantile(0.05), df['amt'].quantile(0.05), df['amt'])

# Drop non-essential columns
columns_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num', 'unix_time']
df.drop(columns=columns_to_drop, axis=1, inplace=True)

# Convert dates and extract components
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year
df['hour'] = df['trans_date_trans_time'].dt.hour
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
df.drop(columns=['trans_date_trans_time', 'dob'], axis=1, inplace=True)

# Scale numerical features
scaler = StandardScaler()
numerical_columns = ['amt', 'age', 'hour', 'day_of_week']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Define and encode categorical columns
categorical_columns = ['gender', 'category', 'state', 'merchant', 'city', 'zip', 'job']
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_columns}
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])

# Separate features and target
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Applying SMOTE to balance the dataset
smote = SMOTE(sampling_strategy=0.2, random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Convert data to tensors
X_train_categorical = torch.tensor(X_train[categorical_columns].values, dtype=torch.long)
X_train_numerical = torch.tensor(X_train[numerical_columns].values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

X_test_categorical = torch.tensor(X_test[categorical_columns].values, dtype=torch.long)
X_test_numerical = torch.tensor(X_test[numerical_columns].values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Dataset and DataLoader
train_dataset = TensorDataset(X_train_categorical, X_train_numerical, y_train)
test_dataset = TensorDataset(X_test_categorical, X_test_numerical, y_test)

# Embedding dimensions
embedding_sizes = [(len(label_encoders[col].classes_), min(10, (len(label_encoders[col].classes_)+1)//2)) for col in categorical_columns]
num_numerical_features = X_train_numerical.shape[1]

  df.fillna(method='ffill', inplace=True)


## Create TransformerModel

In [13]:
class TransformerModel(nn.Module):
    def __init__(self, num_numerical, embedding_sizes, num_classes=1, nhead=4, num_layers=1, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num_classes, size) for num_classes, size in embedding_sizes])
        total_emb_size = sum(size for _, size in embedding_sizes)
        d_model = total_emb_size + num_numerical
        
        if d_model % nhead != 0:
            while d_model % nhead != 0:
                nhead -= 1
            print(f"Adjusted 'nhead' to {nhead} to be divisible by 'd_model' {d_model}")
        
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=nhead, 
            dim_feedforward=512,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(d_model, num_classes)
        self.activation = nn.Sigmoid()
    
    def forward(self, x_cat, x_num):
        embeddings = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embeddings)]
        x_emb = torch.cat(embeddings, dim=1)
        x_combined = torch.cat((x_emb, x_num), dim=1).unsqueeze(1)
        x_encoded = self.transformer_encoder(x_combined)
        x_out = self.linear(x_encoded.squeeze(1))
        return self.activation(x_out)

## Hyperparameter tuning

In [None]:
def train_evaluate(learning_rate, dropout, num_layers, batch_size):
    model = TransformerModel(
        num_numerical=num_numerical_features,
        embedding_sizes=embedding_sizes,
        nhead=4,
        num_layers=int(num_layers),
        dropout=dropout
    )
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.BCELoss(weight=torch.tensor([15.0]))  # Increased weight for positive class
    
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=int(batch_size), shuffle=False)
    
    best_score = 0
    best_model_state = None
    patience = 3
    counter = 0
    
    for epoch in range(8):  # Increased maximum epochs
        model.train()
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} Training")
        for X_cat_batch, X_num_batch, y_batch in train_pbar:
            optimizer.zero_grad()
            outputs = model(X_cat_batch, X_num_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_pbar.set_postfix({'Loss': f'{loss.item():.4f}'})
        
        model.eval()
        y_true = []
        y_pred = []
        test_pbar = tqdm(test_loader, desc=f"Epoch {epoch+1} Evaluation")
        with torch.no_grad():
            for X_cat, X_num, y in test_pbar:
                outputs = model(X_cat, X_num)
                predictions = (outputs > 0.5).float()
                y_true.extend(y.tolist())
                y_pred.extend(predictions.flatten().tolist())
        
        f1 = f1_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        weighted_score = 0.3 * f1 + 0.7 * recall  # Weighted average
        print(f"Epoch {epoch+1}, F1: {f1:.4f}, Recall: {recall:.4f}, Weighted Score: {weighted_score:.4f}")
        
        if weighted_score == 0:
            print(f"Weighted Score is 0. Stopping early and moving to next parameter combination.")
            return 0  # Return 0 to indicate this parameter combination is not good
        
        if weighted_score > best_score:
            best_score = weighted_score
            best_model_state = model.state_dict()
            counter = 0
        else:
            counter += 1
        
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    # Load the best model state
    model.load_state_dict(best_model_state)
    return best_score

In [14]:
# Bayesian Optimization
pbounds = {
    'learning_rate': (1e-5, 1e-3),
    'dropout': (0.1, 0.5),
    'num_layers': (1, 3),
    'batch_size': (32, 256)
}

optimizer = BayesianOptimization(
    f=train_evaluate,
    pbounds=pbounds,
    random_state=42,
)

print("Starting Bayesian Optimization...")
optimizer.maximize(
    init_points=3,
    n_iter=6,
)

print("Best parameters:", optimizer.max['params'])
print("Best score (average of F1 and Recall):", optimizer.max['target'])



Starting Bayesian Optimization...
|   iter    |  target   | batch_... |  dropout  | learni... | num_la... |
-------------------------------------------------------------------------
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 4622/4622 [00:56<00:00, 81.23it/s, Loss=1.1164]
Epoch 1 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 334.87it/s]


Epoch 1, F1: 0.2062, Recall: 0.7343, Weighted Score: 0.5758


Epoch 2 Training: 100%|██████████| 4622/4622 [00:59<00:00, 78.09it/s, Loss=2.1134]
Epoch 2 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 349.74it/s]


Epoch 2, F1: 0.2709, Recall: 0.8135, Weighted Score: 0.6507


Epoch 3 Training: 100%|██████████| 4622/4622 [00:59<00:00, 78.09it/s, Loss=0.4923]
Epoch 3 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 333.45it/s]


Epoch 3, F1: 0.3625, Recall: 0.7925, Weighted Score: 0.6635


Epoch 4 Training: 100%|██████████| 4622/4622 [00:57<00:00, 80.86it/s, Loss=0.0712]
Epoch 4 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 322.54it/s]


Epoch 4, F1: 0.3937, Recall: 0.7809, Weighted Score: 0.6647


Epoch 5 Training: 100%|██████████| 4622/4622 [00:55<00:00, 82.82it/s, Loss=0.0217]
Epoch 5 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 349.47it/s]


Epoch 5, F1: 0.5023, Recall: 0.7692, Weighted Score: 0.6891


Epoch 6 Training: 100%|██████████| 4622/4622 [00:55<00:00, 83.09it/s, Loss=2.7751]
Epoch 6 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 330.56it/s]


Epoch 6, F1: 0.4708, Recall: 0.7879, Weighted Score: 0.6927


Epoch 7 Training: 100%|██████████| 4622/4622 [00:55<00:00, 82.65it/s, Loss=0.0387]
Epoch 7 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 326.24it/s]


Epoch 7, F1: 0.5402, Recall: 0.7902, Weighted Score: 0.7152


Epoch 8 Training: 100%|██████████| 4622/4622 [00:55<00:00, 82.85it/s, Loss=0.0073]
Epoch 8 Evaluation: 100%|██████████| 967/967 [00:02<00:00, 355.18it/s]


Epoch 8, F1: 0.5516, Recall: 0.7786, Weighted Score: 0.7105
| [39m1        [39m | [39m0.7152   [39m | [39m115.9    [39m | [39m0.4803   [39m | [39m0.0007347[39m | [39m2.197    [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 8052/8052 [01:21<00:00, 99.25it/s, Loss=2.2967] 
Epoch 1 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 461.91it/s]


Epoch 1, F1: 0.1525, Recall: 0.6830, Weighted Score: 0.5238


Epoch 2 Training: 100%|██████████| 8052/8052 [01:25<00:00, 94.19it/s, Loss=2.1168] 
Epoch 2 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 476.35it/s]


Epoch 2, F1: 0.2151, Recall: 0.7133, Weighted Score: 0.5638


Epoch 3 Training: 100%|██████████| 8052/8052 [01:25<00:00, 94.67it/s, Loss=2.9540] 
Epoch 3 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 518.55it/s]


Epoch 3, F1: 0.2041, Recall: 0.7902, Weighted Score: 0.6144


Epoch 4 Training: 100%|██████████| 8052/8052 [01:27<00:00, 92.46it/s, Loss=1.6200]
Epoch 4 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 467.20it/s]


Epoch 4, F1: 0.2337, Recall: 0.7995, Weighted Score: 0.6298


Epoch 5 Training: 100%|██████████| 8052/8052 [01:27<00:00, 91.62it/s, Loss=1.7209] 
Epoch 5 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 503.35it/s]


Epoch 5, F1: 0.2579, Recall: 0.8135, Weighted Score: 0.6468


Epoch 6 Training: 100%|██████████| 8052/8052 [01:24<00:00, 95.14it/s, Loss=1.1272] 
Epoch 6 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 515.97it/s]


Epoch 6, F1: 0.2648, Recall: 0.8298, Weighted Score: 0.6603


Epoch 7 Training: 100%|██████████| 8052/8052 [01:21<00:00, 99.24it/s, Loss=1.1042] 
Epoch 7 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 494.43it/s]


Epoch 7, F1: 0.3186, Recall: 0.8042, Weighted Score: 0.6585


Epoch 8 Training: 100%|██████████| 8052/8052 [01:22<00:00, 97.72it/s, Loss=0.4080] 
Epoch 8 Evaluation: 100%|██████████| 1684/1684 [00:03<00:00, 551.49it/s]


Epoch 8, F1: 0.3287, Recall: 0.7995, Weighted Score: 0.6583
| [39m2        [39m | [39m0.6603   [39m | [39m66.95    [39m | [39m0.1624   [39m | [39m6.75e-05 [39m | [39m2.732    [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 3202/3202 [00:43<00:00, 73.42it/s, Loss=4.9232]
Epoch 1 Evaluation: 100%|██████████| 670/670 [00:02<00:00, 233.91it/s]


Epoch 1, F1: 0.0876, Recall: 0.6620, Weighted Score: 0.4897


Epoch 2 Training: 100%|██████████| 3202/3202 [00:51<00:00, 62.72it/s, Loss=3.8592]
Epoch 2 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 214.76it/s]


Epoch 2, F1: 0.1002, Recall: 0.6667, Weighted Score: 0.4967


Epoch 3 Training: 100%|██████████| 3202/3202 [00:56<00:00, 56.55it/s, Loss=4.9703]
Epoch 3 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 198.15it/s]


Epoch 3, F1: 0.1188, Recall: 0.6643, Weighted Score: 0.5007


Epoch 4 Training: 100%|██████████| 3202/3202 [00:57<00:00, 56.06it/s, Loss=0.9382]
Epoch 4 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 207.46it/s]


Epoch 4, F1: 0.1320, Recall: 0.6550, Weighted Score: 0.4981


Epoch 5 Training: 100%|██████████| 3202/3202 [00:56<00:00, 56.84it/s, Loss=2.4836]
Epoch 5 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 197.00it/s]


Epoch 5, F1: 0.1409, Recall: 0.6667, Weighted Score: 0.5089


Epoch 6 Training: 100%|██████████| 3202/3202 [00:56<00:00, 57.12it/s, Loss=2.9732]
Epoch 6 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 189.26it/s]


Epoch 6, F1: 0.1497, Recall: 0.6876, Weighted Score: 0.5263


Epoch 7 Training: 100%|██████████| 3202/3202 [00:56<00:00, 56.84it/s, Loss=3.8873]
Epoch 7 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 196.77it/s]


Epoch 7, F1: 0.1504, Recall: 0.6853, Weighted Score: 0.5248


Epoch 8 Training: 100%|██████████| 3202/3202 [00:55<00:00, 57.18it/s, Loss=5.2560]
Epoch 8 Evaluation: 100%|██████████| 670/670 [00:03<00:00, 200.26it/s]


Epoch 8, F1: 0.1598, Recall: 0.6946, Weighted Score: 0.5342
| [39m3        [39m | [39m0.5342   [39m | [39m166.6    [39m | [39m0.3832   [39m | [39m3.038e-05[39m | [39m2.94     [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 4582/4582 [00:47<00:00, 97.08it/s, Loss=1.3913] 
Epoch 1 Evaluation: 100%|██████████| 959/959 [00:02<00:00, 370.94it/s]


Epoch 1, F1: 0.2398, Recall: 0.7786, Weighted Score: 0.6169


Epoch 2 Training: 100%|██████████| 4582/4582 [00:48<00:00, 95.42it/s, Loss=2.0474] 
Epoch 2 Evaluation: 100%|██████████| 959/959 [00:02<00:00, 332.97it/s]


Epoch 2, F1: 0.3015, Recall: 0.8228, Weighted Score: 0.6664


Epoch 3 Training: 100%|██████████| 4582/4582 [00:47<00:00, 96.70it/s, Loss=1.1985] 
Epoch 3 Evaluation: 100%|██████████| 959/959 [00:02<00:00, 331.34it/s]


Epoch 3, F1: 0.3399, Recall: 0.8065, Weighted Score: 0.6665


Epoch 4 Training: 100%|██████████| 4582/4582 [00:48<00:00, 94.93it/s, Loss=0.7209] 
Epoch 4 Evaluation: 100%|██████████| 959/959 [00:02<00:00, 348.91it/s]


Epoch 4, F1: 0.4425, Recall: 0.7483, Weighted Score: 0.6565


Epoch 5 Training: 100%|██████████| 4582/4582 [00:48<00:00, 95.35it/s, Loss=0.2454] 
Epoch 5 Evaluation: 100%|██████████| 959/959 [00:02<00:00, 339.97it/s]


Epoch 5, F1: 0.5116, Recall: 0.7179, Weighted Score: 0.6561


Epoch 6 Training: 100%|██████████| 4582/4582 [00:47<00:00, 96.32it/s, Loss=0.9272] 
Epoch 6 Evaluation: 100%|██████████| 959/959 [00:02<00:00, 347.62it/s]


Epoch 6, F1: 0.5427, Recall: 0.7110, Weighted Score: 0.6605
Early stopping at epoch 6
| [39m4        [39m | [39m0.6665   [39m | [39m116.9    [39m | [39m0.3299   [39m | [39m0.0007231[39m | [39m1.432    [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 4703/4703 [01:24<00:00, 55.91it/s, Loss=2.4949]
Epoch 1 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 228.11it/s]


Epoch 1, F1: 0.1835, Recall: 0.7459, Weighted Score: 0.5772


Epoch 2 Training: 100%|██████████| 4703/4703 [01:25<00:00, 54.69it/s, Loss=1.5470]
Epoch 2 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 218.74it/s]


Epoch 2, F1: 0.2305, Recall: 0.8182, Weighted Score: 0.6419


Epoch 3 Training: 100%|██████████| 4703/4703 [01:29<00:00, 52.35it/s, Loss=1.9409]
Epoch 3 Evaluation: 100%|██████████| 984/984 [00:03<00:00, 271.46it/s]


Epoch 3, F1: 0.2789, Recall: 0.8205, Weighted Score: 0.6580


Epoch 4 Training: 100%|██████████| 4703/4703 [01:26<00:00, 54.42it/s, Loss=0.7534]
Epoch 4 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 228.23it/s]


Epoch 4, F1: 0.3814, Recall: 0.7832, Weighted Score: 0.6627


Epoch 5 Training: 100%|██████████| 4703/4703 [01:30<00:00, 51.96it/s, Loss=0.9419]
Epoch 5 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 228.71it/s]


Epoch 5, F1: 0.3840, Recall: 0.8159, Weighted Score: 0.6863


Epoch 6 Training: 100%|██████████| 4703/4703 [01:30<00:00, 51.84it/s, Loss=0.6083]
Epoch 6 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 234.95it/s]


Epoch 6, F1: 0.5060, Recall: 0.7343, Weighted Score: 0.6658


Epoch 7 Training: 100%|██████████| 4703/4703 [01:31<00:00, 51.65it/s, Loss=1.9816]
Epoch 7 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 224.49it/s]


Epoch 7, F1: 0.5115, Recall: 0.8065, Weighted Score: 0.7180


Epoch 8 Training: 100%|██████████| 4703/4703 [01:30<00:00, 51.86it/s, Loss=0.3578]
Epoch 8 Evaluation: 100%|██████████| 984/984 [00:04<00:00, 221.28it/s]


Epoch 8, F1: 0.5686, Recall: 0.7483, Weighted Score: 0.6944
| [35m5        [39m | [35m0.718    [39m | [35m113.7    [39m | [35m0.5      [39m | [35m0.0007709[39m | [35m3.0      [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 4876/4876 [00:50<00:00, 96.67it/s, Loss=1.5741] 
Epoch 1 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 370.51it/s]


Epoch 1, F1: 0.1801, Recall: 0.7016, Weighted Score: 0.5452


Epoch 2 Training: 100%|██████████| 4876/4876 [00:49<00:00, 97.77it/s, Loss=1.5504] 
Epoch 2 Evaluation: 100%|██████████| 1020/1020 [00:03<00:00, 329.27it/s]


Epoch 2, F1: 0.2205, Recall: 0.7646, Weighted Score: 0.6013


Epoch 3 Training: 100%|██████████| 4876/4876 [00:49<00:00, 97.94it/s, Loss=1.4629] 
Epoch 3 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 344.86it/s]


Epoch 3, F1: 0.2471, Recall: 0.8112, Weighted Score: 0.6420


Epoch 4 Training: 100%|██████████| 4876/4876 [00:49<00:00, 97.55it/s, Loss=1.1993] 
Epoch 4 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 366.99it/s]


Epoch 4, F1: 0.3177, Recall: 0.7949, Weighted Score: 0.6517


Epoch 5 Training: 100%|██████████| 4876/4876 [00:49<00:00, 97.56it/s, Loss=2.2487] 
Epoch 5 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 342.22it/s]


Epoch 5, F1: 0.3597, Recall: 0.7949, Weighted Score: 0.6643


Epoch 6 Training: 100%|██████████| 4876/4876 [00:49<00:00, 98.75it/s, Loss=0.3944] 
Epoch 6 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 355.51it/s]


Epoch 6, F1: 0.4123, Recall: 0.7646, Weighted Score: 0.6589


Epoch 7 Training: 100%|██████████| 4876/4876 [00:49<00:00, 98.08it/s, Loss=0.7395] 
Epoch 7 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 347.90it/s]


Epoch 7, F1: 0.4250, Recall: 0.7692, Weighted Score: 0.6660


Epoch 8 Training: 100%|██████████| 4876/4876 [00:49<00:00, 98.11it/s, Loss=1.4588] 
Epoch 8 Evaluation: 100%|██████████| 1020/1020 [00:02<00:00, 357.03it/s]


Epoch 8, F1: 0.5040, Recall: 0.7343, Weighted Score: 0.6652
| [39m6        [39m | [39m0.666    [39m | [39m109.3    [39m | [39m0.5      [39m | [39m0.0003242[39m | [39m1.0      [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 6727/6727 [01:01<00:00, 109.78it/s, Loss=1.2153]
Epoch 1 Evaluation: 100%|██████████| 1407/1407 [00:02<00:00, 546.25it/s]


Epoch 1, F1: 0.2675, Recall: 0.7949, Weighted Score: 0.6366


Epoch 2 Training: 100%|██████████| 6727/6727 [00:58<00:00, 115.91it/s, Loss=1.5564]
Epoch 2 Evaluation: 100%|██████████| 1407/1407 [00:03<00:00, 435.04it/s]


Epoch 2, F1: 0.4111, Recall: 0.7576, Weighted Score: 0.6536


Epoch 3 Training: 100%|██████████| 6727/6727 [01:01<00:00, 109.31it/s, Loss=0.2046]
Epoch 3 Evaluation: 100%|██████████| 1407/1407 [00:02<00:00, 503.82it/s]


Epoch 3, F1: 0.4724, Recall: 0.6993, Weighted Score: 0.6312


Epoch 4 Training: 100%|██████████| 6727/6727 [01:03<00:00, 106.30it/s, Loss=0.3808]
Epoch 4 Evaluation: 100%|██████████| 1407/1407 [00:03<00:00, 406.29it/s]


Epoch 4, F1: 0.5225, Recall: 0.7040, Weighted Score: 0.6495


Epoch 5 Training: 100%|██████████| 6727/6727 [01:06<00:00, 100.92it/s, Loss=0.4488]
Epoch 5 Evaluation: 100%|██████████| 1407/1407 [00:03<00:00, 397.03it/s]


Epoch 5, F1: 0.5543, Recall: 0.7436, Weighted Score: 0.6868


Epoch 6 Training: 100%|██████████| 6727/6727 [01:04<00:00, 103.63it/s, Loss=0.1680]
Epoch 6 Evaluation: 100%|██████████| 1407/1407 [00:02<00:00, 471.22it/s]


Epoch 6, F1: 0.6194, Recall: 0.6713, Weighted Score: 0.6557


Epoch 7 Training: 100%|██████████| 6727/6727 [01:03<00:00, 106.38it/s, Loss=0.1498]
Epoch 7 Evaluation: 100%|██████████| 1407/1407 [00:03<00:00, 448.59it/s]


Epoch 7, F1: 0.6132, Recall: 0.7133, Weighted Score: 0.6833


Epoch 8 Training: 100%|██████████| 6727/6727 [01:02<00:00, 107.01it/s, Loss=0.7288]
Epoch 8 Evaluation: 100%|██████████| 1407/1407 [00:03<00:00, 422.89it/s]


Epoch 8, F1: 0.6571, Recall: 0.6946, Weighted Score: 0.6834
Early stopping at epoch 8
| [39m7        [39m | [39m0.6868   [39m | [39m79.11    [39m | [39m0.263    [39m | [39m0.0009886[39m | [39m1.214    [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 6403/6403 [01:00<00:00, 106.43it/s, Loss=4.2635]
Epoch 1 Evaluation: 100%|██████████| 1340/1340 [00:02<00:00, 468.34it/s]


Epoch 1, F1: 0.1262, Recall: 0.7016, Weighted Score: 0.5290


Epoch 2 Training: 100%|██████████| 6403/6403 [01:00<00:00, 105.75it/s, Loss=2.8038]
Epoch 2 Evaluation: 100%|██████████| 1340/1340 [00:03<00:00, 419.01it/s]


Epoch 2, F1: 0.1745, Recall: 0.7203, Weighted Score: 0.5566


Epoch 3 Training: 100%|██████████| 6403/6403 [00:57<00:00, 112.32it/s, Loss=4.2301]
Epoch 3 Evaluation: 100%|██████████| 1340/1340 [00:03<00:00, 415.42it/s]


Epoch 3, F1: 0.2022, Recall: 0.7366, Weighted Score: 0.5763


Epoch 4 Training: 100%|██████████| 6403/6403 [01:00<00:00, 106.35it/s, Loss=1.5589]
Epoch 4 Evaluation: 100%|██████████| 1340/1340 [00:02<00:00, 466.02it/s]


Epoch 4, F1: 0.2166, Recall: 0.7319, Weighted Score: 0.5773


Epoch 5 Training: 100%|██████████| 6403/6403 [01:00<00:00, 105.97it/s, Loss=0.9885]
Epoch 5 Evaluation: 100%|██████████| 1340/1340 [00:03<00:00, 422.12it/s]


Epoch 5, F1: 0.2315, Recall: 0.7413, Weighted Score: 0.5883


Epoch 6 Training: 100%|██████████| 6403/6403 [00:59<00:00, 107.13it/s, Loss=1.9916]
Epoch 6 Evaluation: 100%|██████████| 1340/1340 [00:03<00:00, 412.90it/s]


Epoch 6, F1: 0.2335, Recall: 0.7692, Weighted Score: 0.6085


Epoch 7 Training: 100%|██████████| 6403/6403 [01:00<00:00, 106.44it/s, Loss=2.0091]
Epoch 7 Evaluation: 100%|██████████| 1340/1340 [00:03<00:00, 446.35it/s]


Epoch 7, F1: 0.2335, Recall: 0.7879, Weighted Score: 0.6216


Epoch 8 Training: 100%|██████████| 6403/6403 [01:00<00:00, 106.56it/s, Loss=1.3198]
Epoch 8 Evaluation: 100%|██████████| 1340/1340 [00:03<00:00, 415.78it/s]


Epoch 8, F1: 0.2717, Recall: 0.7762, Weighted Score: 0.6249
| [39m8        [39m | [39m0.6249   [39m | [39m83.93    [39m | [39m0.4312   [39m | [39m7.917e-05[39m | [39m1.727    [39m |
Adjusted 'nhead' to 2 to be divisible by 'd_model' 62


Epoch 1 Training: 100%|██████████| 4703/4703 [00:48<00:00, 96.90it/s, Loss=4.1780] 
Epoch 1 Evaluation: 100%|██████████| 984/984 [00:03<00:00, 322.30it/s]


Epoch 1, F1: 0.0998, Recall: 0.5851, Weighted Score: 0.4395


Epoch 2 Training: 100%|██████████| 4703/4703 [00:48<00:00, 97.22it/s, Loss=3.4397] 
Epoch 2 Evaluation: 100%|██████████| 984/984 [00:02<00:00, 378.38it/s]


Epoch 2, F1: 0.0881, Recall: 0.7016, Weighted Score: 0.5176


Epoch 3 Training: 100%|██████████| 4703/4703 [00:48<00:00, 96.07it/s, Loss=4.1063] 
Epoch 3 Evaluation: 100%|██████████| 984/984 [00:03<00:00, 318.74it/s]


Epoch 3, F1: 0.0905, Recall: 0.6946, Weighted Score: 0.5134


Epoch 4 Training: 100%|██████████| 4703/4703 [00:48<00:00, 96.53it/s, Loss=3.4642] 
Epoch 4 Evaluation: 100%|██████████| 984/984 [00:02<00:00, 345.66it/s]


Epoch 4, F1: 0.0936, Recall: 0.7133, Weighted Score: 0.5274


Epoch 5 Training: 100%|██████████| 4703/4703 [00:48<00:00, 96.60it/s, Loss=2.8267] 
Epoch 5 Evaluation: 100%|██████████| 984/984 [00:02<00:00, 363.29it/s]


Epoch 5, F1: 0.0978, Recall: 0.7016, Weighted Score: 0.5205


Epoch 6 Training: 100%|██████████| 4703/4703 [00:49<00:00, 95.27it/s, Loss=4.0880] 
Epoch 6 Evaluation: 100%|██████████| 984/984 [00:03<00:00, 323.12it/s]


Epoch 6, F1: 0.1012, Recall: 0.6876, Weighted Score: 0.5117


Epoch 7 Training: 100%|██████████| 4703/4703 [00:48<00:00, 96.03it/s, Loss=3.8331] 
Epoch 7 Evaluation: 100%|██████████| 984/984 [00:02<00:00, 337.99it/s]


Epoch 7, F1: 0.1022, Recall: 0.6853, Weighted Score: 0.5104
Early stopping at epoch 7
| [39m9        [39m | [39m0.5274   [39m | [39m113.8    [39m | [39m0.5      [39m | [39m1e-05    [39m | [39m1.0      [39m |
Best parameters: {'batch_size': 113.68655735125606, 'dropout': 0.5, 'learning_rate': 0.0007709479958358504, 'num_layers': 3.0}
Best score (average of F1 and Recall): 0.7180055716641082


## Final Model Training

In [15]:
# Train the final model with the best parameters
best_params = optimizer.max['params']
best_params['num_layers'] = int(best_params['num_layers'])
best_params['batch_size'] = int(best_params['batch_size'])

final_model = TransformerModel(
    num_numerical=num_numerical_features,
    embedding_sizes=embedding_sizes,
    nhead=4,
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout']
)

optimizer = AdamW(final_model.parameters(), lr=best_params['learning_rate'])
criterion = nn.BCELoss(weight=torch.tensor([15.0]))

train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)

print("Training final model with best parameters:")
for epoch in range(10):  # You can adjust the number of epochs
    final_model.train()
    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} Training")
    for X_cat_batch, X_num_batch, y_batch in train_pbar:
        optimizer.zero_grad()
        outputs = final_model(X_cat_batch, X_num_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_pbar.set_postfix({'Loss': f'{loss.item():.4f}'})




Adjusted 'nhead' to 2 to be divisible by 'd_model' 62
Training final model with best parameters:


Epoch 1 Training: 100%|██████████| 4703/4703 [01:29<00:00, 52.84it/s, Loss=3.4589]
Epoch 2 Training: 100%|██████████| 4703/4703 [01:31<00:00, 51.42it/s, Loss=1.4430]
Epoch 3 Training: 100%|██████████| 4703/4703 [01:31<00:00, 51.33it/s, Loss=1.8285]
Epoch 4 Training: 100%|██████████| 4703/4703 [01:14<00:00, 63.26it/s, Loss=0.7362]
Epoch 5 Training: 100%|██████████| 4703/4703 [01:15<00:00, 62.24it/s, Loss=0.9194]
Epoch 6 Training: 100%|██████████| 4703/4703 [01:16<00:00, 61.87it/s, Loss=0.3299]
Epoch 7 Training: 100%|██████████| 4703/4703 [01:16<00:00, 61.75it/s, Loss=0.4941]
Epoch 8 Training: 100%|██████████| 4703/4703 [01:16<00:00, 61.77it/s, Loss=0.5017]
Epoch 9 Training: 100%|██████████| 4703/4703 [01:20<00:00, 58.27it/s, Loss=0.5119]
Epoch 10 Training: 100%|██████████| 4703/4703 [01:20<00:00, 58.07it/s, Loss=0.2941]


## Final Evaluation

In [16]:
# Evaluate the final model
final_model.eval()
y_true = []
y_pred = []
test_pbar = tqdm(test_loader, desc="Final Evaluation")
with torch.no_grad():
    for X_cat, X_num, y in test_pbar:
        outputs = final_model(X_cat, X_num)
        predictions = (outputs > 0.5).float()
        y_true.extend(y.tolist())
        y_pred.extend(predictions.flatten().tolist())

f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
print(f"\nFinal Model Results:")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Average Score: {(f1 + recall) / 2:.4f}")

print("\nClassification Report:")
print(classification_report(y_true, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Final Evaluation:   0%|          | 0/984 [00:00<?, ?it/s]

Final Evaluation: 100%|██████████| 984/984 [00:04<00:00, 245.93it/s]



Final Model Results:
F1 Score: 0.5554
Recall: 0.7599
Average Score: 0.6576

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    110715
         1.0       0.44      0.76      0.56       429

    accuracy                           1.00    111144
   macro avg       0.72      0.88      0.78    111144
weighted avg       1.00      1.00      1.00    111144


Confusion Matrix:
[[110296    419]
 [   103    326]]
