# Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import json
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from scipy.stats import loguniform
from imblearn.under_sampling import RandomUnderSampler
import joblib

# Loading the processed data

In [2]:
# Directory holding the pre-split, pre-processed CSV files. It is defined once
# so that relocating the data (or running on another machine) means editing a
# single line instead of four.
DATA_DIR = '/Users/raya/Desktop/fraud-detection/european-dataset/data/processed'

# Feature tables are kept as DataFrames; the label files are reduced to their
# 'Class' column.
X_temp = pd.read_csv(f'{DATA_DIR}/X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')
y_temp = pd.read_csv(f'{DATA_DIR}/y_train.csv')['Class']
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv')['Class']

# Fail fast if features and labels are misaligned, rather than surfacing the
# problem later inside a long-running training cell.
assert len(X_temp) == len(y_temp), 'X_train / y_train row counts differ'
assert len(X_test) == len(y_test), 'X_test / y_test row counts differ'

In [3]:
# Carve a validation set (20% of the rows) out of the loaded training data.
# The split is stratified on the label and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    stratify=y_temp,
    random_state=42,
)

# Oversampling / Undersampling

In [4]:
# Oversampled variant of the training split, produced with a seeded SMOTE.
# Only X_train / y_train are resampled here.
smote = SMOTE(random_state=42)
smote_resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = smote_resampled

In [5]:
# Undersampled variant of the training split, produced with a seeded
# RandomUnderSampler. Only X_train / y_train are resampled here.
undersampler = RandomUnderSampler(random_state=42)
under_resampled = undersampler.fit_resample(X_train, y_train)
X_train_under, y_train_under = under_resampled


# Training

In [6]:
# Training-set variants to compare, as (label, features, target) triples.
# The label is also used to build the file name of each saved model.
datasets = [
    ('original', X_train, y_train),
    ('smote', X_train_smote, y_train_smote),
    ('undersampled', X_train_under, y_train_under),
]

# Base estimator handed to the hyper-parameter search: an RBF-kernel SVC with
# probability estimates switched on and a fixed seed.
svm = SVC(kernel='rbf', probability=True, random_state=42)

# Search space for the randomized search.
fraud_class_weights = [1, 10, 50, 100, 200]
param_dist = dict(
    # Regularization parameter, sampled log-uniformly over [1e-3, 1e3]
    C=loguniform(1e-3, 1e3),
    # Kernel coefficient, sampled log-uniformly over [1e-4, 1]
    gamma=loguniform(1e-4, 1e0),
    # Class 0 keeps weight 1; the fraud class (1) takes each candidate weight
    class_weight=[{0: 1, 1: weight} for weight in fraud_class_weights],
)

In [None]:
# One randomized hyper-parameter search, re-fitted for every training-set
# variant: 20 sampled configurations x 3 CV folds, ranked by F1, using all
# available cores.
# NOTE(review): the 'smote' and 'undersampled' variants are resampled before
# they reach this search, so their CV folds are scored on resampled rows. The
# CV F1 for those variants may not reflect the original class balance; confirm
# whether the sampler should instead live inside a pipeline.
random_search = RandomizedSearchCV(
    svm,
    param_dist,
    n_iter=20,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Fit the search on each variant and persist the refitted best estimator as
# 'svm_<variant>.joblib' in the current working directory.
for dataset_name, feature_set, target_set in datasets:
    print('training model with', dataset_name, 'dataset...')

    random_search.fit(feature_set, target_set)
    print(random_search.best_params_)

    best_model = random_search.best_estimator_
    model_name = 'svm_' + dataset_name + '.joblib'
    joblib.dump(best_model, model_name)
    print('model', model_name, 'saved.')

training model with original dataset...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Evaluate every saved model on the test set and collect fraud-class metrics.
results = []
for dataset_name, _, _ in datasets:
    # Reload the model written by the training cell for this variant
    model_name = 'svm_' + dataset_name + '.joblib'
    model = joblib.load(model_name)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Metrics for the fraud class (Class 1). Recall is reported next to
    # precision because F1 alone does not show which of the two is low.
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    results.append({
        'dataset': dataset_name,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })
    print(f"\nMetrics for dataset: {dataset_name}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# Side-by-side comparison table, one row per training-set variant
results_df = pd.DataFrame(results)
results_df