# Credit Card Fraud Detection - Data Modeling

In this notebook, we will build machine learning models to detect fraudulent transactions in the credit card dataset. The goal is to evaluate several models and choose the one with the best performance in identifying fraud, given the highly imbalanced nature of the dataset.


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the preprocessed dataset
data = pd.read_csv('../data/raw/creditcard.csv')

# Split features and labels
X = data.drop('Class', axis=1)
y = data['Class']

# Scale the 'Amount' and 'Time' features
scaler = StandardScaler()
X[['scaled_amount', 'scaled_time']] = scaler.fit_transform(X[['Amount', 'Time']])
X = X.drop(['Amount', 'Time'], axis=1)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [4]:
X_resampled

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,scaled_amount,scaled_time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,-1.996583
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,-0.342475,-1.996583
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,-1.996562
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,-1.996562
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,-0.073403,-1.996541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568625,-0.125056,2.797045,-6.436962,3.250654,-1.673880,-2.737504,-2.301424,0.969237,-1.900690,-4.937166,...,0.647769,0.129735,0.201975,0.005538,-0.168130,0.569294,0.156908,-0.102253,0.051029,1.084762
568626,-3.352917,0.753401,-1.698278,0.863169,-1.186314,-0.406322,-1.652498,0.020940,-0.018209,-2.696694,...,0.342796,0.454379,-0.130009,-0.499223,-0.042935,0.987288,-1.389017,0.750979,0.052576,-0.637311
568627,0.341900,1.691419,-1.741040,3.690779,-0.817104,-0.767287,-2.054896,0.551213,-2.026584,-2.605154,...,0.285721,-0.473509,0.007435,-0.078866,0.455098,-0.052333,0.542619,0.293170,-0.336907,-1.301187
568628,-1.269532,4.185133,-6.113043,5.037080,1.080631,-2.122858,-1.298991,0.575999,-3.684205,-6.215259,...,0.330215,-0.863163,-0.292751,-0.297741,0.296018,-0.013180,0.780460,0.397597,-0.126700,1.060474


In [5]:
y_resampled

0         0
1         0
2         0
3         0
4         0
         ..
568625    1
568626    1
568627    1
568628    1
568629    1
Name: Class, Length: 568630, dtype: int64

In [9]:
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Start an MLflow run
with mlflow.start_run():

    # Train Logistic Regression model
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    
    # Make predictions
    y_pred = log_reg.predict(X_test)
    
    # Evaluate the model
    acc = accuracy_score(y_test, y_pred)
    clf_report = classification_report(y_test, y_pred, output_dict=True)

    # Log model parameters
    mlflow.log_param("model_type", "LogisticRegression")
    
    # Log metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", clf_report['1']['precision'])
    mlflow.log_metric("recall", clf_report['1']['recall'])
    mlflow.log_metric("f1_score", clf_report['1']['f1-score'])
    
    # Log the model itself
    mlflow.sklearn.log_model(log_reg, "logistic_regression_model")

    # Print the classification report
    print(classification_report(y_test, y_pred))

    # End MLflow run
mlflow.end_run()



              precision    recall  f1-score   support

           0       0.93      0.98      0.95     56750
           1       0.97      0.92      0.95     56976

    accuracy                           0.95    113726
   macro avg       0.95      0.95      0.95    113726
weighted avg       0.95      0.95      0.95    113726



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Start an MLflow run for Random Forest
with mlflow.start_run():

    # Train Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_rf = rf_model.predict(X_test)
    
    # Evaluate the model
    acc_rf = accuracy_score(y_test, y_pred_rf)
    clf_report_rf = classification_report(y_test, y_pred_rf, output_dict=True)

    # Log model parameters
    mlflow.log_param("model_type", "RandomForest")
    
    # Log metrics
    mlflow.log_metric("accuracy", acc_rf)
    mlflow.log_metric("precision", clf_report_rf['1']['precision'])
    mlflow.log_metric("recall", clf_report_rf['1']['recall'])
    mlflow.log_metric("f1_score", clf_report_rf['1']['f1-score'])
    
    # Log the model itself
    mlflow.sklearn.log_model(rf_model, "random_forest_model")

    # Print the classification report
    print(classification_report(y_test, y_pred_rf))

    # End MLflow run
mlflow.end_run()



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56750
           1       1.00      1.00      1.00     56976

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726



In [8]:
import xgboost as xgb

# Start an MLflow run for XGBoost
with mlflow.start_run():

    # Train XGBoost model
    xgb_model = xgb.XGBClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_xgb = xgb_model.predict(X_test)
    
    # Evaluate the model
    acc_xgb = accuracy_score(y_test, y_pred_xgb)
    clf_report_xgb = classification_report(y_test, y_pred_xgb, output_dict=True)

    # Log model parameters
    mlflow.log_param("model_type", "XGBoost")
    
    # Log metrics
    mlflow.log_metric("accuracy", acc_xgb)
    mlflow.log_metric("precision", clf_report_xgb['1']['precision'])
    mlflow.log_metric("recall", clf_report_xgb['1']['recall'])
    mlflow.log_metric("f1_score", clf_report_xgb['1']['f1-score'])
    
    # Log the model itself
    mlflow.sklearn.log_model(xgb_model, "xgboost_model")

    # Print the classification report
    print(classification_report(y_test, y_pred_xgb))

    # End MLflow run
mlflow.end_run()



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56750
           1       1.00      1.00      1.00     56976

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726

