Anti-Money Laundering (AML) Detection System

In [None]:
!pip install xgboost lightgbm catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Read the raw transactions table; the path is resolved relative to the
# notebook's working directory.
file_path = "HI-Small_Trans.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each column so the remaining indicator columns are not redundant.
# Note: this rebinds `data`, so the cell cannot be re-run without reloading.
categorical_columns = [
    'Payment Format',
    'Receiving Currency',
    'Payment Currency',
]
data = pd.get_dummies(data=data, columns=categorical_columns, drop_first=True)

In [None]:
# Target vector is the 'Is Laundering' column; the feature matrix is every
# remaining column except 'Timestamp', 'Account' and 'Account.1'.
non_feature_columns = ['Is Laundering', 'Timestamp', 'Account', 'Account.1']
y = data['Is Laundering']
X = data.drop(columns=non_feature_columns)

In [None]:
# 80/20 split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# XGBoost classifier for the imbalanced binary target.
# scale_pos_weight is the negative/positive ratio of the training labels, so
# the rare positive class is up-weighted accordingly. Series.sum() is used
# instead of the builtin sum(), which would iterate the label Series
# element by element in Python.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and emits a "Parameters ... are not used" warning when it is set.
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42
)

In [None]:
# Train XGBoost on the training split only.
xgb_model.fit(X=X_train, y=y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Score the test split with XGBoost.
# y_prob: column 1 of predict_proba (the score for class 1), used for ROC AUC.
# y_pred: hard class labels from predict, used for the classification report.
y_prob = xgb_model.predict_proba(X_test)[:, 1]
y_pred = xgb_model.predict(X_test)

In [None]:
# Evaluate XGBoost: per-class precision/recall from the hard labels, then
# ROC AUC from the class-1 scores.
xgb_report = classification_report(y_test, y_pred)
print("XGBoost Classification Report:\n", xgb_report)
xgb_auc = roc_auc_score(y_test, y_prob)
print("XGBoost ROC AUC Score:", xgb_auc)

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95   1014634
           1       0.01      0.87      0.02      1035

    accuracy                           0.91   1015669
   macro avg       0.50      0.89      0.49   1015669
weighted avg       1.00      0.91      0.95   1015669

XGBoost ROC AUC Score: 0.9673586703199866


In [None]:
# LightGBM classifier for the imbalanced binary target.
# The positive class is up-weighted by the negative/positive ratio of the
# training labels. The positive count is computed once with the vectorized
# Series.sum() rather than iterating the label Series twice with builtin sum().
# NOTE(review): the recorded run of this configuration scores far below the
# other two models (ROC AUC ~0.64); the defaults may need tuning with a weight
# this large -- confirm before relying on this model.
lgb_positive_count = y_train.sum()
lgb_model = lgb.LGBMClassifier(
    scale_pos_weight=(len(y_train) - lgb_positive_count) / lgb_positive_count,  # Handle imbalance
    objective='binary',
    metric='auc',
    random_state=42
)


In [None]:
# Train LightGBM on the training split only.
lgb_model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 4142, number of negative: 4058534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.256247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1088
[LightGBM] [Info] Number of data points in the train set: 4062676, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001020 -> initscore=-6.887398
[LightGBM] [Info] Start training from score -6.887398


In [None]:
# Score the test split with LightGBM (this overwrites the y_pred / y_prob
# produced for the previous model).
# y_prob: column 1 of predict_proba (the score for class 1), used for ROC AUC.
# y_pred: hard class labels from predict, used for the classification report.
y_prob = lgb_model.predict_proba(X_test)[:, 1]
y_pred = lgb_model.predict(X_test)

In [None]:
# Evaluate LightGBM: per-class precision/recall from the hard labels, then
# ROC AUC from the class-1 scores.
lgb_report = classification_report(y_test, y_pred)
print("LightGBM Classification Report:\n", lgb_report)
lgb_auc = roc_auc_score(y_test, y_prob)
print("LightGBM ROC AUC Score:", lgb_auc)

LightGBM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.55      0.71   1014634
           1       0.00      0.73      0.00      1035

    accuracy                           0.55   1015669
   macro avg       0.50      0.64      0.36   1015669
weighted avg       1.00      0.55      0.71   1015669

LightGBM ROC AUC Score: 0.6386050422179792


In [None]:
# CatBoost classifier for the imbalanced binary target.
# The positive class is up-weighted by the negative/positive ratio of the
# training labels. The positive count is computed once with the vectorized
# Series.sum() rather than iterating the label Series twice with builtin sum().
# verbose=200 prints a progress line every 200 iterations.
cat_positive_count = y_train.sum()
cat_model = CatBoostClassifier(
    scale_pos_weight=(len(y_train) - cat_positive_count) / cat_positive_count,  # Handle imbalance
    loss_function='Logloss',
    eval_metric='AUC',
    random_state=42,
    verbose=200
)

In [None]:
# Early stopping needs its own validation data: monitoring (X_test, y_test)
# here would pick the best iteration on the test set and optimistically bias
# every test metric reported afterwards. Carve a stratified validation split
# out of the training data instead, and keep the test set untouched until the
# final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
cat_model.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=50)

Learning rate set to 0.246766
0:	test: 0.9226456	best: 0.9226456 (0)	total: 1.12s	remaining: 18m 39s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9685506144
bestIteration = 76

Shrink model to first 77 iterations.


<catboost.core.CatBoostClassifier at 0x79eb84231610>

In [None]:
# Score the test split with CatBoost (this overwrites the y_pred / y_prob
# produced for the previous model).
# y_prob: column 1 of predict_proba (the score for class 1), used for ROC AUC.
# y_pred: hard class labels from predict, used for the classification report.
y_prob = cat_model.predict_proba(X_test)[:, 1]
y_pred = cat_model.predict(X_test)

In [None]:
# Evaluate CatBoost: per-class precision/recall from the hard labels, then
# ROC AUC from the class-1 scores.
cat_report = classification_report(y_test, y_pred)
print("CatBoost Classification Report:\n", cat_report)
cat_auc = roc_auc_score(y_test, y_prob)
print("CatBoost ROC AUC Score:", cat_auc)

CatBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.87      0.93   1014634
           1       0.01      0.94      0.01      1035

    accuracy                           0.87   1015669
   macro avg       0.50      0.90      0.47   1015669
weighted avg       1.00      0.87      0.93   1015669

CatBoost ROC AUC Score: 0.9685506143673197
