# XGBoost

In [49]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"phuongkhanh21","key":"0fd6eb70e1509aad441adeb76f0dfff2"}'}

In [50]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [51]:
#!kaggle datasets download -d ealaxi/paysim1
!kaggle datasets download -d berkanoztas/synthetic-transaction-monitoring-dataset-aml

Dataset URL: https://www.kaggle.com/datasets/berkanoztas/synthetic-transaction-monitoring-dataset-aml
License(s): CC-BY-NC-SA-4.0
synthetic-transaction-monitoring-dataset-aml.zip: Skipping, found more recently modified local copy (use --force to force download)


In [52]:
import zipfile

with zipfile.ZipFile("synthetic-transaction-monitoring-dataset-aml.zip", 'r') as zip_ref:
    zip_ref.extractall("synthetic_transaction_data")


In [53]:
#! [ -e /content ] && pip install -Uqq fastbook
#import fastbook
#fastbook.setup_book()

In [54]:
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split

In [55]:
import pandas as pd
os.listdir("synthetic_transaction_data")

['SAML-D.csv']

In [56]:
df = pd.read_csv("synthetic_transaction_data/SAML-D.csv")
df.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,2022-10-07,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,2022-10-07,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,2022-10-07,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,2022-10-07,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504852 entries, 0 to 9504851
Data columns (total 12 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Time                    object 
 1   Date                    object 
 2   Sender_account          int64  
 3   Receiver_account        int64  
 4   Amount                  float64
 5   Payment_currency        object 
 6   Received_currency       object 
 7   Sender_bank_location    object 
 8   Receiver_bank_location  object 
 9   Payment_type            object 
 10  Is_laundering           int64  
 11  Laundering_type         object 
dtypes: float64(1), int64(3), object(8)
memory usage: 870.2+ MB


In [58]:
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504852 entries, 0 to 9504851
Data columns (total 12 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   Time                    object        
 1   Date                    datetime64[ns]
 2   Sender_account          int64         
 3   Receiver_account        int64         
 4   Amount                  float64       
 5   Payment_currency        object        
 6   Received_currency       object        
 7   Sender_bank_location    object        
 8   Receiver_bank_location  object        
 9   Payment_type            object        
 10  Is_laundering           int64         
 11  Laundering_type         object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory usage: 870.2+ MB


In [59]:
#df.isFraud.value_counts()
df.Is_laundering.value_counts()

Unnamed: 0_level_0,count
Is_laundering,Unnamed: 1_level_1
0,9494979
1,9873


In [60]:
df_train, df_test = train_test_split(df.copy(),
                                    random_state = 7987,
                                    shuffle = True,
                                    test_size = 0.2,
                                     stratify=df.Is_laundering)

In [61]:
df_train = df_train.drop(columns = ["Laundering_type"])

# Feature Engineering

From the paper "Explainable Feature Engineering for
Multi-class Money Laundering Classification", I want to include the following features:

*   fanin_30d: month‑on‑month count of unique incoming sources; the single most important feature.
*   fan_in_out_ratio: ratio of inbound to outbound counterparties over 30 days.
*   fanin_intensity_ratio: fanin_30d normalized by daily received transactions (inbound concentration).
*   amount_dispersion_std: standard deviation of amounts per sender (amount volatility).
*   sent_to_received_ratio_monthly: sum received / sum sent in a month (balances that trend toward 1 are suspicious).








fanin_30d: Number of unique sender accounts that sent money to a given receiver in the past 30 days.

In [62]:
df_train['fanin_30d'] = df_train.groupby(['Receiver_account', pd.Grouper(key='Date', freq='30D')])['Sender_account'].transform('nunique')

In [63]:
df_train.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1


In [64]:
df_train['fanout_30d'] = df_train.groupby(['Sender_account', pd.Grouper(key='Date', freq='30D')])['Receiver_account'].transform('nunique')
df_train.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8


In [65]:
df_train['fan_in_out_ratio'] = df_train['fanin_30d']/df_train['fanout_30d']
df_train.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125


In [66]:
df_train['daily_receive'] = df_train.groupby(['Receiver_account', pd.Grouper(key='Date', freq='1D')])['Sender_account'].transform('nunique')
df_train.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio,daily_receive
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0,16
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923,1
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0,15
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0,17
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125,1


In [67]:
df_train['fanin_intensity_ratio'] = df_train['fanin_30d']/df_train['daily_receive']
df_train.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio,daily_receive,fanin_intensity_ratio
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0,16,1.0625
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923,1,1.0
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0,15,1.4
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0,17,1.058824
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125,1,1.0


In [68]:
df_train['amount_dispersion_std'] = df_train.groupby(['Sender_account'])['Amount'].transform(np.std)
df_train.head(5)

  df_train['amount_dispersion_std'] = df_train.groupby(['Sender_account'])['Amount'].transform(np.std)


Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio,daily_receive,fanin_intensity_ratio,amount_dispersion_std
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0,16,1.0625,86.664544
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923,1,1.0,21313.723395
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0,15,1.4,44.076644
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0,17,1.058824,105.876319
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125,1,1.0,10384.035302


In [69]:
df_train['monthly_receive'] = df_train.groupby(['Receiver_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)
df_train.head(5)

  df_train['monthly_receive'] = df_train.groupby(['Receiver_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)


Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio,daily_receive,fanin_intensity_ratio,amount_dispersion_std,monthly_receive
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0,16,1.0625,86.664544,1860911.82
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923,1,1.0,21313.723395,486.41
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0,15,1.4,44.076644,1497376.43
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0,17,1.058824,105.876319,2104357.4
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125,1,1.0,10384.035302,9253.62


In [70]:
df_train['monthly_send'] = df_train.groupby(['Sender_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)
df_train.head(5)

  df_train['monthly_send'] = df_train.groupby(['Sender_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)


Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio,daily_receive,fanin_intensity_ratio,amount_dispersion_std,monthly_receive,monthly_send
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0,16,1.0625,86.664544,1860911.82,91743.16
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923,1,1.0,21313.723395,486.41,230491.9
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0,15,1.4,44.076644,1497376.43,89899.75
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0,17,1.058824,105.876319,2104357.4,130968.86
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125,1,1.0,10384.035302,9253.62,114707.84


In [71]:
df_train['sent_to_received_ratio_monthly'] = df_train['monthly_receive']/df_train['monthly_send']
df_train.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,fanin_30d,fanout_30d,fan_in_out_ratio,daily_receive,fanin_intensity_ratio,amount_dispersion_std,monthly_receive,monthly_send,sent_to_received_ratio_monthly
8616887,22:11:59,2023-07-23,4952059985,5332988211,15319.88,UK pounds,UK pounds,UK,UK,Cheque,0,17,1,17.0,16,1.0625,86.664544,1860911.82,91743.16,20.28393
2249901,06:17:22,2022-12-23,3500851133,8272309842,151.62,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,1,13,0.076923,1,1.0,21313.723395,486.41,230491.9,0.00211
4167658,10:49:32,2023-02-24,569180581,6476703638,11308.99,UK pounds,UK pounds,UK,UK,ACH,0,21,1,21.0,15,1.4,44.076644,1497376.43,89899.75,16.656069
3459895,11:07:12,2023-02-02,4581676574,3161644617,14699.69,UK pounds,UK pounds,UK,UK,Credit card,0,18,1,18.0,17,1.058824,105.876319,2104357.4,130968.86,16.067616
7335943,21:16:33,2023-06-10,9229366448,8584204471,9253.62,UK pounds,UK pounds,UK,UK,Credit card,0,1,8,0.125,1,1.0,10384.035302,9253.62,114707.84,0.080671


In [72]:
'''
def feature_engineer(df):

  # fanin_30d: Number of unique sender accounts that sent money to a given receiver in the past 30 days.
  df['fanin_30d'] = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='30D')])['Sender_account'].nunique()

  # fan_in_out_ratio: For each account, the number of unique inbound counterparties divided by the number of unique outbound counterparties in a 30-day window.
  in_bound = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='30D')])['Sender_account'].nunique()
  out_bound = df.groupby(['Sender_account', pd.Grouper(key='Date', freq='30D')])['Receiver_account'].nunique()
  df['fan_in_out_ratio'] = in_bound/out_bound

  # fanin_intensity_ratio: Measures concentration—how many unique senders per daily inbound transaction.
  daily_receive = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='1D')])['Sender_account'].nunique()
  df['fanin_intensity_ratio'] = in_bound/daily_receive

  # amount_dispersion_std: Volatility of transaction amounts sent by each sender.
  df['amount_dispersion_std'] = df.groupby(['Sender_account'])['Amount'].std()

  # sent_to_received_ratio_monthly: For each account, total received amount divided by total sent amount over a monthly window.
  received = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='1M')])['Amount'].sum()
  sent = df.groupby(['Sender_account', pd.Grouper(key='Date', freq='1M')])['Amount'].sum()
  df['sent_to_received_ratio_monthly'] = received/sent

  return df
  '''


"\ndef feature_engineer(df):\n\n  # fanin_30d: Number of unique sender accounts that sent money to a given receiver in the past 30 days.\n  df['fanin_30d'] = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='30D')])['Sender_account'].nunique()\n\n  # fan_in_out_ratio: For each account, the number of unique inbound counterparties divided by the number of unique outbound counterparties in a 30-day window.\n  in_bound = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='30D')])['Sender_account'].nunique()\n  out_bound = df.groupby(['Sender_account', pd.Grouper(key='Date', freq='30D')])['Receiver_account'].nunique()\n  df['fan_in_out_ratio'] = in_bound/out_bound\n\n  # fanin_intensity_ratio: Measures concentration—how many unique senders per daily inbound transaction.\n  daily_receive = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='1D')])['Sender_account'].nunique()\n  df['fanin_intensity_ratio'] = in_bound/daily_receive\n\n  # amount_dispersion_std: Vol

# Applying XG Boost

In [73]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [74]:
features = ['fanin_30d', 'fan_in_out_ratio', 'fanin_intensity_ratio', 'amount_dispersion_std', 'sent_to_received_ratio_monthly']

In [75]:
data_dmatrix = xgb.DMatrix(data=df_train[features],label=df_train.Is_laundering)
params = {'objective':'binary:logistic','eval_metric':'logloss',
          'eta':0.01,
          'subsample':0.1}
xgb_cv = xgb.cv(dtrain=data_dmatrix, params=params, nfold=5, metrics = 'logloss',seed=42)

In [76]:
xgb_cv

Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.007391,4.3e-05,0.007393,0.000132
1,0.006924,3.3e-05,0.006927,0.000129
2,0.006487,2.6e-05,0.00649,0.000118
3,0.006221,2.8e-05,0.006224,0.000108
4,0.006039,3e-05,0.006044,0.000102
5,0.005892,2.8e-05,0.005897,0.000101
6,0.005767,2.9e-05,0.005773,9.8e-05
7,0.005653,2.8e-05,0.005659,9.7e-05
8,0.005553,2.8e-05,0.005559,9.4e-05
9,0.005464,2.8e-05,0.00547,9.2e-05


In [77]:
grid = pd.DataFrame({'eta':[0.01,0.05,0.1]*2,
                  'subsample':np.repeat([0.1,0.3],3)})

In [78]:
def fit(x):
    params = {'objective':'binary:logistic',
              'eval_metric':'logloss',
              'eta':x[0],
              'subsample':x[1]}
    xgb_cv = xgb.cv(dtrain=data_dmatrix, params=params,
    nfold=5, metrics = 'logloss',seed=42)
    return xgb_cv[-1:].values[0]

grid[['train-logloss-mean','train-logloss-std',
'test-logloss-mean','test-logloss-std']] = grid.apply(fit,axis=1,result_type='expand')

  'eta':x[0],
  'subsample':x[1]}


In [79]:
grid

Unnamed: 0,eta,subsample,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.01,0.1,0.005464,2.8e-05,0.00547,9.2e-05
1,0.05,0.1,0.004183,3e-05,0.004196,7.3e-05
2,0.1,0.1,0.003862,0.000417,0.003892,0.000444
3,0.01,0.3,0.005225,2.5e-05,0.005231,9.3e-05
4,0.05,0.3,0.003783,3.4e-05,0.003799,6.8e-05
5,0.1,0.3,0.003864,0.000257,0.003952,0.000264


We can see that eta = 0.10 and subsample = 0.3 yields the best result. We will apply the model using these parameters.

In [80]:
xgb_class = xgb.XGBClassifier(objective='binary:logistic',
                           eval_metric = 'logloss',
                           eta = 0.1,
                           subsample = 0.3)

xgb_class.fit(df_train[features], df_train.Is_laundering)

In [81]:
def feature_engineer(df):

  # dropping column Laundering_type
  df = df.drop(columns = ["Laundering_type"])

  # fanin_30d: Number of unique sender accounts that sent money to a given receiver in the past 30 days.
  df['fanin_30d'] = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='30D')])['Sender_account'].transform('nunique')

  # fan_in_out_ratio: For each account, the number of unique inbound counterparties divided by the number of unique outbound counterparties in a 30-day window.
  df['fanout_30d'] = df.groupby(['Sender_account', pd.Grouper(key='Date', freq='30D')])['Receiver_account'].transform('nunique')
  df['fan_in_out_ratio'] = df['fanin_30d']/df['fanout_30d']

  # fanin_intensity_ratio: Measures concentration—how many unique senders per daily inbound transaction.
  df['daily_receive'] = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='1D')])['Sender_account'].transform('nunique')
  df['fanin_intensity_ratio'] = df['fanin_30d']/df['daily_receive']

  # amount_dispersion_std: Volatility of transaction amounts sent by each sender.
  df['amount_dispersion_std'] = df.groupby(['Sender_account'])['Amount'].transform(np.std)

  # sent_to_received_ratio_monthly: For each account, total received amount divided by total sent amount over a monthly window.
  df['monthly_receive'] = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)
  df['monthly_send'] = df.groupby(['Sender_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)
  df['sent_to_received_ratio_monthly'] = df['monthly_receive']/df['monthly_send']

  return df


In [82]:
df_test.head(5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
3420681,08:58:45,2023-02-01,5681621631,673472662,6587.7,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Fan_Out
6466567,13:27:22,2023-05-12,8884425196,4233653992,12279.23,Naira,UK pounds,Nigeria,UK,Cross-border,0,Normal_Fan_In
6832606,21:16:28,2023-05-24,9444716063,4965338487,6279.17,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Fan_Out
2281614,00:38:04,2022-12-24,7816913621,8783956972,174.8,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,Normal_Cash_Withdrawal
7723521,17:51:47,2023-06-23,7545250325,678613457,4063.74,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


In [85]:
df_tes = feature_engineer(df_test)
df_test.head(5)

  df['amount_dispersion_std'] = df.groupby(['Sender_account'])['Amount'].transform(np.std)
  df['monthly_receive'] = df.groupby(['Receiver_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)
  df['monthly_send'] = df.groupby(['Sender_account', pd.Grouper(key='Date', freq='ME')])['Amount'].transform(sum)


Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
3420681,08:58:45,2023-02-01,5681621631,673472662,6587.7,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Fan_Out
6466567,13:27:22,2023-05-12,8884425196,4233653992,12279.23,Naira,UK pounds,Nigeria,UK,Cross-border,0,Normal_Fan_In
6832606,21:16:28,2023-05-24,9444716063,4965338487,6279.17,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Fan_Out
2281614,00:38:04,2022-12-24,7816913621,8783956972,174.8,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,Normal_Cash_Withdrawal
7723521,17:51:47,2023-06-23,7545250325,678613457,4063.74,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


In [84]:
pred = xgb_class.predict(df_test[features])

KeyError: "None of [Index(['fanin_30d', 'fan_in_out_ratio', 'fanin_intensity_ratio',\n       'amount_dispersion_std', 'sent_to_received_ratio_monthly'],\n      dtype='object')] are in the [columns]"

In [None]:
cm = confusion_matrix(df_test.Is_laundering, pred)
cm