In [139]:
"""
Advanced High-Risk Account Detection for Anti-Money Laundering
Uses Graph Neural Networks and Behavioral Feature Engineering
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve, auc, roc_curve
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool, SAGEConv, global_mean_pool
from torch_geometric.data import NeighborSampler
from torch_geometric.loader import NeighborLoader
from torch_geometric.data import Data, DataLoader as PyGDataLoader
import networkx as nx
from collections import defaultdict
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

In [140]:
from tqdm import tqdm
import sys
from pathlib import Path
# Put the parent of the current working directory on the module search path so
# that the `config` import below can be resolved from there.
# NOTE(review): presumably config.py lives one level above the notebook — confirm.
sys.path.append(str(Path.cwd().parent))  # Adjust as needed
from config import DATAPATH, SAMPLE_DATAPATH

In [141]:
# Load the entire dataset from the location configured as DATAPATH
# (all rows are read into memory; no sampling is applied here).
df = pd.read_csv(DATAPATH)

# Optional filter by date range (disabled); compares the raw `Date` strings.
# df = df[df['Date'] <= '2022-12-31']

In [142]:
# Show the last rows of the loaded table as a quick sanity check.
df.tail()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
9504847,10:57:01,2023-08-23,2453933570,519744068,2247.25,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Small_Fan_Out
9504848,10:57:06,2023-08-23,9805510177,5416607878,927.18,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Small_Fan_Out
9504849,10:57:06,2023-08-23,7282330957,2995527149,1455.14,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Small_Fan_Out
9504850,10:57:11,2023-08-23,940337377,4812815165,25995.7,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
9504851,10:57:12,2023-08-23,105185176,6824994831,9586.08,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_Out


In [130]:
# Add and delete columns
# Combine the separate `Date` and `Time` strings into one `datetime` column placed
# first in the frame. The insert is guarded so that re-running this cell is a no-op:
# after the first run `Date`/`Time` no longer exist and `datetime` already does,
# so an unguarded second run would fail.
if 'datetime' not in df.columns:
    df.insert(0, 'datetime', pd.to_datetime(df["Date"] + ' ' + df["Time"], format='%Y-%m-%d %H:%M:%S'))

# df.drop(columns=['Laundering_type'], inplace=True)
# errors='ignore' keeps the drop safe when the source columns are already gone.
df.drop(columns=['Time', 'Date'], inplace=True, errors='ignore')

print("\nDate range:")
print(f"From: {df['datetime'].min()}")
print(f"To: {df['datetime'].max()}")


Date range:
From: 2022-10-07 10:35:19
To: 2023-08-23 10:57:12


#### Account-level stats (7-day window)

1. Number of sent transactions (sent_txns)
2. Number of received transactions (recv_txns)
3. Ratio of sent to received transactions (sent_recv_ratio)
4. Median sent transaction amount (med_sent_amt)
5. Median received transaction amount (med_recv_amt)
6. Standard deviation of sent transaction amounts (std_sent_amt)
7. Standard deviation of received transaction amounts (std_recv_amt)
8. Total transaction amount, sent plus received (total_txn_amt)
9. Number of unique counterparties interacted with (unique_counterparties)
10. Number of unique accounts sent to (fanout_7d)
11. Number of unique accounts received from (fanin_7d)
12. Fan-out to fan-in ratio (fanout_fanin_ratio)
13. Maximum transaction count to a single counterparty (max_txn_to_counterparty)
14. Maximum transaction amount to a single counterparty (max_amt_to_counterparty)
15. Maximum transaction count in a single day (max_txn_count_1d)
16. Maximum transaction amount in a single day (max_txn_amt_1d)

In [134]:
# Calendar bounds used to lay out the weekly windows: the date of the earliest
# transaction, and the date one day after the latest transaction.
earliest_ts = df['datetime'].min()
latest_ts = df['datetime'].max()

start_date = earliest_ts.date()
one_day = pd.Timedelta(days=1)
end_date = (latest_ts + one_day).date()

print(f"Data spans from {start_date} to {end_date} ({(end_date - start_date).days} days)")

Data spans from 2022-10-07 to 2023-08-24 (321 days)


In [135]:
# Build per-account features for consecutive 7-day windows.
#
# For every window, each account that sent or received at least one transaction
# gets one row with counts, fan-in/fan-out, median/std amounts, the largest
# per-counterparty count/amount in each direction, two ratios and the total amount.
# Aggregates that do not exist for an account (e.g. it only received) are filled with 0;
# ratios use -1 as a sentinel when the denominator is 0.
#
# NOTE(review): unique_counterparties, max_txn_count_1d and max_txn_amt_1d from the
# feature list above are not computed in this cell.

# Per-window frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies the growing result on every iteration.
window_frames = []

for current_window in pd.date_range(start=start_date, end=end_date, freq='7D'):
    window_start = current_window
    window_end = current_window + pd.Timedelta(days=7)
    print(f"Processing window: {window_start} to {window_end}")
    # Get transactions for this window (start inclusive, end exclusive)
    window_txns = df[(df['datetime'] >= window_start) & (df['datetime'] < window_end)]

    # date_range includes its end point, so a trailing window can contain no
    # transactions at all; there is nothing to aggregate in that case.
    if window_txns.empty:
        continue

    # Active accounts in this window
    active_accounts = list(set(window_txns['Sender_account'].unique()).union(set(window_txns['Receiver_account'].unique())))

    # Per-sender aggregates
    sent_txns = window_txns.groupby(['Sender_account']).agg({
        'Receiver_account': ['size', 'nunique'],
        'Amount': ['median', 'std', 'sum']
    }).reset_index()
    sent_txns.columns = ['account', 'sent_txns_count', 'fan_out', 'med_sent_amt', 'std_sent_amt', 'total_sent_amt']

    # Per-receiver aggregates
    received_txns = window_txns.groupby(['Receiver_account']).agg({
        'Sender_account': ['size', 'nunique'],
        'Amount': ['median', 'std', 'sum']
    }).reset_index()
    received_txns.columns = ['account', 'recv_txns_count', 'fan_in', 'med_recv_amt', 'std_recv_amt', 'total_recv_amt']

    # Count and total amount per (sender, receiver) pair. The pair totals are the
    # same whichever side is listed first, so they are computed once and reduced
    # twice: by sender and by receiver.
    pair_txns = window_txns.groupby(['Sender_account', 'Receiver_account']).agg({
        'Amount': ['count', 'sum']
    }).reset_index()
    pair_txns.columns = ['Sender_account', 'Receiver_account', 'pair_txn_count', 'pair_txn_amount']

    max_sent_txns = pair_txns.groupby('Sender_account').agg({
        'pair_txn_count': 'max',
        'pair_txn_amount': 'max'
    }).reset_index()
    max_sent_txns.columns = ['account', 'max_sent_txn_count', 'max_sent_txn_amt']

    max_recv_txns = pair_txns.groupby('Receiver_account').agg({
        'pair_txn_count': 'max',
        'pair_txn_amount': 'max'
    }).reset_index()
    max_recv_txns.columns = ['account', 'max_recv_txn_count', 'max_recv_txn_amt']

    # Initialize window dataframe: one row per active account, then attach features
    window_data = pd.DataFrame({
        'window_start': window_start,
        'account': active_accounts})
    for feature_frame in (sent_txns, received_txns, max_sent_txns, max_recv_txns):
        window_data = window_data.merge(feature_frame, on='account', how='left')
    # Accounts missing from a feature frame (and single-transaction std values) become 0
    window_data = window_data.fillna(0)

    # Calculated ratios, vectorized: division by zero yields inf/NaN in pandas,
    # and those positions are then replaced by the -1 sentinel.
    recv_count = window_data['recv_txns_count']
    window_data['sent_recv_ratio'] = (window_data['sent_txns_count'] / recv_count).where(recv_count > 0, -1.0)

    fan_in = window_data['fan_in']
    window_data['fanout_fanin_ratio'] = (window_data['fan_out'] / fan_in).where(fan_in > 0, -1.0)

    # Keep only the combined amount; the directional totals are intermediates
    window_data['total_txns_amt'] = window_data['total_sent_amt'] + window_data['total_recv_amt']
    window_data = window_data.drop(columns=['total_sent_amt', 'total_recv_amt'])

    window_frames.append(window_data)

# Single concatenation; fall back to an empty frame if no window had data
account_stats_7D = pd.concat(window_frames, ignore_index=True) if window_frames else pd.DataFrame({})

Processing window: 2022-10-07 00:00:00 to 2022-10-14 00:00:00
Processing window: 2022-10-14 00:00:00 to 2022-10-21 00:00:00
Processing window: 2022-10-21 00:00:00 to 2022-10-28 00:00:00
Processing window: 2022-10-28 00:00:00 to 2022-11-04 00:00:00
Processing window: 2022-11-04 00:00:00 to 2022-11-11 00:00:00
Processing window: 2022-11-11 00:00:00 to 2022-11-18 00:00:00
Processing window: 2022-11-18 00:00:00 to 2022-11-25 00:00:00
Processing window: 2022-11-25 00:00:00 to 2022-12-02 00:00:00
Processing window: 2022-12-02 00:00:00 to 2022-12-09 00:00:00
Processing window: 2022-12-09 00:00:00 to 2022-12-16 00:00:00
Processing window: 2022-12-16 00:00:00 to 2022-12-23 00:00:00
Processing window: 2022-12-23 00:00:00 to 2022-12-30 00:00:00
Processing window: 2022-12-30 00:00:00 to 2023-01-06 00:00:00
Processing window: 2023-01-06 00:00:00 to 2023-01-13 00:00:00
Processing window: 2023-01-13 00:00:00 to 2023-01-20 00:00:00
Processing window: 2023-01-20 00:00:00 to 2023-01-27 00:00:00
Processi

In [136]:
# Persist the weekly per-account features as CSV in the working directory,
# leaving out the positional row index.
account_stats_7D.to_csv(path_or_buf='account_stats_7D.csv', index=False)

In [138]:
# Print the column list, dtypes and memory footprint of the weekly feature table.
account_stats_7D.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5210998 entries, 0 to 5210997
Data columns (total 17 columns):
 #   Column              Dtype         
---  ------              -----         
 0   window_start        datetime64[ns]
 1   account             int64         
 2   sent_txns_count     float64       
 3   fan_out             float64       
 4   med_sent_amt        float64       
 5   std_sent_amt        float64       
 6   recv_txns_count     float64       
 7   fan_in              float64       
 8   med_recv_amt        float64       
 9   std_recv_amt        float64       
 10  max_sent_txn_count  float64       
 11  max_sent_txn_amt    float64       
 12  max_recv_txn_count  float64       
 13  max_recv_txn_amt    float64       
 14  sent_recv_ratio     float64       
 15  fanout_fanin_ratio  float64       
 16  total_txns_amt      float64       
dtypes: datetime64[ns](1), float64(15), int64(1)
memory usage: 675.9 MB
