In [26]:
# %% Importing Libarries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split

In [27]:
DATA_DIR = "SAML"

In [28]:
# %% Load your dataset into a pandas DataFrame
df = pd.read_csv(f'{DATA_DIR}\SAML-D.csv')

In [29]:
df.head()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,2022-10-07,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,2022-10-07,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,2022-10-07,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,2022-10-07,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504852 entries, 0 to 9504851
Data columns (total 12 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Time                    object 
 1   Date                    object 
 2   Sender_account          int64  
 3   Receiver_account        int64  
 4   Amount                  float64
 5   Payment_currency        object 
 6   Received_currency       object 
 7   Sender_bank_location    object 
 8   Receiver_bank_location  object 
 9   Payment_type            object 
 10  Is_laundering           int64  
 11  Laundering_type         object 
dtypes: float64(1), int64(3), object(8)
memory usage: 870.2+ MB


In [31]:
## Converting date into datetime format

df['Date'] = pd.to_datetime(df['Date'])

# Extract month, day, and week

df['Year'] = pd.to_datetime(df['Date']).dt.year
df['Month'] = pd.to_datetime(df['Date']).dt.month
df['Day'] = pd.to_datetime(df['Date']).dt.day
df['Week'] = df['Date'].dt.isocalendar().week

In [32]:
# Dropping irrelavent columns
df = df.drop(["Date","Time"], axis=1)

In [33]:
df.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits,2022,10,7,40
1,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out,2022,10,7,40
2,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out,2022,10,7,40
3,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In,2022,10,7,40
4,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits,2022,10,7,40


In [34]:
class_distribution = df['Is_laundering'].value_counts().to_numpy()
print(f"Not Fraud = {class_distribution[0]}")
print(f"Fraud = {class_distribution[1]}")

Not Fraud = 9494979
Fraud = 9873


In [35]:
num_cols = df.select_dtypes(exclude="object").columns
print(f"We have {len(num_cols)} numerical columns: {num_cols.tolist()}")

categorical_cols = df.select_dtypes(include="object").columns 
print(f"We have {len(categorical_cols)} categorical columns: {categorical_cols.tolist()}")

We have 8 numerical columns: ['Sender_account', 'Receiver_account', 'Amount', 'Is_laundering', 'Year', 'Month', 'Day', 'Week']
We have 6 categorical columns: ['Payment_currency', 'Received_currency', 'Sender_bank_location', 'Receiver_bank_location', 'Payment_type', 'Laundering_type']


In [36]:
df_c = df.copy(deep=True)
df_c.head(2)

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits,2022,10,7,40
1,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out,2022,10,7,40


In [37]:
unique_counts = df_c[categorical_cols].nunique()
print("Unique columns in the DataFrame: \n", unique_counts)

Unique columns in the DataFrame: 
 Payment_currency          13
Received_currency         13
Sender_bank_location      18
Receiver_bank_location    18
Payment_type               7
Laundering_type           28
dtype: int64


In [44]:
df.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits,2022,10,7,40
1,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out,2022,10,7,40
2,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out,2022,10,7,40
3,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In,2022,10,7,40
4,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits,2022,10,7,40


## Spliting the dataset

In [45]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [50]:
unique_counts = train_df["Is_laundering"].value_counts()
print(unique_counts)

Is_laundering
0    6646428
1       6968
Name: count, dtype: int64


In [51]:
unique_counts = test_df["Is_laundering"].value_counts()
print(unique_counts)

Is_laundering
0    2848551
1       2905
Name: count, dtype: int64


In [53]:
# Create a directed graph for the training data
G_train = nx.DiGraph()

for index, row in tqdm(train_df.iterrows(), total=train_df.shape[0], desc="Building Graph"):
    sender = row['Sender_account']
    receiver = row['Receiver_account']
    amount = row['Amount']
    G_train.add_edge(sender, receiver, amount=amount)

# Create a directed graph for the test data
G_test = nx.DiGraph()

for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Building Graph"):
    sender = row['Sender_account']
    receiver = row['Receiver_account']
    amount = row['Amount']
    G_test.add_edge(sender, receiver, amount=amount)

Building Graph: 100%|██████████| 6653396/6653396 [04:53<00:00, 22648.63it/s]
Building Graph: 100%|██████████| 2851456/2851456 [02:10<00:00, 21862.65it/s]


In [54]:
def check_graph_connectivity(graph, name):
    if nx.is_strongly_connected(graph):
        print(f"The {name} graph is strongly connected.")
    else:
        # Check the number of strongly connected components
        scc = list(nx.strongly_connected_components(graph))
        print(f"The {name} graph has {len(scc)} strongly connected components.")
        print(f"The largest strongly connected component in {name} has {len(max(scc, key=len))} nodes.")

In [55]:
# Check connectivity for training and test graphs
check_graph_connectivity(G_train, "Training")
check_graph_connectivity(G_test, "Test")

The Training graph has 799646 strongly connected components.
The largest strongly connected component in Training has 18 nodes.
The Test graph has 758798 strongly connected components.
The largest strongly connected component in Test has 18 nodes.


In [56]:
print(f"Training Data has {train_df.shape[0]} rows.")
print(f"Test Data has {test_df.shape[0]} rows.")
print(f"Training graph has {G_train.number_of_nodes()} nodes and {G_train.number_of_edges()} edges.")
print(f"Test graph has {G_test.number_of_nodes()} nodes and {G_test.number_of_edges()} edges.")

Training Data has 6653396 rows.
Test Data has 2851456 rows.
Training graph has 836781 nodes and 852624 edges.
Test graph has 777393 nodes and 771839 edges.


In [57]:
train_df.to_csv(f'{DATA_DIR}/train_transactions.csv', index=False)
test_df.to_csv(f'{DATA_DIR}/test_transactions.csv', index=False)