In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer

Timestamp Handling and Related Feature Engineering

In [36]:
# Read the three dataset splits from CSV files in the working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

In [37]:
# Show the raw column names of the training split before any features are added.
train_df.columns

Index(['Unnamed: 0', 'Time', 'Date', 'Sender_account', 'Receiver_account',
       'Amount', 'Payment_currency', 'Received_currency',
       'Sender_bank_location', 'Receiver_bank_location', 'Payment_type',
       'Is_laundering', 'Laundering_type'],
      dtype='object')

In [38]:
# Column roles used by the preprocessing cells below.
categorical_cols = [
    # account identifiers
    'Sender_account',
    'Receiver_account',
    # currencies
    'Payment_currency',
    'Received_currency',
    # bank locations
    'Sender_bank_location',
    'Receiver_bank_location',
    # payment channel
    'Payment_type',
]
continuous_cols = ["Amount"]   # numeric feature(s)
target_col = 'Is_laundering'   # label column

In [39]:
def add_datetime_features(df):
    """Add a parsed ``DateTime`` column and derived time features to ``df`` in place.

    Reads the ``Date`` and ``Time`` columns, overwrites ``Time`` with a cleaned
    string version, and adds the columns ``DateTime``, ``Timestamp``, ``hour``,
    ``Minute``, ``Second``, ``day_of_week``, ``is_weekend``, ``day``, ``year``,
    ``month`` and ``week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Date`` (``YYYY-MM-DD`` strings) and ``Time`` columns.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised by ``pd.to_datetime`` when a combined value does not match
        ``'%Y-%m-%d %H:%M:%S'``.
    """
    # Drop a leading "0 days " prefix if present, leaving only HH:MM:SS.
    # NOTE(review): the pattern suggests Time was serialized from a timedelta
    # in some files -- confirm against the raw data.
    df['Time'] = df['Time'].astype(str).str.replace(r'0 days\s+', '', regex=True)

    df['DateTime'] = pd.to_datetime(
        df['Date'] + ' ' + df['Time'], format='%Y-%m-%d %H:%M:%S'
    )

    # Seconds since the Unix epoch. Subtracting the epoch and floor-dividing by
    # one second does not depend on the datetime64 resolution, unlike
    # ``astype('int64') // 10**9``, which is only correct for nanoseconds.
    unix_epoch = pd.Timestamp("1970-01-01")
    df['Timestamp'] = (df['DateTime'] - unix_epoch) // pd.Timedelta("1s")

    when = df['DateTime'].dt
    df['hour'] = when.hour
    df['Minute'] = when.minute
    df['Second'] = when.second
    df['day_of_week'] = when.dayofweek       # Monday=0 ... Sunday=6
    df['is_weekend'] = df['day_of_week'] >= 5  # Saturday or Sunday

    df['day'] = when.day
    df['year'] = when.year
    df['month'] = when.month
    df['week'] = when.isocalendar().week     # ISO 8601 week number


for df in [train_df, val_df, test_df]:
    add_datetime_features(df)

In [40]:
# Number of training transactions per sender account (also shows cardinality).
train_df["Sender_account"].value_counts()

Sender_account
4109275762    636
798082205     630
1298080955    612
5893741140    610
8035399972    609
             ... 
9532476954      1
5544146952      1
708532896       1
2931451038      1
7381575416      1
Name: count, Length: 245891, dtype: int64

Label encoding for categorical variables

In [41]:
encoders = {}  # column name -> LabelEncoder fitted on the training split, kept for reuse

# `categorical_cols`, `continuous_cols` and `target_col` are defined once in the
# column-configuration cell near the top of the notebook; they are not repeated here.


def safe_transform(df_col, encoder):
    """Replace values the encoder has never seen so ``encoder.transform`` cannot fail.

    Parameters
    ----------
    df_col : pandas.Series
        Column from a non-training split.
    encoder : LabelEncoder
        Encoder already fitted on the training column.

    Returns
    -------
    pandas.Series
        ``df_col`` with every value not in ``encoder.classes_`` replaced by
        ``encoder.classes_[0]``.

    Notes
    -----
    NOTE(review): unseen values therefore share a code with a real training
    category. A dedicated "unknown" code would avoid that aliasing but changes
    the encoding any downstream model sees, so it is not done here.
    """
    # Vectorized membership test; a per-element Python lambda is very slow
    # on multi-million-row columns.
    is_known = df_col.isin(encoder.classes_)
    return df_col.where(is_known, encoder.classes_[0])


# NOTE: this cell overwrites the categorical columns with integer codes, so
# re-running it without reloading the data refits the encoders on the codes
# and loses the original label mapping stored in `encoders`.
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(train_df[col])  # fit on train only to avoid leaking val/test categories

    # Transform train/val/test
    train_df[col] = le.transform(train_df[col])
    val_df[col] = le.transform(safe_transform(val_df[col], le))
    test_df[col] = le.transform(safe_transform(test_df[col], le))

    encoders[col] = le


In [46]:
# Summary statistics of the encoded sender-account codes (min/max show the code range).
train_df["Sender_account"].describe()

count    7.429619e+06
mean     1.232359e+05
std      7.083507e+04
min      0.000000e+00
25%      6.226700e+04
50%      1.231110e+05
75%      1.845330e+05
max      2.458900e+05
Name: Sender_account, dtype: float64

Yeo-Johnson power transform of `Amount` to reduce skewness (fitted on the training set only; the transformed values are then standardized to zero mean and unit variance)

In [43]:
# Yeo-Johnson power transform with standardization of the transformed output.
pt = PowerTransformer(method='yeo-johnson', standardize=True)

# Estimate the transform on the training split only, then reuse the fitted
# transformer on the held-out splits.
train_df['amount_power'] = pt.fit_transform(train_df[['Amount']])
for held_out_df in (val_df, test_df):
    held_out_df['amount_power'] = pt.transform(held_out_df[['Amount']])

In [44]:
# Preview the first five processed training rows (head() defaults to n=5).
train_df.head()

Unnamed: 0.1,Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,...,hour,Minute,Second,day_of_week,is_weekend,day,year,month,week,amount_power
0,0,10:35:19,2022-10-07,214498,162977,1459.15,10,10,16,16,...,10,35,19,4,False,7,2022,10,40,-0.880694
1,1,10:35:20,2022-10-07,36586,495398,6019.64,10,1,16,15,...,10,35,20,4,False,7,2022,10,40,0.134756
2,2,10:35:20,2022-10-07,7222,259992,14328.44,10,10,16,16,...,10,35,20,4,False,7,2022,10,40,0.92619
3,3,10:35:21,2022-10-07,132152,566042,11895.0,10,10,16,16,...,10,35,21,4,False,7,2022,10,40,0.743749
4,4,10:35:21,2022-10-07,236371,224426,115.25,10,10,16,16,...,10,35,21,4,False,7,2022,10,40,-2.080376


In [47]:
# Write each processed split to its own CSV file in the working directory.
# NOTE(review): to_csv writes the row index by default, which adds one more
# unnamed leading column to each file. Pass index=False if downstream readers
# do not rely on that column -- confirm before changing.
for split_df, out_name in (
    (train_df, "preprocessed_train.csv"),
    (val_df, "preprocessed_validation.csv"),
    (test_df, "preprocessed_test.csv"),
):
    split_df.to_csv(out_name)