In [126]:
# %% Importing Libraries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [127]:
# Directory that holds the transaction CSVs; later cells read their inputs
# from it and write the transformed CSVs back into it.
DATA_DIR = "SAML"

In [128]:
# Load the training and test transaction tables from DATA_DIR.
df_train = pd.read_csv(f"{DATA_DIR}/train_transactions.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test_transactions.csv")

In [129]:
# Preview the first rows of the raw training table.
df_train.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,3293686547,2857585278,591.16,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Small_Fan_Out,2023,5,5,18
1,345409480,3629277366,186.54,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Small_Fan_Out,2022,10,21,42
2,2207083075,8166004515,9368.07,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Fan_Out,2023,5,3,18
3,1715402599,1146874022,9453.61,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Fan_In,2023,7,12,28
4,3059424812,3532465761,2306.49,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Small_Fan_Out,2023,2,25,8


In [136]:
# Preview the first rows of the raw test table.
df_test.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,Year,Month,Day,Week
0,6077900993,6609117934,2299.89,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Group,2022,10,8,40
1,5459041199,816242179,211.9,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out,2023,7,8,27
2,4641798002,5214523833,9481.26,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In,2023,6,29,26
3,1475899462,6151077027,11609.92,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Fan_Out,2023,2,27,9
4,8046287266,3423799664,17772.58,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In,2023,6,14,24


In [137]:
# Every column of the training table whose dtype is not `object`.
# A later cell narrows this list down to the columns that actually get scaled.
numerical_features = df_train.select_dtypes(exclude="object").columns
numerical_features

Index(['Sender_account', 'Receiver_account', 'Amount', 'Is_laundering', 'Year',
       'Month', 'Day', 'Week'],
      dtype='object')

In [138]:
# Every column of the training table with `object` dtype; these are the
# columns that get imputed and ordinal-encoded further down.
categorical_features = df_train.select_dtypes(include="object").columns
categorical_features

Index(['Payment_currency', 'Received_currency', 'Sender_bank_location',
       'Receiver_bank_location', 'Payment_type', 'Laundering_type'],
      dtype='object')

In [139]:
# Drop the columns on which no transformation should be applied.
# `errors="ignore"` keeps this cell idempotent: it reassigns
# `numerical_features` from itself, so on a second run the labels are already
# gone and the default `errors="raise"` would fail with a KeyError.
numerical_features = numerical_features.drop(
    ["Is_laundering", "Sender_account", "Receiver_account", "Year", "Month", "Day", "Week"],
    errors="ignore",
)
numerical_features

Index(['Amount'], dtype='object')

In [140]:
# Columns excluded from imputation, scaling and encoding (the same labels that
# were dropped from `numerical_features`).
other_columns = ["Is_laundering","Sender_account", "Receiver_account","Year", "Month", "Day", "Week"]

In [141]:
# Show the three column groups side by side as a sanity check.
print(f"{categorical_features = }\n{numerical_features = }\n{other_columns = }")

categorical_features = Index(['Payment_currency', 'Received_currency', 'Sender_bank_location',
       'Receiver_bank_location', 'Payment_type', 'Laundering_type'],
      dtype='object')
numerical_features = Index(['Amount'], dtype='object')
other_columns = ['Is_laundering', 'Sender_account', 'Receiver_account', 'Year', 'Month', 'Day', 'Week']


In [142]:
# Function to transform numerical features
def transform_numerical(data, num_imputer, num_scaler, numerical_features):
    """Impute and scale the numerical columns of ``data`` with fitted transformers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``numerical_features``.
    num_imputer, num_scaler
        Already-fitted objects exposing ``transform``. The imputer runs first
        and its output is fed to the scaler.
    numerical_features : list-like of str
        Labels of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        The transformed values, labelled with ``numerical_features`` and
        carrying the same row index as ``data``.
    """
    # Impute missing values
    numerical_imputed = num_imputer.transform(data[numerical_features])
    # Scale the data
    numerical_scaled = num_scaler.transform(numerical_imputed)
    # Keep the caller's row index. The result is assigned back with
    # `df[cols] = ...`, which aligns on index; a fresh RangeIndex would turn
    # every row into NaN as soon as `data` has a non-default index.
    return pd.DataFrame(numerical_scaled, columns=numerical_features, index=data.index)


# Function to transform categorical features
def transform_categorical(data, cat_imputer, cat_encoder, categorical_features):
    """Impute and encode the categorical columns of ``data`` with fitted transformers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``categorical_features``.
    cat_imputer, cat_encoder
        Already-fitted objects exposing ``transform``. The imputer runs first
        and its output is fed to the encoder.
    categorical_features : list-like of str
        Labels of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        The encoded values, labelled with ``categorical_features`` and
        carrying the same row index as ``data``.
    """
    # Impute missing values
    categorical_imputed = cat_imputer.transform(data[categorical_features])
    # Encode the data
    categorical_encoded = cat_encoder.transform(categorical_imputed)
    # Keep the caller's row index. The result is assigned back with
    # `df[cols] = ...`, which aligns on index; a fresh RangeIndex would turn
    # every row into NaN as soon as `data` has a non-default index.
    return pd.DataFrame(categorical_encoded, columns=categorical_features, index=data.index)

In [143]:
# Function to fit and transform df_train (training data)
def fit_and_transform_train(df_train_transformed, numerical_features, categorical_features):
    """Fit the preprocessing transformers on the training frame and apply them.

    Numerical columns are median-imputed and then scaled with ``RobustScaler``.
    Categorical columns are imputed with the most frequent value and then
    encoded with ``OrdinalEncoder``.

    Parameters
    ----------
    df_train_transformed : pandas.DataFrame
        Training frame. It is modified in place (the listed columns are
        overwritten), so pass a copy if the raw values must be kept.
    numerical_features : list-like of str
        Columns to impute and scale.
    categorical_features : list-like of str
        Columns to impute and encode.

    Returns
    -------
    tuple
        ``(df_train_transformed, num_imputer, num_scaler, cat_imputer,
        cat_encoder)``: the same frame object followed by the four fitted
        transformers, for reuse on other splits.
    """
    # Step 1: Fit SimpleImputer and RobustScaler for numerical features.
    # The scaler is fitted on the imputer's output, because that is exactly
    # what it receives at transform time; fitting it on the raw columns would
    # learn its statistics from different data than it is applied to.
    num_imputer = SimpleImputer(strategy='median')
    num_scaler = RobustScaler()
    numerical_imputed = num_imputer.fit_transform(df_train_transformed[numerical_features])
    num_scaler.fit(numerical_imputed)

    # Step 2: Fit SimpleImputer and OrdinalEncoder for categorical features.
    # Same reasoning: the encoder learns its categories from the imputed
    # values it will later be asked to encode.
    # NOTE(review): OrdinalEncoder is left at its defaults here, so how values
    # unseen during fit are handled at transform time is unchanged -- confirm
    # this is acceptable for the test split.
    cat_imputer = SimpleImputer(strategy='most_frequent')
    cat_encoder = OrdinalEncoder()
    categorical_imputed = cat_imputer.fit_transform(df_train_transformed[categorical_features])
    cat_encoder.fit(categorical_imputed)

    # Step 3: Apply transformations (in place on the frame that was passed in)
    df_train_transformed[numerical_features] = transform_numerical(df_train_transformed, num_imputer, num_scaler, numerical_features)
    df_train_transformed[categorical_features] = transform_categorical(df_train_transformed, cat_imputer, cat_encoder, categorical_features)

    return df_train_transformed, num_imputer, num_scaler, cat_imputer, cat_encoder

In [144]:
# Function to transform df_test (test data) using the fitted transformers
def transform_test(df_test_transformed, num_imputer, num_scaler, cat_imputer, cat_encoder, numerical_features, categorical_features):
    """Apply already-fitted transformers to a frame without refitting anything.

    The numerical and categorical columns of ``df_test_transformed`` are
    overwritten in place with their transformed values, and the same frame
    object is returned.
    """
    # Numerical columns first: impute, then scale.
    scaled_numeric = transform_numerical(
        df_test_transformed, num_imputer, num_scaler, numerical_features
    )
    df_test_transformed[numerical_features] = scaled_numeric

    # Then the categorical columns: impute, then encode.
    encoded_categorical = transform_categorical(
        df_test_transformed, cat_imputer, cat_encoder, categorical_features
    )
    df_test_transformed[categorical_features] = encoded_categorical

    return df_test_transformed

In [145]:
# Fit the transformers on a deep copy of the training split so that `df_train`
# keeps its raw values, and keep the fitted transformers for the test split.
df_train_transformed = df_train.copy(deep=True)
(
    df_train_transformed,
    num_imputer,
    num_scaler,
    cat_imputer,
    cat_encoder,
) = fit_and_transform_train(df_train_transformed, numerical_features, categorical_features)



In [146]:
# Persist the transformed training table next to the raw data (row index not written).
df_train_transformed.to_csv(f'{DATA_DIR}/train_transactions_transformed.csv', index=False)


In [147]:
# Apply the transformers fitted on the training split to a deep copy of the
# test split; `df_test` itself keeps its raw values.
df_test_transformed = df_test.copy(deep=True)
df_test_transformed = transform_test(
    df_test_transformed,
    num_imputer=num_imputer,
    num_scaler=num_scaler,
    cat_imputer=cat_imputer,
    cat_encoder=cat_encoder,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)



In [148]:
# Persist the transformed test table next to the raw data (row index not written).
df_test_transformed.to_csv(f'{DATA_DIR}/test_transactions_transformed.csv', index=False)


In [114]:
# Distinct account ids of the training split, in order of first appearance
# (all senders first, then receivers), each mapped to a dense 0-based index.
# NOTE(review): accounts that occur only in df_test get no index here --
# confirm that downstream lookups never need them.
all_accounts = pd.concat(
    [df_train['Sender_account'], df_train['Receiver_account']]
).unique()
num_accounts = len(all_accounts)
account_to_index = dict(zip(all_accounts, range(num_accounts)))

In [110]:
# Number of distinct sender bank locations recorded for sender account 92172
# (missing values, if any, count as one distinct value).
df_train.loc[df_train['Sender_account'] == 92172, 'Sender_bank_location'].nunique(dropna=False)

2

In [153]:
# Spot-check: encoded value of `Laundering_type` in the first row of the transformed training table.
df_train_transformed['Laundering_type'].values[0]

np.float64(20.0)

In [125]:
# Spot-check: largest encoded value of `Sender_bank_location` in the transformed training table.
df_train_transformed['Sender_bank_location'].unique().max()

np.float64(17.0)