# PreProcess Step

* Encode categorical data
* normalize the features


### Load Data

In [1]:
site_input_dir = "processed_data"
site_name = "HCBHSGSG_Bank_9"

In [2]:
import os
import random
import string

import pandas as pd

dataset_names = ["train", "test"]
datasets = {}

for ds_name in dataset_names:
    file_name = os.path.join(site_input_dir, site_name, f"{ds_name}_enrichment.csv")
    df = pd.read_csv(file_name)
    datasets[ds_name] = df
    print(df)

      Unnamed: 0            Timestamp  Fraud_Label Transaction_ID    User_ID  \
0              0  2023-07-21 00:39:00            0      TXN_15663  USER_7774   
1              1  2023-07-21 00:40:00            0      TXN_16666  USER_2491   
2              2  2023-07-21 01:16:00            0      TXN_13793  USER_7209   
3              3  2023-07-21 02:21:00            0      TXN_33256  USER_9146   
4              4  2023-07-21 04:37:00            0      TXN_49460  USER_6842   
...          ...                  ...          ...            ...        ...   
3596        3596  2023-12-31 20:35:00            1      TXN_37832  USER_5830   
3597        3597  2023-12-31 21:38:00            1      TXN_38974  USER_4134   
3598        3598  2023-12-31 21:52:00            0      TXN_25394  USER_8356   
3599        3599  2023-12-31 21:55:00            0      TXN_41124  USER_7621   
3600        3600  2023-12-31 22:35:00            0      TXN_10811  USER_1721   

      Transaction_Amount Transaction_Ty

### Categorical encoding

In [3]:
category_columns = [
    "Currency_Country",
    "Beneficiary_BIC",
    "Currency",
    "Transaction_ID",
    "Receiver_BIC",
    "Sender_BIC",
]

for ds_name in dataset_names:
    df = datasets[ds_name]
    df_encoded = pd.get_dummies(df, columns=category_columns)
    print(df_encoded)

      Unnamed: 0            Timestamp  Fraud_Label    User_ID  \
0              0  2023-07-21 00:39:00            0  USER_7774   
1              1  2023-07-21 00:40:00            0  USER_2491   
2              2  2023-07-21 01:16:00            0  USER_7209   
3              3  2023-07-21 02:21:00            0  USER_9146   
4              4  2023-07-21 04:37:00            0  USER_6842   
...          ...                  ...          ...        ...   
3596        3596  2023-12-31 20:35:00            1  USER_5830   
3597        3597  2023-12-31 21:38:00            1  USER_4134   
3598        3598  2023-12-31 21:52:00            0  USER_8356   
3599        3599  2023-12-31 21:55:00            0  USER_7621   
3600        3600  2023-12-31 22:35:00            0  USER_1721   

      Transaction_Amount Transaction_Type  Account_Balance Device_Type  \
0                 183.33   ATM Withdrawal         57770.62      Tablet   
1                  30.14    Bank Transfer         13256.87      Mobile 

### Normalization

In [4]:
df.columns

Index(['Unnamed: 0', 'Timestamp', 'Fraud_Label', 'Transaction_ID', 'User_ID',
       'Transaction_Amount', 'Transaction_Type', 'Account_Balance',
       'Device_Type', 'Location', 'Merchant_Category', 'IP_Address_Flag',
       'Previous_Fraudulent_Activity', 'Daily_Transaction_Count',
       'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Type',
       'Card_Age', 'Transaction_Distance', 'Authentication_Method',
       'Risk_Score', 'Is_Weekend', 'Sender_BIC', 'Receiver_BIC', 'Currency',
       'Beneficiary_BIC', 'Currency_Country', 'trans_volume', 'total_amount',
       'average_amount', 'hist_trans_volume', 'hist_total_amount',
       'hist_average_amount', 'x2_y1', 'x3_y2'],
      dtype='object')

In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


processed_dfs = {}

numerical_columns = [
    "Timestamp",
    "Fraud_Label",
    "Transaction_Amount",
    "trans_volume",
    "total_amount",
    "average_amount",
    "hist_trans_volume",
    "hist_total_amount",
    "hist_average_amount",
    "x2_y1",
    "x3_y2",
]

for ds_name in dataset_names:
    df = datasets[ds_name]

    # Convert 'Timestamp' column to datetime
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    # Convert datetime to Unix timestamp
    df["Timestamp"] = df["Timestamp"].astype(int) / 10**9  # convert to seconds

    # Separate numerical and categorical features
    numerical_features = df[numerical_columns]
    categorical_features = df[category_columns]

    # Initialize the MinMaxScaler (or StandardScaler)
    scaler = MinMaxScaler()

    # Fit and transform the numerical data
    numerical_normalized = pd.DataFrame(
        scaler.fit_transform(numerical_features), columns=numerical_features.columns
    )

    # Combine the normalized numerical features with the categorical features
    df_combined = pd.concat([categorical_features, numerical_normalized], axis=1)

    #     # one-hot encoding
    #     df_combined = pd.get_dummies(df_combined, columns=category_columns)

    print("Combined DataFrame with Normalized Numerical Features:")
    print(df_combined)

    processed_dfs[ds_name] = df_combined

Combined DataFrame with Normalized Numerical Features:
     Currency_Country Beneficiary_BIC Currency Transaction_ID Receiver_BIC  \
0              Sydney        ZNZZAU3M      AUD      TXN_15663     WPUWDEFF   
1              Mumbai        YSYCESMM      INR      TXN_16666     ZHSZUS33   
2               Tokyo        HCBHSGSG      JPY      TXN_13793     ZHSZUS33   
3              Mumbai        YSYCESMM      INR      TXN_33256     WPUWDEFF   
4            New York        ZHSZUS33      USD      TXN_49460     XITXUS33   
...               ...             ...      ...            ...          ...   
3596           Mumbai        YSYCESMM      INR      TXN_37832     ZNZZAU3M   
3597           Mumbai        YSYCESMM      INR      TXN_38974     HCBHSGSG   
3598           Sydney        ZNZZAU3M      AUD      TXN_25394     HCBHSGSG   
3599           London        YXRXGB22      GBP      TXN_41124     ZHSZUS33   
3600           Sydney        ZNZZAU3M      AUD      TXN_10811     YMNYFRPP   

     Sen

In [6]:
for name in processed_dfs:
    site_dir = os.path.join(site_input_dir, site_name)
    os.makedirs(site_dir, exist_ok=True)
    pre_processed_file_name = os.path.join(site_dir, f"{name}_normalized.csv")
    print(pre_processed_file_name)
    processed_dfs[name].to_csv(pre_processed_file_name)

processed_data/HCBHSGSG_Bank_9/train_normalized.csv
processed_data/HCBHSGSG_Bank_9/test_normalized.csv


Let's go back to the [XGBoost Notebook](../xgboost.ipynb)