# Credit Card Fraud End-to-End Example

## Prepare Data


In [1]:
import kagglehub

# Download the latest version of the Kaggle dataset and keep the directory
# that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download("samayashar/fraud-detection-transactions-dataset")

# Show where the files ended up so later cells can be checked against it
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/soraxas/.cache/kagglehub/datasets/samayashar/fraud-detection-transactions-dataset/versions/1


In [2]:
import pandas as pd

# Load the raw transactions from the downloaded dataset directory.
# No date parsing is requested here, so the Timestamp column is presumably
# left as plain strings — TODO confirm the dtype if time arithmetic is needed.
origin_df = pd.read_csv(f"{path}/synthetic_fraud_dataset.csv")

# Last expression of the cell: render a preview of the raw frame
origin_df

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.8400,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.20,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,TXN_11284,USER_4796,45.05,Online,2023-01-29 18:38:00,76960.11,Mobile,Tokyo,Clothing,0,...,2,389.00,3,Amex,98,1537.54,PIN,0.1493,1,0
49996,TXN_44732,USER_1171,126.15,POS,2023-05-09 08:55:00,28791.75,Mobile,Tokyo,Clothing,0,...,13,434.95,4,Visa,93,2555.72,Biometric,0.3653,0,1
49997,TXN_38158,USER_2510,72.02,Online,2023-01-30 19:32:00,29916.41,Laptop,Mumbai,Clothing,0,...,1,369.15,2,Visa,114,4686.59,Biometric,0.5195,0,0
49998,TXN_860,USER_2248,64.89,Bank Transfer,2023-03-09 19:47:00,67895.67,Mobile,Tokyo,Electronics,0,...,13,242.29,4,Discover,72,4886.92,Biometric,0.7063,0,1


In [3]:
# %load_ext cudf.pandas
import argparse
import os
import random
import string

from sklearn.model_selection import train_test_split

# NOTE(review): an earlier comment here described expanding the data to a
# 2-plus year range; no such expansion is performed in the visible code.
# Latest timestamp of the raw data. Timestamp is presumably a string column
# (read without date parsing), which would make this a lexicographic max.
# The value is not used again in the visible part of this notebook.
old_max_time = origin_df["Timestamp"].max()

# Work on a copy so the columns added further down do not alter origin_df
df_temp = origin_df.copy()


# Find the maximum value in the 'Timestamp' column
# (also unused in the visible part of this notebook)
max_time = df_temp["Timestamp"].max()
# `df` is the working frame used from here on; it is the same object as df_temp
df = df_temp


# Demo lookup tables. The BIC codes and bank names are randomly made up and
# do not refer to real institutions.

# BIC -> city of the bank. Entry order is significant: the bank numbering in
# `bic_to_bank` below follows it.
bic_list = {
    "ZHSZUS33": "New York",
    "SHSHKHH1": "Hong Kong",
    "YXRXGB22": "London",
    "WPUWDEFF": "Berlin",
    "YMNYFRPP": "Paris",
    "FBSFCHZH": "Zurich",
    "YSYCESMM": "Mumbai",
    "ZNZZAU3M": "Sydney",
    "HCBHSGSG": "Tokyo",
    "XITXUS33": "New York",
}

# City -> BIC. When a city hosts several banks the last one listed wins, so
# "New York" resolves to "XITXUS33" and "ZHSZUS33" never appears as a value.
bic_list_rev = dict(zip(bic_list.values(), bic_list.keys()))

# Currency code -> city used to represent that currency's home location
currencies = {
    "USD": "New York",
    "GBP": "London",
    "JPY": "Tokyo",
    "AUD": "Sydney",
    "INR": "Mumbai",
}

# BIC -> bank name, numbered Bank_1 .. Bank_10 in the order of `bic_list`
bic_to_bank = {
    bic: f"Bank_{position}" for position, bic in enumerate(bic_list, start=1)
}


# Function to generate random BICs and currency details
def generate_random_details(df, seed=None):
    """Add synthetic BIC and currency columns to ``df`` and return it.

    The frame is modified in place (the same object is returned). Columns
    written:

    * ``Sender_BIC``       - looked up from ``Location`` via ``bic_list_rev``.
    * ``Receiver_BIC``     - drawn uniformly from all keys of ``bic_list``.
    * ``Currency``         - drawn uniformly from the currencies that have at
                             least one BIC located in the currency's city.
    * ``Beneficiary_BIC``  - drawn uniformly from the BICs located in the
                             chosen currency's city.
    * ``Currency_Country`` - ``Currency`` mapped through ``currencies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Location`` column whose values are keys of
        ``bic_list_rev``. An empty frame is accepted.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        result is reproducible. When ``None`` (default) the global ``random``
        module state is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above added.

    Raises
    ------
    KeyError
        If ``Location`` contains a value with no entry in ``bic_list_rev``.
    ValueError
        If no currency has a BIC in its city (nothing could ever be drawn).
    """
    rng = random.Random(seed) if seed is not None else random

    # Fail early, listing every unmapped location at once
    unknown_locations = set(df["Location"]) - set(bic_list_rev)
    if unknown_locations:
        raise KeyError(
            "No sender BIC configured for location(s): "
            f"{sorted(map(str, unknown_locations))}"
        )

    # Loop-invariant lookups, built once instead of once per row
    all_bics = list(bic_list.keys())
    bics_by_currency = {
        currency: [bic for bic, bic_city in bic_list.items() if bic_city == city]
        for currency, city in currencies.items()
    }
    usable_currencies = [
        currency for currency, bics in bics_by_currency.items() if bics
    ]
    if not usable_currencies:
        # A retry-until-match loop would never terminate in this situation
        raise ValueError("No currency has a BIC located in its city")

    row_count = len(df)

    df["Sender_BIC"] = df["Location"].map(bic_list_rev)
    # Draw order (all receivers first, then currency/beneficiary per row) is
    # kept stable so a given seed always yields the same columns.
    df["Receiver_BIC"] = [rng.choice(all_bics) for _ in range(row_count)]
    # df['Transaction_ID'] = [generate_random_uetr() for _ in range(len(df))]

    # Ensure the currency and beneficiary BIC match. Two plain lists are
    # collected (rather than unzipping pairs) so an empty frame works too.
    chosen_currencies = []
    beneficiary_bics = []
    for _ in range(row_count):
        currency = rng.choice(usable_currencies)
        chosen_currencies.append(currency)
        beneficiary_bics.append(rng.choice(bics_by_currency[currency]))

    df["Currency"] = chosen_currencies
    df["Beneficiary_BIC"] = beneficiary_bics
    df["Currency_Country"] = df["Currency"].map(currencies)

    return df


# Add random BIC and currency details to the DataFrame.
# A fixed seed keeps the generated columns identical across notebook re-runs.
df = generate_random_details(df, seed=42)

In [4]:
# Preview the frame after the BIC and currency columns were added
df

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label,Sender_BIC,Receiver_BIC,Currency,Beneficiary_BIC,Currency_Country
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,883.17,Biometric,0.8494,0,0,ZNZZAU3M,YXRXGB22,USD,ZHSZUS33,New York
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,2203.36,Password,0.0959,0,1,XITXUS33,XITXUS33,INR,YSYCESMM,Mumbai
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,1909.29,Biometric,0.8400,0,1,YSYCESMM,HCBHSGSG,GBP,YXRXGB22,London
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.20,Tablet,New York,Clothing,0,...,1311.86,OTP,0.7935,0,1,XITXUS33,FBSFCHZH,GBP,YXRXGB22,London
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,966.98,Password,0.3819,1,1,YSYCESMM,YMNYFRPP,AUD,ZNZZAU3M,Sydney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,TXN_11284,USER_4796,45.05,Online,2023-01-29 18:38:00,76960.11,Mobile,Tokyo,Clothing,0,...,1537.54,PIN,0.1493,1,0,HCBHSGSG,YSYCESMM,GBP,YXRXGB22,London
49996,TXN_44732,USER_1171,126.15,POS,2023-05-09 08:55:00,28791.75,Mobile,Tokyo,Clothing,0,...,2555.72,Biometric,0.3653,0,1,HCBHSGSG,YMNYFRPP,JPY,HCBHSGSG,Tokyo
49997,TXN_38158,USER_2510,72.02,Online,2023-01-30 19:32:00,29916.41,Laptop,Mumbai,Clothing,0,...,4686.59,Biometric,0.5195,0,0,YSYCESMM,WPUWDEFF,USD,XITXUS33,New York
49998,TXN_860,USER_2248,64.89,Bank Transfer,2023-03-09 19:47:00,67895.67,Mobile,Tokyo,Electronics,0,...,4886.92,Biometric,0.7063,0,1,HCBHSGSG,YXRXGB22,INR,YSYCESMM,Mumbai


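## Split Historical, Train, and Test Data

We sort the transactions by timestamp and split them as follows:
* history : 55% (the earliest transactions)
* train : 35%
* test : 10% (the remainder)

The train and test sets are drawn at random from the transactions that follow the history window.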





In [5]:
# Order the transactions by Timestamp and renumber the rows from zero
df = df.sort_values("Timestamp", ignore_index=True)

# Row budget per split; the test split receives whatever is left over
total_size = len(df)
historical_size = int(0.55 * total_size)
train_size = int(0.35 * total_size)
test_size = total_size - (historical_size + train_size)

# The first rows form the history; everything after is pooled for train/test
df_history = df.iloc[:historical_size]
remaining_df = df.iloc[historical_size:]

# Separate the label from the features of the pooled rows
y = remaining_df["Fraud_Label"]
ds = remaining_df.drop(columns="Fraud_Label")

# Randomly divide the pooled rows; the test row count is expressed as a
# fraction of the pool because that is what train_test_split receives here
x_train, x_test, y_train, y_test = train_test_split(
    ds,
    y,
    test_size=test_size / (train_size + test_size),
    random_state=42,
)

# Re-attach the label, placing it as the first column of each split
df_train = pd.concat([y_train, x_train], axis="columns")
df_test = pd.concat([y_test, x_test], axis="columns")

# Display sizes of each dataset
print(
    f"Historical DataFrame size: {len(df_history)}",
    f"Training DataFrame size: {len(df_train)}",
    f"Testing DataFrame size: {len(df_test)}",
    sep="\n",
)

Historical DataFrame size: 27500
Training DataFrame size: 17500
Testing DataFrame size: 5000


In [6]:
# Destination directory for the three splits; created if it does not exist
out_folder = "processed_data"
os.makedirs(out_folder, exist_ok=True)

# Write train, test and history (in that order) as CSV without the row index
for csv_name, split_frame in (
    ("train.csv", df_train),
    ("test.csv", df_test),
    ("history.csv", df_history),
):
    split_frame.to_csv(os.path.join(out_folder, csv_name), index=False)

In [7]:
# Echo the output directory that holds the saved splits
out_folder

'processed_data'

## Split Data for Different Client Sites

Now split the history, train, and test data by `Sender_BIC`, writing each sender's rows into its own client-site folder.

In [8]:
# Names of the split files (without extension) written to `out_folder` earlier
files = ["history", "train", "test"]
# Collects one "<BIC>_<bank name>" entry per client site encountered
client_names = set()

for split_name in files:
    file_path = os.path.join(out_folder, split_name + ".csv")
    # Read into a dedicated name: rebinding the notebook-wide `df` here would
    # leave it pointing at the last file read, so re-running an earlier cell
    # would silently operate on that subset instead of the full data.
    split_df = pd.read_csv(file_path)
    # Group the rows by 'Sender_BIC' and save each group to a separate file
    for sender_bic, sender_rows in split_df.groupby("Sender_BIC"):
        # Raises KeyError if a BIC in the file has no entry in bic_to_bank
        bank_name = bic_to_bank[sender_bic].replace(" ", "_")
        client_name = f"{sender_bic}_{bank_name}"
        client_names.add(client_name)
        site_dir = os.path.join(out_folder, client_name)
        os.makedirs(site_dir, exist_ok=True)

        filename = os.path.join(site_dir, f"{split_name}.csv")
        sender_rows.to_csv(filename, index=False)
        print(f"Saved {sender_bic} {split_name} transactions to {filename}")

print(client_names)

Saved HCBHSGSG history transactions to processed_data/HCBHSGSG_Bank_9/history.csv
Saved XITXUS33 history transactions to processed_data/XITXUS33_Bank_10/history.csv
Saved YSYCESMM history transactions to processed_data/YSYCESMM_Bank_7/history.csv
Saved YXRXGB22 history transactions to processed_data/YXRXGB22_Bank_3/history.csv
Saved ZNZZAU3M history transactions to processed_data/ZNZZAU3M_Bank_8/history.csv
Saved HCBHSGSG train transactions to processed_data/HCBHSGSG_Bank_9/train.csv
Saved XITXUS33 train transactions to processed_data/XITXUS33_Bank_10/train.csv
Saved YSYCESMM train transactions to processed_data/YSYCESMM_Bank_7/train.csv
Saved YXRXGB22 train transactions to processed_data/YXRXGB22_Bank_3/train.csv
Saved ZNZZAU3M train transactions to processed_data/ZNZZAU3M_Bank_8/train.csv
Saved HCBHSGSG test transactions to processed_data/HCBHSGSG_Bank_9/test.csv
Saved XITXUS33 test transactions to processed_data/XITXUS33_Bank_10/test.csv
Saved YSYCESMM test transactions to processed

Let's go back to the [XGBoost Notebook](../xgboost.ipynb)