# Import necessary libraries
This section imports all the required libraries for data processing, scaling, encoding, and visualization.
Libraries: pandas, numpy, matplotlib.pyplot, datetime, and the StandardScaler and OrdinalEncoder classes from scikit-learn (sklearn.preprocessing).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [2]:
# Load the raw training transactions, then discard the column pandas labelled
# "Unnamed: 0" (a header-less first column, presumably a previously saved index).
data = pd.read_csv("data/raw/fraudTrain.csv")
data = data.drop(columns="Unnamed: 0")

# Define variables
This section defines the column groups used throughout the analysis: the categorical features, the continuous features, the target, and the continuous features that process_data derives (post_cont_vars).
Variables: cat_vars, cont_vars, target, post_cont_vars.

In [3]:
# Categorical feature columns (ordinal-encoded later on).
cat_vars = [
    "merchant",
    "category",
    "gender",
    "city",
    "state",
    "job",
]

# Continuous feature columns that already exist in the raw data.
cont_vars = [
    "amt",
    "lat",
    "long",
    "city_pop",
    "merch_lat",
    "merch_long",
]

# Label column, kept as a one-element list so it can be concatenated with the
# feature lists when selecting columns.
target = ["is_fraud"]

# Continuous feature columns that are created by process_data.
post_cont_vars = [
    "mean_monthly_amt",
    "amount_of_monthly_trans",
    "age",
    "mean_time_between_transactions_seconds",
]

# Function to process the data
This function processes the raw data by converting date columns, extracting time-related features, calculating new features (mean monthly amount, amount of monthly transactions, age, and mean time between transactions), and merging them into the original dataset.
Columns added: transaction_month, transaction_day, transaction_hour, full_name, mean_monthly_amt, amount_of_monthly_trans, Current Year, age, datetime, time_diff, time_diff_seconds, mean_time_between_transactions_seconds.
    

In [4]:
def process_data(data, reference_year=None):
    """Derive per-transaction and per-user features from raw transaction rows.

    The input frame is copied before anything is changed, so the caller's
    DataFrame is left untouched.

    Args:
        data: DataFrame containing at least the columns
            ``trans_date_trans_time``, ``dob``, ``first``, ``last``, ``amt``
            and ``unix_time``.
        reference_year: Year that ``age`` is measured against. When ``None``
            (the default) the current calendar year is used, which makes the
            output depend on when the code is run; pass a fixed year for
            reproducible results.

    Returns:
        A new DataFrame holding the input columns (with
        ``trans_date_trans_time`` and ``dob`` parsed as datetimes) plus
        ``transaction_month``, ``transaction_day``, ``transaction_hour``,
        ``full_name``, ``mean_monthly_amt``, ``amount_of_monthly_trans``,
        ``Current Year``, ``age``, ``datetime``, ``time_diff``,
        ``time_diff_seconds`` and ``mean_time_between_transactions_seconds``.
        Rows are sorted by ``full_name`` and ``datetime`` before the final
        merge.
    """
    # Work on a copy: the column assignments below must not leak into the
    # DataFrame owned by the caller.
    data = data.copy()

    # Parse the timestamp columns.
    data["trans_date_trans_time"] = pd.to_datetime(data["trans_date_trans_time"])
    data["dob"] = pd.to_datetime(data["dob"])

    # Calendar parts of the transaction timestamp.
    trans_time = data["trans_date_trans_time"].dt
    data["transaction_month"] = trans_time.month
    data["transaction_day"] = trans_time.day
    data["transaction_hour"] = trans_time.hour

    # Users are identified by "first last".
    # NOTE(review): different people sharing a name are pooled into one user;
    # confirm whether a stronger identifier is available in the data.
    data["full_name"] = data["first"] + " " + data["last"]

    # Per user and month: total amount and number of transactions.
    # NOTE(review): transaction_month is the month number only, so the same
    # month of different years lands in the same bucket.
    monthly = (
        data.groupby(["full_name", "transaction_month"])["amt"]
        .agg(month_total="sum", month_count="count")
        .reset_index()
    )

    # Per user: mean of the monthly totals, and the sum of the monthly counts.
    # NOTE(review): summing the monthly counts gives the user's total number of
    # transactions, not a per-month figure, despite the column name.
    per_user = (
        monthly.groupby("full_name")
        .agg(
            mean_monthly_amt=("month_total", "mean"),
            amount_of_monthly_trans=("month_count", "sum"),
        )
        .reset_index()
    )
    data = data.merge(per_user, how="inner", on="full_name")

    # Age in whole calendar years relative to the reference year.
    if reference_year is None:
        reference_year = datetime.datetime.now().year
    data["Current Year"] = reference_year
    data["age"] = data["Current Year"] - data["dob"].dt.year

    # Gap between consecutive transactions of the same user; the first
    # transaction of each user has no predecessor and therefore gets NaT/NaN.
    data["datetime"] = pd.to_datetime(data["unix_time"], unit="s")
    data.sort_values(by=["full_name", "datetime"], inplace=True)
    data["time_diff"] = data.groupby("full_name")["datetime"].diff()
    data["time_diff_seconds"] = data["time_diff"].dt.total_seconds()

    # Mean gap per user (NaN for users with a single transaction).
    mean_gap = data.groupby("full_name")["time_diff_seconds"].mean().reset_index()
    mean_gap.columns = ["full_name", "mean_time_between_transactions_seconds"]

    return data.merge(mean_gap, on="full_name", how="inner")

In [5]:
 # Function to scale continuous variables
def scale_continuous_variables(df, cont_vars):
    """Fit a StandardScaler on the given columns and scale them in place.

    Args:
        df: DataFrame holding the columns to scale; it is modified in place.
        cont_vars: Labels of the columns to fit the scaler on and transform.

    Returns:
        Tuple ``(df, scaler)``: the same DataFrame object with the selected
        columns replaced by their scaled values, and the fitted scaler so it
        can be reused on other data.
    """
    fitted_scaler = StandardScaler()
    scaled_values = fitted_scaler.fit_transform(df[cont_vars])
    df[cont_vars] = scaled_values
    return df, fitted_scaler

In [6]:
# Function to encode ordinal variables
def encode_ordinal_variables(df, ord_vars):
    """Ordinal-encode each listed column in place with its own encoder.

    Args:
        df: DataFrame holding the columns to encode; it is modified in place.
        ord_vars: Labels of the columns to encode, processed one at a time.

    Returns:
        Tuple ``(df, encoders)``: the same DataFrame object with the columns
        replaced by their encoded values, and a dict mapping each column label
        to the encoder fitted on it.
    """
    fitted_by_column = {}
    for column in ord_vars:
        # The encoder is configured to map categories unseen during fitting
        # to -1 when it is later used to transform other data.
        column_encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )
        encoded_values = column_encoder.fit_transform(df[[column]])
        df[[column]] = encoded_values
        fitted_by_column[column] = column_encoder
    return df, fitted_by_column

# Process the training data
This section processes the training data using the previously defined functions and selects the relevant columns.
Processing includes applying process_data, scale_continuous_variables, and encode_ordinal_variables.

In [7]:
# Build the engineered features for the training set.
data = process_data(data)

# Keep only the model columns. The explicit copy makes the selection an
# independent frame, so the in-place column assignments done by the helpers
# below do not raise pandas' SettingWithCopyWarning.
data = data[cat_vars + cont_vars + post_cont_vars + target].copy()

# Fit the scaler on the training data and scale the continuous columns.
data, scaler = scale_continuous_variables(data, cont_vars + post_cont_vars)

# Fit one encoder per categorical column and encode the training data.
data, encoders = encode_ordinal_variables(data, cat_vars)

# Load and process the test data
This section loads the test data, processes it using the same steps as the training data, and selects the relevant columns.
Test data is read from data/raw/fraudTest.csv and processed similarly to the training data.
The scaler and encoders fitted on the training data are then applied to the test data; they are not refit.

In [8]:
# Load the raw test transactions and build the same engineered features.
test_data = pd.read_csv("data/raw/fraudTest.csv").drop(columns="Unnamed: 0")
test_data = process_data(test_data)

# Keep only the model columns. The explicit copy makes the selection an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
test_data = test_data[cat_vars + cont_vars + post_cont_vars + target].copy()

# Reuse the scaler and encoders fitted on the training data (transform only,
# no refitting).
test_data[cont_vars + post_cont_vars] = scaler.transform(test_data[cont_vars + post_cont_vars])
for col in cat_vars:
    test_data[[col]] = encoders[col].transform(test_data[[col]])

In [9]:
# Write the processed training and test frames to disk. The DataFrame index is
# written as well (pandas' default), so it appears as an extra first column.
data.to_csv(path_or_buf="data/processed/train.csv")
test_data.to_csv(path_or_buf="data/processed/test.csv")