In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score, recall_score
from xgboost import XGBClassifier
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.impute import SimpleImputer

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')   

In [2]:
# Load training data (transactions)
train_data = pd.read_csv('/Users/sakshiii/Desktop/credit_card_fraud_detection/data/train.csv')       

# Load customer data
customer_data = pd.read_csv('/Users/sakshiii/Desktop/credit_card_fraud_detection/data/customer.csv') 

# Load terminal (merchant) data
terminal_data = pd.read_csv('/Users/sakshiii/Desktop/credit_card_fraud_detection/data/terminal.csv') 

In [3]:
# Merge the data
train_df = pd.merge(train_data, customer_data, on='CUSTOMER_ID', how='left')  
train_df = pd.merge(train_data, terminal_data, on='TERMINAL_ID', how='left')  

In [4]:
# Convert TX_DATETIME to datetime format
train_df['TX_DATETIME'] = pd.to_datetime(train_df['TX_DATETIME'])

# Extracting features
train_df["hour"] = train_df['TX_DATETIME'].dt.hour
train_df['day_of_week'] = train_df['TX_DATETIME'].dt.day_of_week
train_df['month'] = train_df['TX_DATETIME'].dt.month

# Transaction Recency (Time since last transaction per customer)
train_df['CUSTOMER_RECENCY'] = train_df.groupby('CUSTOMER_ID')['TX_DATETIME'].diff().dt.total_seconds()
train_df['TERMINAL_RECENCY'] = train_df.groupby('TERMINAL_ID')['TX_DATETIME'].diff().dt.total_seconds()

In [5]:
# Customer Transaction Frequency (transactions per day)
train_df['TX_DATE'] = train_df['TX_DATETIME'].dt.date
customer_tx_count = train_df.groupby(['CUSTOMER_ID', 'TX_DATE']).size().groupby('CUSTOMER_ID').mean()
train_df = train_df.merge(customer_tx_count.rename('CUSTOMER_TX_FREQUENCY'), on='CUSTOMER_ID', how='left')

# Customer Fraud History (Has customer been involved in fraud before?)
customer_fraud_history = train_df.groupby('CUSTOMER_ID')['TX_FRAUD'].max()
train_df = train_df.merge(customer_fraud_history.rename('CUSTOMER_FRAUD_HISTORY'), on='CUSTOMER_ID', how='left')

# Average Transaction Amount for the Customer
customer_avg_tx = train_df.groupby('CUSTOMER_ID')['TX_AMOUNT'].mean()
train_df = train_df.merge(customer_avg_tx.rename('CUSTOMER_AVG_TX_AMOUNT'), on='CUSTOMER_ID', how='left')

In [6]:
# Terminal Usage (Total transactions at each terminal)
terminal_tx_count = train_df.groupby('TERMINAL_ID').size()
train_df = train_df.merge(terminal_tx_count.rename('TERMINAL_TX_COUNT'), on='TERMINAL_ID', how='left')

# Terminal Fraud Rate (Proportion of fraudulent transactions at each terminal)
terminal_fraud_rate = train_df.groupby('TERMINAL_ID')['TX_FRAUD'].mean()
train_df = train_df.merge(terminal_fraud_rate.rename('TERMINAL_FRAUD_RATE'), on='TERMINAL_ID', how='left')


In [7]:
# Transaction Amount Relative to Customer's Average
train_df['TX_AMOUNT_REL_TO_AVG'] = train_df['TX_AMOUNT'] / train_df['CUSTOMER_AVG_TX_AMOUNT']

# Transaction Amount Compared to Terminal's Typical Amount (Relative to Terminal's Average)
terminal_avg_tx = train_df.groupby('TERMINAL_ID')['TX_AMOUNT'].mean()
train_df = train_df.merge(terminal_avg_tx.rename('TERMINAL_AVG_TX_AMOUNT'), on='TERMINAL_ID', how='left')
train_df['TX_AMOUNT_REL_TO_TERMINAL_AVG'] = train_df['TX_AMOUNT'] / train_df['TERMINAL_AVG_TX_AMOUNT']


In [8]:
train_df.columns

Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_FRAUD', 'x_terminal_id', 'y_terminal_id', 'hour',
       'day_of_week', 'month', 'CUSTOMER_RECENCY', 'TERMINAL_RECENCY',
       'TX_DATE', 'CUSTOMER_TX_FREQUENCY', 'CUSTOMER_FRAUD_HISTORY',
       'CUSTOMER_AVG_TX_AMOUNT', 'TERMINAL_TX_COUNT', 'TERMINAL_FRAUD_RATE',
       'TX_AMOUNT_REL_TO_AVG', 'TERMINAL_AVG_TX_AMOUNT',
       'TX_AMOUNT_REL_TO_TERMINAL_AVG'],
      dtype='object')

In [9]:
train_df['CUSTOMER_RECENCY'].fillna(train_df['CUSTOMER_RECENCY'].mean())

0         44327.559985
1         44327.559985
2         44327.559985
3         44327.559985
4         44327.559985
              ...     
291226    36420.000000
291227    22085.000000
291228    55768.000000
291229    37483.000000
291230    31763.000000
Name: CUSTOMER_RECENCY, Length: 291231, dtype: float64

In [10]:
features = ['CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT','hour', 'day_of_week', 'month',
        'CUSTOMER_TX_FREQUENCY', 'CUSTOMER_FRAUD_HISTORY',
       'CUSTOMER_AVG_TX_AMOUNT', 'TERMINAL_TX_COUNT', 'TERMINAL_FRAUD_RATE',
       'TX_AMOUNT_REL_TO_AVG', 'TERMINAL_AVG_TX_AMOUNT',
       'TX_AMOUNT_REL_TO_TERMINAL_AVG']

x = train_df[features]
y = train_df['TX_FRAUD']

In [11]:
x.isnull().sum()

CUSTOMER_ID                      0
TERMINAL_ID                      0
TX_AMOUNT                        0
hour                             0
day_of_week                      0
month                            0
CUSTOMER_TX_FREQUENCY            0
CUSTOMER_FRAUD_HISTORY           0
CUSTOMER_AVG_TX_AMOUNT           0
TERMINAL_TX_COUNT                0
TERMINAL_FRAUD_RATE              0
TX_AMOUNT_REL_TO_AVG             0
TERMINAL_AVG_TX_AMOUNT           0
TX_AMOUNT_REL_TO_TERMINAL_AVG    0
dtype: int64

In [13]:
# Split data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)


# Standardize the features (optional but recommended for XGBoost)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# # Apply SMOTEENN (Combination of SMOTE and ENN) to balance the dataset
# # SMOTE oversamples the minority class, and ENN (Edited Nearest Neighbors) undersamples the majority class
# smote_enn = SMOTEENN(random_state=42)
# X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

# 3. Training the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,  
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train_scaled, y_train)

# 4. Model Evaluation
# Predict on test data
y_pred = xgb_model.predict(X_test_scaled)



In [14]:
# Display the evaluation results
print(f"Accuracy : {round(accuracy_score(y_test, y_pred)*100,2)}%" )
print(f"Precision : {round(precision_score(y_test, y_pred, average='macro', zero_division=0)*100,2)}%")
print(f"Recall : {round(recall_score(y_test, y_pred, average='macro', zero_division=0)*100,2)}%")

Accuracy : 97.79%
Precision : 85.72%
Recall : 51.68%
