### Imports

In [57]:
# Import necessary libraries
import warnings
import math
import ast
import json
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from imblearn.combine import SMOTEENN

### Ignoring warnings

In [58]:
# Suppress all warnings
warnings.filterwarnings("ignore")

### Load the dataset

In [59]:
# Setting dataset folder path
FOLDER_PATH = "../data/"

train_data = pd.read_csv(f"{FOLDER_PATH}/train.csv")
customer_data = pd.read_csv(f"{FOLDER_PATH}/customer.csv")
terminal_data = pd.read_csv(f"{FOLDER_PATH}/terminal.csv")

train  = train_data.merge(customer_data, how = "left", on = "CUSTOMER_ID").merge(terminal_data, how = "left", on = "TERMINAL_ID")
train.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals,x_terminal_id,y_terminal_id
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,"[51, 68, 208, 217, 293, 353, 534, 717, 773, 86...",19,80.316334,40.239252
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,"[163, 172, 205, 429, 468, 607, 750, 786, 881, ...",16,15.172487,63.912685
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,"[58, 799, 1011, 1021, 1228, 1347, 1443, 1462, ...",13,74.196424,98.288079
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,"[27, 493, 584, 734, 820, 917, 1108, 1363, 1444...",15,51.950635,6.563484
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,"[154, 177, 351, 444, 633, 739, 1018, 1056, 113...",20,62.417972,26.483666


### Feature Engineering

#### 1. Extracting time-based features

In [60]:

# fixing datetime column datatype
train['TX_DATETIME'] = pd.to_datetime(train['TX_DATETIME'])

train['TX_HOUR'] = train['TX_DATETIME'].dt.hour
train['TX_DAY'] = train['TX_DATETIME'].dt.day
train['TX_MONTH'] = train['TX_DATETIME'].dt.month
train['TX_WEEKDAY'] = train['TX_DATETIME'].dt.weekday

#### 2. Customer behaviour features

In [61]:
# Customer Fraud History (Has customer been involved in fraud before?)
customer_fraud_history = train.groupby('CUSTOMER_ID')['TX_FRAUD'].max()
train = train.merge(customer_fraud_history.rename('CUSTOMER_FRAUD_HISTORY'), on='CUSTOMER_ID', how='left')


#### 3. Terminal based features

In [62]:
# Feature Engineering: Creating new feature for transaction frequency per terminal
train['tx_per_terminal'] = train.groupby('TERMINAL_ID')['TRANSACTION_ID'].transform('count')

# Terminal Fraud Rate (Proportion of fraudulent transactions at each terminal)
terminal_fraud_rate = train.groupby('TERMINAL_ID')['TX_FRAUD'].mean()
train = train.merge(terminal_fraud_rate.rename('TERMINAL_FRAUD_RATE'), on='TERMINAL_ID', how='left')

# isAllowedTerminal feature

#fixing available_terminals datatype
train['available_terminals'] = train['available_terminals'].apply(ast.literal_eval)

train['isAllowedTerminal'] = train.apply(lambda row: int(row['TERMINAL_ID'] in row['available_terminals']), axis=1)

#### 4. Transaction based features

In [63]:

# Transaction Amount Compared to Terminal's Typical Amount (Relative to Terminal's Average)
terminal_avg_tx = train.groupby('TERMINAL_ID')['TX_AMOUNT'].mean()
train = train.merge(terminal_avg_tx.rename('TERMINAL_AVG_TX_AMOUNT'), on='TERMINAL_ID', how='left')
train['TX_AMOUNT_REL_TO_TERMINAL_AVG'] = train['TX_AMOUNT'] / train['TERMINAL_AVG_TX_AMOUNT']

#### 5. Geographical features

In [64]:
# Define the Haversine function to calculate distance between two coordinates
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    distance = R * c

    return distance

# Apply the Haversine function row-wise
train['Distance_km'] = train.apply(lambda row: haversine(row['x_customer_id'], row['y_customer_id'], row['x_terminal_id'], row['y_terminal_id']), axis=1).round()

In [65]:
train.columns

Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_FRAUD', 'x_customer_id', 'y_customer_id',
       'mean_amount', 'std_amount', 'mean_nb_tx_per_day',
       'available_terminals', 'nb_terminals', 'x_terminal_id', 'y_terminal_id',
       'TX_HOUR', 'TX_DAY', 'TX_MONTH', 'TX_WEEKDAY', 'CUSTOMER_FRAUD_HISTORY',
       'tx_per_terminal', 'TERMINAL_FRAUD_RATE', 'isAllowedTerminal',
       'TERMINAL_AVG_TX_AMOUNT', 'TX_AMOUNT_REL_TO_TERMINAL_AVG',
       'Distance_km'],
      dtype='object')

# Feature selection

In [66]:
features = ['TRANSACTION_ID', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT','mean_amount', 'std_amount', 'mean_nb_tx_per_day',
       'TX_HOUR', 'TX_DAY', 'TX_MONTH', 'TX_WEEKDAY', 'CUSTOMER_FRAUD_HISTORY', 'tx_per_terminal', 'TERMINAL_FRAUD_RATE',
       'isAllowedTerminal', 'TERMINAL_AVG_TX_AMOUNT',
       'TX_AMOUNT_REL_TO_TERMINAL_AVG', 'Distance_km']

X = train[features]
y = train['TX_FRAUD']

# Spliting the dataset

In [67]:
# Split the dataset into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTEENN (Combination of SMOTE and ENN) to balance the dataset
# SMOTE oversamples the minority class, and ENN (Edited Nearest Neighbors) undersamples the majority class
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

# Finding best estimaters for XGBClassifier model using RandomizedSearchCV

In [68]:
file_path = '../hyperparameters/xgb_estimaters.json'

# If hyperparameters already generated, then we won't generate it again
if not os.path.exists(file_path):
    print("file not exist ", file_path)
    print("Generating file...")

    param_dist = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'gamma': [0, 0.1, 0.3],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [0.8, 1.0, 1.2]
        }
    
    # Set up RandomizedSearchCV with 50 iterations
    Model = RandomizedSearchCV(estimator= XGBClassifier(random_state = 42), param_distributions= param_dist, 
                                   n_iter=50, cv=5, n_jobs=-1, scoring='precision', random_state=42)
    
    # Fit the model using RandomizedSearchCV
    Model.fit(X_resampled, y_resampled)
    #Model.best_estimator_.random_state = 42

    # Print the best parameters and the best score
    print("Best parameters found: ", Model.best_estimator_)
    print("Best cross-validation score: ", Model.best_score_)
    print(f"Saving estimaters at {file_path}")
    
    estimaters = Model.best_estimator_.get_params()
    # Writing dictionary to a JSON file
    with open(file_path, 'w') as json_file:
        json.dump(estimaters, json_file, indent=4)  # 'indent' makes the JSON file readable

else:
    print("file exist")
    # Reading the JSON file as a dictionary
    with open(file_path, 'r') as json_file:
        estimaters = json.load(json_file)
    print("Best parameters found: ", estimaters)
    


file exist
Best parameters found:  {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.2, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 10, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': 0.01, 'reg_lambda': 1.2, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.7, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


# Training the Model

In [69]:
# Initialize the model
xgb_model = XGBClassifier(**estimaters) 
    
# Fit the model
xgb_model.fit(X_resampled, y_resampled)

# Make predictions

In [70]:

y_pred = xgb_model.predict(X_test_scaled)

# Display the evaluation results
print(f"Accuracy:- {round(accuracy_score(y_test, y_pred)*100,2)}%")
print(f"Precision:- {round(precision_score(y_test, y_pred, average='macro', zero_division=0)*100,2)}%")
print(f"Recall:- {round(recall_score(y_test, y_pred, average='macro', zero_division=0)*100,2)}%")

Accuracy:- 98.16%
Precision:- 77.55%
Recall:- 87.74%
