In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score
import warnings
import math
import ast
import json
import os

In [2]:
# Suppress all warnings
warnings.filterwarnings("ignore")

# Load the dataset

In [3]:
# Setting dataset folder path
FOLDER_PATH = "../data/"

train_data = pd.read_csv(f"{FOLDER_PATH}/train.csv")
customer_data = pd.read_csv(f"{FOLDER_PATH}/customer.csv")
terminal_data = pd.read_csv(f"{FOLDER_PATH}/terminal.csv")

train  = train_data.merge(customer_data, how = "left", on = "CUSTOMER_ID").merge(terminal_data, how = "left", on = "TERMINAL_ID")
train.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals,x_terminal_id,y_terminal_id
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,"[51, 68, 208, 217, 293, 353, 534, 717, 773, 86...",19,80.316334,40.239252
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,"[163, 172, 205, 429, 468, 607, 750, 786, 881, ...",16,15.172487,63.912685
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,"[58, 799, 1011, 1021, 1228, 1347, 1443, 1462, ...",13,74.196424,98.288079
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,"[27, 493, 584, 734, 820, 917, 1108, 1363, 1444...",15,51.950635,6.563484
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,"[154, 177, 351, 444, 633, 739, 1018, 1056, 113...",20,62.417972,26.483666


# Features creation

In [4]:
# Extracting time-based features: hour, day, month

# fixing datetime column datatype
train['TX_DATETIME'] = pd.to_datetime(train['TX_DATETIME'])
train['TX_HOUR'] = train['TX_DATETIME'].dt.hour
train['TX_DAY'] = train['TX_DATETIME'].dt.day
train['TX_MONTH'] = train['TX_DATETIME'].dt.month
train['TX_WEEKDAY'] = train['TX_DATETIME'].dt.weekday

In [5]:
# Feature Engineering: Creating new feature for transaction frequency per terminal
train['tx_per_terminal'] = train.groupby('TERMINAL_ID')['TRANSACTION_ID'].transform('count')

In [6]:
# Define the Haversine function to calculate distance between two coordinates
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    distance = R * c

    return distance

# Apply the Haversine function row-wise
train['Distance_km'] = train.apply(lambda row: haversine(row['x_customer_id'], row['y_customer_id'], row['x_terminal_id'], row['y_terminal_id']), axis=1).round()

In [7]:
# isAllowedTerminal feature

#fixing available_terminals datatype
train['available_terminals'] = train['available_terminals'].apply(ast.literal_eval)

train['isAllowedTerminal'] = train.apply(lambda row: int(row['TERMINAL_ID'] in row['available_terminals']), axis=1)

In [8]:
train.columns

Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_FRAUD', 'x_customer_id', 'y_customer_id',
       'mean_amount', 'std_amount', 'mean_nb_tx_per_day',
       'available_terminals', 'nb_terminals', 'x_terminal_id', 'y_terminal_id',
       'TX_HOUR', 'TX_DAY', 'TX_MONTH', 'TX_WEEKDAY', 'tx_per_terminal',
       'Distance_km', 'isAllowedTerminal'],
      dtype='object')

# Feature selection

In [9]:
features = ['TRANSACTION_ID', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'x_customer_id', 'y_customer_id',
       'mean_amount', 'std_amount', 'mean_nb_tx_per_day', 
       'x_terminal_id', 'y_terminal_id', 'TX_HOUR', 
       'TX_DAY', 'TX_MONTH', 'TX_WEEKDAY', 'tx_per_terminal',
       'Distance_km', 'isAllowedTerminal']
X = train[features]
y = train['TX_FRAUD']

# Spliting the dataset

In [10]:
# Split the dataset into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select Model (RandomForestClassifier , LGBMClassifier, XGBClassifier)

In [12]:
selected_model = input("Select model rf, lgb or xgb")
if selected_model not in Models_list:
    raise Exception("Please choose model from the list")

# Finding best estimaters for model

In [13]:
Models_list = {"xgb" : {'file_name': 'xgb_estimaters.json',
                        'Model': XGBClassifier(),
                        'param_dist' : {
                                'n_estimators': [50, 100, 200],
                                'max_depth': [3, 5, 10],
                                'learning_rate': [0.01, 0.1, 0.2],
                                'subsample': [0.7, 0.8, 1.0],
                                'colsample_bytree': [0.7, 0.8, 1.0],
                                'gamma': [0, 0.1, 0.3],
                                'reg_alpha': [0, 0.01, 0.1],
                                'reg_lambda': [0.8, 1.0, 1.2]
                        }
                },
                "lgb" : {'file_name': 'lgb_estimaters.json',
                        'Model': LGBMClassifier(),
                        'param_dist' : {
                                'n_estimators': [100, 300, 500, 1000],          # Number of boosting iterations (trees)
                                'learning_rate': [0.01, 0.05, 0.1, 0.2],        # Step size shrinkage
                                'num_leaves': [20, 31, 40, 50],                 # Max number of leaves per tree
                                'max_depth': [-1, 10, 20, 30],                  # Maximum tree depth (-1 means no limit)
                                'min_child_samples': [10, 20, 30, 50],          # Minimum number of data in one leaf
                                'min_child_weight': [1e-3, 1e-2, 1e-1, 1],      # Minimum sum of instance weight (hessian) in a leaf
                                'subsample': [0.6, 0.7, 0.8, 1.0],              # Fraction of data to be randomly sampled for each tree
                                'colsample_bytree': [0.6, 0.7, 0.8, 1.0],       # Fraction of features to be randomly sampled for each tree
                                'reg_alpha': [0, 0.1, 0.5, 1.0],                # L1 regularization term
                                'reg_lambda': [0, 0.1, 0.5, 1.0],               # L2 regularization term
                                'boosting_type': ['gbdt', 'dart'],              # Boosting type, GBDT (Gradient Boosting) or DART
                                'objective': ['binary'],                        # Objective function (binary classification)
                                'metric': ['binary_logloss', 'auc']             # Evaluation metric
                        }
                },
                "rf" : {'file_name': 'rf_estimaters.json',
                        'Model': RandomForestClassifier(),
                        'param_dist' : {
                                'n_estimators': [100, 200, 500, 1000],  # Number of trees in the forest
                                'max_depth': [10, 20, 30, None],  # Maximum depth of the trees
                                'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
                                'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
                                'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
                                'bootstrap': [True, False],  # Whether to use bootstrap samples when building trees
                                'criterion': ['gini', 'entropy'],  # The function to measure the quality of a split
                        }
                }

        }

In [23]:
if not os.path.exists(Models_list[selected_model]['file_name']):
    print("file not exist")

    # Set up RandomizedSearchCV with 50 iterations
    Model = RandomizedSearchCV(estimator= Models_list[selected_model]['Model'], param_distributions= Models_list[selected_model]['param_dist'], 
                                   n_iter=50, cv=5, n_jobs=-1, scoring='precision', random_state=42)
    
    # Fit the model using RandomizedSearchCV
    Model.fit(X_train_scaled, y_train)
    Model.best_estimator_.random_state = 42
    
    estimaters = Model.best_estimator_.get_params()

    # Print the best parameters and the best score
    print("Best parameters found: ", Model.best_estimator_)
    print("Best cross-validation score: ", Model.best_score_)
    print(f"Saving estimaters at {Models_list[selected_model]['file_name']}")
    
    # Writing dictionary to a JSON file
    with open(Models_list[selected_model]['file_name'], 'w') as json_file:
        json.dump(estimaters, json_file, indent=4)  # 'indent' makes the JSON file readable

else:
    print("file exist")
    # Reading the JSON file as a dictionary
    with open(Models_list[selected_model]['file_name'], 'r') as json_file:
        estimaters = json.load(json_file)


file not exist


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Best parameters found:  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
Best cross-validation score:  1.0
Saving estimaters at xgb_estimaters.json


# Training the Model

In [24]:
# Initialize the model
if selected_model == 'xgb':
    Model = XGBClassifier(**estimaters) 
elif selected_model == 'rf':
    Model = RandomForestClassifier(**estimaters)
elif selected_model == 'lgb':
    Model = LGBMClassifier(**estimaters)
    
# Fit the model
Model.fit(X_train_scaled, y_train)

# Make predictions

In [26]:

y_pred = Model.predict(X_test_scaled)

# Display the evaluation results
print(f"Accuracy:- {round(accuracy_score(y_test, y_pred)*100,2)}%")
print(f"Precision:- {round(precision_score(y_test, y_pred, average="macro", zero_division=0)*100,2)}%")
print(f"Recall:- {round(recall_score(y_test, y_pred, average="macro", zero_division=0)*100,2)}%")

Accuracy:- 97.85%
Precision:- 98.92%
Recall:- 50.71%
