In [319]:
# importing the libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import ast

In [320]:
# Setting dataset folder path
FOLDER_PATH = "../data/"

# loading the dataset
train_data = pd.read_csv(f"{FOLDER_PATH}/train.csv")
customer_data = pd.read_csv(f"{FOLDER_PATH}/customer.csv")
terminal_data = pd.read_csv(f"{FOLDER_PATH}/terminal.csv")

In [321]:
# EDA for train_data
print(customer_data.dtypes)
print("datatype of available_terminals ",type(customer_data['available_terminals'][0]))

# checking missing values in each columns
print(customer_data.isnull().sum())

#fixing available_terminals datatype
customer_data['available_terminals'] = customer_data['available_terminals'].apply(ast.literal_eval)
print("datatype of available_terminals after conversion",type(customer_data['available_terminals'][0]))

customer_data.head()


CUSTOMER_ID              int64
x_customer_id          float64
y_customer_id          float64
mean_amount            float64
std_amount             float64
mean_nb_tx_per_day     float64
available_terminals     object
nb_terminals             int64
dtype: object
datatype of available_terminals  <class 'str'>
CUSTOMER_ID            0
x_customer_id          0
y_customer_id          0
mean_amount            0
std_amount             0
mean_nb_tx_per_day     0
available_terminals    0
nb_terminals           0
dtype: int64
datatype of available_terminals after conversion <class 'list'>


Unnamed: 0,CUSTOMER_ID,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals
0,0,54.88135,71.518937,62.262521,31.13126,2.179533,"[29, 87, 144, 241, 330, 858, 996, 1028, 1067, ...",22
1,1,42.36548,64.589411,46.570785,23.285393,3.567092,"[5, 160, 242, 378, 431, 475, 571, 762, 876, 93...",20
2,2,96.366276,38.344152,80.213879,40.106939,2.11558,"[316, 406, 447, 523, 968, 1200, 1318, 1365, 16...",10
3,3,56.804456,92.559664,11.748426,5.874213,0.348517,"[65, 94, 113, 364, 401, 433, 485, 651, 672, 77...",17
4,4,2.02184,83.261985,78.924891,39.462446,3.480049,"[372, 614, 774, 1362, 1446, 1564, 1637, 1939]",8


In [322]:
# EDA for train_data
print(terminal_data.dtypes)

# checking missing values in each columns
print(terminal_data.isnull().sum())

terminal_data.head()

TERMINAL_ID        int64
x_terminal_id    float64
y_terminal_id    float64
dtype: object
TERMINAL_ID      0
x_terminal_id    0
y_terminal_id    0
dtype: int64


Unnamed: 0,TERMINAL_ID,x_terminal_id,y_terminal_id
0,0,41.7022,72.032449
1,1,0.011437,30.233257
2,2,14.675589,9.233859
3,3,18.626021,34.556073
4,4,39.676747,53.881673


In [323]:
# EDA for train_data
print(train_data.dtypes)

# checking missing values in each columns
print(train_data.isnull().sum())

# fixing datetime column datatype
train_data['TX_DATETIME'] = pd.to_datetime(train_data['TX_DATETIME'])

train_data.head()

TRANSACTION_ID      int64
TX_DATETIME        object
CUSTOMER_ID         int64
TERMINAL_ID         int64
TX_AMOUNT         float64
TX_FRAUD            int64
dtype: object
TRANSACTION_ID    0
TX_DATETIME       0
CUSTOMER_ID       0
TERMINAL_ID       0
TX_AMOUNT         0
TX_FRAUD          0
dtype: int64


Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,59383,2021-08-01 00:04:37,323,217,4.6,0
1,59384,2021-08-01 00:12:10,6,429,8.61,0
2,59385,2021-08-01 00:12:34,714,1011,64.0,0
3,59386,2021-08-01 00:15:40,266,1969,12.72,0
4,59387,2021-08-01 00:16:01,890,1482,98.88,0


# Merge dataset

In [324]:
train_data = train_data.merge(customer_data, how = "left", on = "CUSTOMER_ID").merge(terminal_data, how = "left", on = "TERMINAL_ID")
train_data.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals,x_terminal_id,y_terminal_id
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,"[51, 68, 208, 217, 293, 353, 534, 717, 773, 86...",19,80.316334,40.239252
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,"[163, 172, 205, 429, 468, 607, 750, 786, 881, ...",16,15.172487,63.912685
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,"[58, 799, 1011, 1021, 1228, 1347, 1443, 1462, ...",13,74.196424,98.288079
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,"[27, 493, 584, 734, 820, 917, 1108, 1363, 1444...",15,51.950635,6.563484
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,"[154, 177, 351, 444, 633, 739, 1018, 1056, 113...",20,62.417972,26.483666


Distance_km feature

In [325]:
import math
# Define the Haversine function to calculate distance between two coordinates
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    distance = R * c

    return distance

# Apply the Haversine function row-wise
train_data['Distance_km'] = train_data.apply(lambda row: haversine(row['x_customer_id'], row['y_customer_id'], row['x_terminal_id'], row['y_terminal_id']), axis=1).round()

isAllowedTerminal feature

In [326]:
train_data['isAllowedTerminal'] = train_data.apply(lambda row: int(row['TERMINAL_ID'] in row['available_terminals']), axis=1)

Datetime feature

In [327]:
train_data['year'] = train_data['TX_DATETIME'].dt.year
train_data['month'] = train_data['TX_DATETIME'].dt.month
train_data['day'] = train_data['TX_DATETIME'].dt.day

# Feature selection

In [328]:
features = ['TRANSACTION_ID', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'std_amount', 'mean_nb_tx_per_day', 'x_customer_id', 'y_customer_id', 'x_terminal_id', 'Distance_km', 'isAllowedTerminal', 'year', 'month', 'day']
x_train_data = train_data[features]
y_train_data = train_data['TX_FRAUD']
train_data.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals,x_terminal_id,y_terminal_id,Distance_km,isAllowedTerminal,year,month,day
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,"[51, 68, 208, 217, 293, 353, 534, 717, 773, 86...",19,80.316334,40.239252,468.0,1,2021,8,1
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,"[163, 172, 205, 429, 468, 607, 750, 786, 881, ...",16,15.172487,63.912685,372.0,1,2021,8,1
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,"[58, 799, 1011, 1021, 1228, 1347, 1443, 1462, ...",13,74.196424,98.288079,149.0,1,2021,8,1
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,"[27, 493, 584, 734, 820, 917, 1108, 1363, 1444...",15,51.950635,6.563484,153.0,1,2021,8,1
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,"[154, 177, 351, 444, 633, 739, 1018, 1056, 113...",20,62.417972,26.483666,255.0,1,2021,8,1


# spliting train dataset

In [329]:
x_train, x_val, y_train, y_val = train_test_split(x_train_data, y_train_data, train_size=0.2, random_state=42)

# Scale features (important for some models)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# model selection

In [330]:
# LogisticRegression model
logiReg_model = LogisticRegression()

# Initialize the LightGBM classifier
lgb_model = lgb.LGBMClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight="balanced")

# List of models

Models = [logiReg_model, lgb_model, xgb_model, rf_model]


# training the model

In [331]:
for i in Models:
    i.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 1368, number of negative: 56878
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 58246, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.023487 -> initscore=-3.727559
[LightGBM] [Info] Start training from score -3.727559


# Make predictions

In [332]:
for i in Models:
    y_pred = i.predict(x_val)
    print(i)
    print(f"Accuracy:- {round(accuracy_score(y_val, y_pred)*100,2)}%")
    print(f"Precision:- {round(precision_score(y_val, y_pred, average="macro", zero_division=0)*100,2)}%")
    print(f"Recall:- {round(recall_score(y_val, y_pred, average="macro", zero_division=0)*100,2)}%")
    print("\n")

LogisticRegression()
Accuracy:- 97.76%
Precision:- 48.88%
Recall:- 50.0%


LGBMClassifier(max_depth=5, random_state=42)
Accuracy:- 97.76%
Precision:- 73.35%
Recall:- 50.42%


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
Accuracy:- 97.77%
Precision:- 81.95%
Recall:- 50.39%


RandomForestClassif