In [1]:
# importing the libraries
import ast
import os

import lightgbm as lgb
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Loading the dataset

In [2]:
# Folder holding the raw CSV inputs, relative to the notebook's working directory.
FOLDER_PATH = "../data/"

# os.path.join copes with the trailing slash in FOLDER_PATH, so each path resolves
# to "../data/<file>.csv" instead of the accidental "../data//<file>.csv" that
# f"{FOLDER_PATH}/<file>.csv" produced.
train_data = pd.read_csv(os.path.join(FOLDER_PATH, "train.csv"))        # transactions incl. TX_FRAUD label
customer_data = pd.read_csv(os.path.join(FOLDER_PATH, "customer.csv"))  # per-customer attributes
terminal_data = pd.read_csv(os.path.join(FOLDER_PATH, "terminal.csv"))  # per-terminal coordinates

# EDA for customer_data

In [3]:
# Column dtypes of the customer table.
print(customer_data.dtypes)
# Python type of the available_terminals value stored at row label 0
# (the column is reported as `object`, so inspect what it actually contains).
print("datatype of available_terminals", type(customer_data.loc[0, 'available_terminals']))
# Number of missing values in each column.
print(customer_data.isna().sum())
# Preview the first rows.
customer_data.head()

CUSTOMER_ID              int64
x_customer_id          float64
y_customer_id          float64
mean_amount            float64
std_amount             float64
mean_nb_tx_per_day     float64
available_terminals     object
nb_terminals             int64
dtype: object
datatype of available_terminals <class 'str'>
CUSTOMER_ID            0
x_customer_id          0
y_customer_id          0
mean_amount            0
std_amount             0
mean_nb_tx_per_day     0
available_terminals    0
nb_terminals           0
dtype: int64


Unnamed: 0,CUSTOMER_ID,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals
0,0,54.88135,71.518937,62.262521,31.13126,2.179533,"[29, 87, 144, 241, 330, 858, 996, 1028, 1067, ...",22
1,1,42.36548,64.589411,46.570785,23.285393,3.567092,"[5, 160, 242, 378, 431, 475, 571, 762, 876, 93...",20
2,2,96.366276,38.344152,80.213879,40.106939,2.11558,"[316, 406, 447, 523, 968, 1200, 1318, 1365, 16...",10
3,3,56.804456,92.559664,11.748426,5.874213,0.348517,"[65, 94, 113, 364, 401, 433, 485, 651, 672, 77...",17
4,4,2.02184,83.261985,78.924891,39.462446,3.480049,"[372, 614, 774, 1362, 1446, 1564, 1637, 1939]",8


# EDA for terminal_data

In [4]:

# Print the column dtypes, then the number of missing values in each column.
print(terminal_data.dtypes, terminal_data.isna().sum(), sep="\n")
# Preview the first rows of the terminal table.
terminal_data.head()

TERMINAL_ID        int64
x_terminal_id    float64
y_terminal_id    float64
dtype: object
TERMINAL_ID      0
x_terminal_id    0
y_terminal_id    0
dtype: int64


Unnamed: 0,TERMINAL_ID,x_terminal_id,y_terminal_id
0,0,41.7022,72.032449
1,1,0.011437,30.233257
2,2,14.675589,9.233859
3,3,18.626021,34.556073
4,4,39.676747,53.881673


# EDA for train_data

In [5]:
# Print the column dtypes, then the number of missing values in each column.
print(train_data.dtypes, train_data.isna().sum(), sep="\n")
# Preview the first rows of the transaction table.
train_data.head()

TRANSACTION_ID      int64
TX_DATETIME        object
CUSTOMER_ID         int64
TERMINAL_ID         int64
TX_AMOUNT         float64
TX_FRAUD            int64
dtype: object
TRANSACTION_ID    0
TX_DATETIME       0
CUSTOMER_ID       0
TERMINAL_ID       0
TX_AMOUNT         0
TX_FRAUD          0
dtype: int64


Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,59383,2021-08-01 00:04:37,323,217,4.6,0
1,59384,2021-08-01 00:12:10,6,429,8.61,0
2,59385,2021-08-01 00:12:34,714,1011,64.0,0
3,59386,2021-08-01 00:15:40,266,1969,12.72,0
4,59387,2021-08-01 00:16:01,890,1482,98.88,0


# Merge the datasets

In [6]:
# Attach customer and terminal attributes to every transaction with left joins.
# validate="many_to_one" makes pandas raise a MergeError if CUSTOMER_ID or
# TERMINAL_ID is duplicated in its lookup table; without it such duplicates
# would silently multiply transaction rows (and their TX_FRAUD labels).
# NOTE: this cell reassigns train_data, so re-running it without first
# re-running the loading cell would merge the lookup columns a second time.
train_data = (
    train_data
    .merge(customer_data, how="left", on="CUSTOMER_ID", validate="many_to_one")
    .merge(terminal_data, how="left", on="TERMINAL_ID", validate="many_to_one")
)
# Persist the merged table (written to the current working directory).
train_data.to_csv("train_dataset.csv", index=False)
train_data.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals,x_terminal_id,y_terminal_id
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,"[51, 68, 208, 217, 293, 353, 534, 717, 773, 86...",19,80.316334,40.239252
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,"[163, 172, 205, 429, 468, 607, 750, 786, 881, ...",16,15.172487,63.912685
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,"[58, 799, 1011, 1021, 1228, 1347, 1443, 1462, ...",13,74.196424,98.288079
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,"[27, 493, 584, 734, 820, 917, 1108, 1363, 1444...",15,51.950635,6.563484
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,"[154, 177, 351, 444, 633, 739, 1018, 1056, 113...",20,62.417972,26.483666
