## Cleaning of the dataset

In [1]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Read the raw training transactions from disk and preview the first rows
data = pd.read_csv(filepath_or_buffer="../raw_data/fraudTrain.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [2]:
# Inspect the dtype pandas inferred for every raw column
data.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

### Missing data

In [3]:
# Fraction of missing values per column (rounded to 2 decimals),
# ordered from the most to the least affected column
missing_counts = data.isnull().sum().sort_values(ascending=False)
(missing_counts / len(data)).round(2)

Unnamed: 0               0.0
trans_date_trans_time    0.0
cc_num                   0.0
merchant                 0.0
category                 0.0
amt                      0.0
first                    0.0
last                     0.0
gender                   0.0
street                   0.0
city                     0.0
state                    0.0
zip                      0.0
lat                      0.0
long                     0.0
city_pop                 0.0
job                      0.0
dob                      0.0
trans_num                0.0
unix_time                0.0
merch_lat                0.0
merch_long               0.0
is_fraud                 0.0
dtype: float64

### Defining features and target

In [4]:
# Features: every column of the raw frame except the label
X = data.drop(columns="is_fraud")

# Target: the 'is_fraud' label column
y = data["is_fraud"]

### Duplicates

In [5]:
# Count feature rows that are exact repeats of an earlier row
duplicated_mask = X.duplicated(keep="first")
number_of_duplicated_rows = duplicated_mask.sum()
number_of_duplicated_rows

np.int64(0)

### Drop columns

Reasons for dropping these columns:<br>
- they are useless for the model<br>
- they are equivalent to other columns<br>
- their information is already contained in another column<br>

In [6]:
# Columns removed from the feature set (saved index, identifiers,
# personal/address details and the unix timestamp)
columns_to_drop = [
    'Unnamed: 0',
    'unix_time',
    'cc_num',
    'first',
    'last',
    'street',
    'city',
    'state',
    'zip',
    'city_pop',
    'trans_num',
]
X = X.drop(columns=columns_to_drop)
X.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,lat,long,job,dob,merch_lat,merch_long
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,"Psychologist, counselling",1988-03-09,36.011293,-82.048315
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,Special educational needs teacher,1978-06-21,49.159047,-118.186462
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,42.1808,-112.262,Nature conservation officer,1962-01-19,43.150704,-112.154481
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,46.2306,-112.1138,Patent attorney,1967-01-12,47.034331,-112.561071
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,38.4207,-79.4629,Dance movement psychotherapist,1986-03-28,38.674999,-78.632459


In [7]:
# Check the dtypes of the columns that remain after dropping
X.dtypes

trans_date_trans_time     object
merchant                  object
category                  object
amt                      float64
gender                    object
lat                      float64
long                     float64
job                       object
dob                       object
merch_lat                float64
merch_long               float64
dtype: object

### Convert 'trans_date_trans_time' and 'dob' (Date of Birth) to datetime

In [8]:
import datetime  # not used in this cell; kept in case other cells rely on it

# Parse the two date-like string columns into datetime64 values
for date_column in ('dob', 'trans_date_trans_time'):
    X[date_column] = pd.to_datetime(X[date_column])
X.dtypes

trans_date_trans_time    datetime64[ns]
merchant                         object
category                         object
amt                             float64
gender                           object
lat                             float64
long                            float64
job                              object
dob                      datetime64[ns]
merch_lat                       float64
merch_long                      float64
dtype: object

### Categorical features

In [9]:
# Keep only the string (object-dtype) columns: these are the categorical features
X_categorical = X.select_dtypes(include="object")
X_categorical.head()

Unnamed: 0,merchant,category,gender,job
0,"fraud_Rippin, Kub and Mann",misc_net,F,"Psychologist, counselling"
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,Special educational needs teacher
2,fraud_Lind-Buckridge,entertainment,M,Nature conservation officer
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,M,Patent attorney
4,fraud_Keeling-Crist,misc_pos,M,Dance movement psychotherapist


In [10]:
# Number of transactions per merchant (also reveals how many distinct merchants exist)
X_categorical['merchant'].value_counts()

merchant
fraud_Kilback LLC                       4403
fraud_Cormier LLC                       3649
fraud_Schumm PLC                        3634
fraud_Kuhn LLC                          3510
fraud_Boyer PLC                         3493
                                        ... 
fraud_Douglas, DuBuque and McKenzie      775
fraud_Treutel-King                       775
fraud_Medhurst, Labadie and Gottlieb     759
fraud_Reichert-Weissnat                  753
fraud_Hahn, Douglas and Schowalter       727
Name: count, Length: 693, dtype: int64

In [11]:
# Number of transactions per spending category
X_categorical['category'].value_counts()

category
gas_transport     131659
grocery_pos       123638
home              123115
shopping_pos      116672
kids_pets         113035
shopping_net       97543
entertainment      94014
food_dining        91461
personal_care      90758
health_fitness     85879
misc_pos           79655
misc_net           63287
grocery_net        45452
travel             40507
Name: count, dtype: int64

In [12]:
# Number of transactions per job title (also reveals how many distinct jobs exist)
X_categorical['job'].value_counts()

job
Film/video editor             9779
Exhibition designer           9199
Naval architect               8684
Surveyor, land/geomatics      8680
Materials engineer            8270
                              ... 
Veterinary surgeon               8
Information officer              8
Contracting civil engineer       7
Ship broker                      7
Warehouse manager                7
Name: count, Length: 494, dtype: int64

### Note: We drop 'job' and 'merchant' because they have too many classes (494 and 693 distinct values, respectively). If we decide to use CatBoost, we could keep them. If we have time, we could group the jobs into broader categories.

In [13]:
# Remove the two high-cardinality columns before encoding
high_cardinality_columns = ['job', 'merchant']
X_categorical = X_categorical.drop(columns=high_cardinality_columns)
X_categorical.head()

Unnamed: 0,category,gender
0,misc_net,F
1,grocery_pos,F
2,entertainment,M
3,gas_transport,M
4,misc_pos,M


In [14]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder returning a dense pandas DataFrame; drop="if_binary" keeps a
# single indicator column for features that have exactly two categories
ohe = OneHotEncoder(drop="if_binary", sparse_output=False).set_output(transform='pandas')

# Learn the categories and encode the categorical features in one step
X_categorical_encoded = ohe.fit_transform(X_categorical)

X_categorical_encoded

Unnamed: 0,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1296671,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1296672,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1296673,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Numerical features

In [15]:
# Everything that is not an object column: floats plus the two datetime columns
X_numerical = X.select_dtypes(exclude="object")
X_numerical.head()

Unnamed: 0,trans_date_trans_time,amt,lat,long,dob,merch_lat,merch_long
0,2019-01-01 00:00:18,4.97,36.0788,-81.1781,1988-03-09,36.011293,-82.048315
1,2019-01-01 00:00:44,107.23,48.8878,-118.2105,1978-06-21,49.159047,-118.186462
2,2019-01-01 00:00:51,220.11,42.1808,-112.262,1962-01-19,43.150704,-112.154481
3,2019-01-01 00:01:16,45.0,46.2306,-112.1138,1967-01-12,47.034331,-112.561071
4,2019-01-01 00:03:06,41.96,38.4207,-79.4629,1986-03-28,38.674999,-78.632459


Here we convert 'dob' into 'age' and split 'trans_date_trans_time' into 'month', 'day' and 'hour' (we don't need the year for our model).
We then drop 'dob' and 'trans_date_trans_time'.

In [16]:
# Derive the age in years at transaction time (whole days / 365.25) and the
# calendar month, day of month and hour of each transaction.
# Both date columns are already datetime64, so the .dt accessor is used directly.
transaction_time = X_numerical['trans_date_trans_time']
elapsed_since_birth = transaction_time - X_numerical['dob']

X_numerical['age'] = elapsed_since_birth.dt.days / 365.25
X_numerical['trans_month'] = transaction_time.dt.month
X_numerical['trans_day'] = transaction_time.dt.day
X_numerical['trans_hour'] = transaction_time.dt.hour
X_numerical.tail(5)

Unnamed: 0,trans_date_trans_time,amt,lat,long,dob,merch_lat,merch_long,age,trans_month,trans_day,trans_hour
1296670,2020-06-21 12:12:08,15.56,37.7175,-112.4777,1961-11-24,36.841266,-111.690765,58.57358,6,21,12
1296671,2020-06-21 12:12:19,51.7,39.2667,-77.5101,1979-12-11,38.906881,-78.246528,40.528405,6,21,12
1296672,2020-06-21 12:12:32,105.93,32.9396,-105.8189,1967-08-30,33.619513,-105.130529,52.810404,6,21,12
1296673,2020-06-21 12:13:36,74.9,43.3526,-102.5411,1980-08-18,42.78894,-103.24116,39.841205,6,21,12
1296674,2020-06-21 12:13:37,4.3,45.8433,-113.8748,1995-08-16,46.565983,-114.18611,24.848734,6,21,12


In [17]:
# The raw datetime columns are no longer needed once age/month/day/hour exist
raw_date_columns = ['trans_date_trans_time', 'dob']
X_numerical = X_numerical.drop(columns=raw_date_columns)
X_numerical.head()

Unnamed: 0,amt,lat,long,merch_lat,merch_long,age,trans_month,trans_day,trans_hour
0,4.97,36.0788,-81.1781,36.011293,-82.048315,30.814511,1,1,0
1,107.23,48.8878,-118.2105,49.159047,-118.186462,40.531143,1,1,0
2,220.11,42.1808,-112.262,43.150704,-112.154481,56.950034,1,1,0
3,45.0,46.2306,-112.1138,47.034331,-112.561071,51.969884,1,1,0
4,41.96,38.4207,-79.4629,38.674999,-78.632459,32.76386,1,1,0


### Note: We use RobustScaler for 'amt' and 'age'. We transform 'trans_month', 'trans_day' and 'trans_hour' with cyclical (sin/cos) encoding. We leave 'lat', 'long', 'merch_lat' and 'merch_long' unchanged: unscaled coordinates work well for Random Forest, XGBoost and CatBoost models.

In [18]:
# Scale 'amt' and 'age' with a single RobustScaler (per-column centering on the
# median and scaling by the interquartile range, using the default settings).
# Both columns are fitted in ONE call so that `robust_scaler` keeps the
# statistics of both features: calling fit_transform once per column on the
# same instance overwrites the 'amt' statistics with the 'age' ones, which makes
# it impossible to re-apply the 'amt' scaling to new data later.
# The scaled values are the same as with per-column fitting because the scaler
# treats every column independently.
# NOTE(review): the scaler is fitted on the full dataset here; if a train/test
# split is performed afterwards, fit it on the training part only to avoid leakage.
columns_to_scale = ['amt', 'age']
robust_scaler = RobustScaler().set_output(transform='pandas')
X_numerical[columns_to_scale] = robust_scaler.fit_transform(X_numerical[columns_to_scale])
X_numerical.head()

Unnamed: 0,amt,lat,long,merch_lat,merch_long,age,trans_month,trans_day,trans_hour
0,-0.57899,36.0788,-81.1781,36.011293,-82.048315,-0.537592,1,1,0
1,0.812491,48.8878,-118.2105,49.159047,-118.186462,-0.140524,1,1,0
2,2.348483,42.1808,-112.262,43.150704,-112.154481,0.530432,1,1,0
3,-0.03429,46.2306,-112.1138,47.034331,-112.561071,0.326919,1,1,0
4,-0.075657,38.4207,-79.4629,38.674999,-78.632459,-0.457932,1,1,0


In [19]:
# Cyclic (sin/cos) encoding of month, day of month and hour.
# Each pair is (column, period used to map the value onto the unit circle).
for column_name, period in (("trans_month", 12), ("trans_day", 31), ("trans_hour", 24)):
    angle = 2 * np.pi * X_numerical[column_name] / period
    X_numerical[f"{column_name}_sin"] = np.sin(angle)
    X_numerical[f"{column_name}_cos"] = np.cos(angle)

In [29]:
# Drop the raw 'trans_month', 'trans_day' and 'trans_hour' columns now that
# their sin/cos encodings are available
encoded_date_parts = ['trans_month', 'trans_day', 'trans_hour']
X_numerical = X_numerical.drop(columns=encoded_date_parts)

### Concatenation

In [30]:
# Put the numerical and the one-hot encoded categorical features side by side
feature_blocks = [X_numerical, X_categorical_encoded]
X_preprocessed = pd.concat(feature_blocks, axis="columns")
X_preprocessed.head()

Unnamed: 0,amt,lat,long,merch_lat,merch_long,age,trans_month_sin,trans_month_cos,trans_day_sin,trans_day_cos,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
0,-0.57899,36.0788,-81.1781,36.011293,-82.048315,-0.537592,0.5,0.866025,0.201299,0.97953,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.812491,48.8878,-118.2105,49.159047,-118.186462,-0.140524,0.5,0.866025,0.201299,0.97953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.348483,42.1808,-112.262,43.150704,-112.154481,0.530432,0.5,0.866025,0.201299,0.97953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.03429,46.2306,-112.1138,47.034331,-112.561071,0.326919,0.5,0.866025,0.201299,0.97953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.075657,38.4207,-79.4629,38.674999,-78.632459,-0.457932,0.5,0.866025,0.201299,0.97953,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## This is a first version. See the Notes above for ideas on how to improve it.