# Preprocessing

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Directory that holds the input CSV, given relative to the working directory;
# file names are appended to it with the `/` operator when loading.
data_path = Path("../Dataset/")

In [27]:
# Earlier loader for a differently named CSV, kept commented out for reference;
# the active line below reads another file instead.
# training_data = pd.read_csv(data_path/"Candidate_tech_evaluation_candidate_copy_data science_fraud.csv").drop(columns=['Unnamed: 0'])
# Load the dataset variant that carries the `ip_country` column needed by the
# one-hot encoding step further down.
training_data = pd.read_csv(data_path/"training_data_with_ipCountry.csv")

## Time delta in minutes

In [28]:
# Parse both timestamp columns once, then derive the signup -> purchase gap.
signup_ts = pd.to_datetime(training_data['signup_time'])
purchase_ts = pd.to_datetime(training_data['purchase_time'])
signup_to_purchase = purchase_ts - signup_ts

training_data['signup_time_dt'] = signup_ts
training_data['purchase_time_dt'] = purchase_ts
# Despite its name, this column holds the full timedelta, not a day count.
training_data['days_signup_purchase'] = signup_to_purchase
# Same gap expressed as a float number of minutes.
training_data['minutes_signup_purchase'] = signup_to_purchase.dt.total_seconds() / 60

## Device ID counting

In [29]:
# Count how many records share each device_id, as a {device_id: count} lookup.
device_occurrences = training_data['device_id'].value_counts()
device_map = device_occurrences.to_dict()
# Give every row the number of records that were made from its device.
training_data['device_count'] = training_data['device_id'].map(device_map)

## Country
To feed the country into a machine learning model, it must first be one-hot encoded.

In [43]:
from sklearn.preprocessing import OneHotEncoder

In [46]:
# One-hot encode the `ip_country` column: one 0/1 column per distinct value.
# The resulting columns are labelled by integer position; column k corresponds
# to `enc.categories_[0][k]`, so keep `enc` around to map columns back to names.
enc = OneHotEncoder()
country_one_hot = pd.DataFrame(
    enc.fit_transform(training_data[['ip_country']]).toarray(),
    # Reuse the source index so the index-based `.join` that builds `base_data`
    # pairs each encoded row with the record it came from, even if
    # `training_data` is ever filtered or re-indexed before this cell runs.
    index=training_data.index,
)

In [47]:
# Preview the first rows of the encoded matrix (columns are integer positions, one per category).
country_one_hot.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,167,168,169,170,171,172,173,174,175,176
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Base dataset

In [48]:
# Keep the identifier, the two engineered features and the label, then append
# the one-hot country columns, matched to the rows by index.
selected_columns = ['user_id', 'minutes_signup_purchase', 'device_count', 'class']
base_data = training_data[selected_columns].join(country_one_hot)

In [49]:
# Preview the joined frame: user_id, the engineered features, the label, then the one-hot columns.
base_data.head()

Unnamed: 0,user_id,minutes_signup_purchase,device_count,class,0,1,2,3,4,5,...,167,168,169,170,171,172,173,174,175,176
0,285108,82661.0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,131009,113304.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,328855,36689.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,229053,2693.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,108439,85995.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# (rows, columns): the 4 selected columns plus one column per one-hot category.
base_data.shape

(120000, 181)

# Make Balanced dataset

In [51]:
# Isolate the fraud records (label 1) and report how many there are.
is_fraud = base_data['class'].eq(1)
base_data_1 = base_data[is_fraud]
base_data_1.shape

(11265, 181)

Only 11,265 of the 120,000 transactions are fraudulent, so the dataset is heavily imbalanced.  
Training on such a dataset could lead to poor performance.

Sample as many non-fraud records as there are fraud records.

In [52]:
# Undersample the majority class: draw, without replacement, as many non-fraud
# rows (class == 0) as there are fraud rows in `base_data_1`.
# Sampling has to be restricted to the non-fraud subset: drawing from all of
# `base_data` would pull fraud rows into this "negative" set, and those rows
# would then appear twice once it is concatenated with `base_data_1`.
non_fraud = base_data.loc[base_data['class'] == 0]
base_data_0 = non_fraud.sample(n=base_data_1.shape[0], random_state=42)
base_data_0.shape

(11265, 181)

In [53]:
# Stack the sampled rows (base_data_0) on top of the fraud rows (base_data_1);
# the original row index of each part is kept as-is.
balanced_parts = [base_data_0, base_data_1]
base_data_balance = pd.concat(balanced_parts)
base_data_balance.shape

(22530, 181)

# Data split

In [10]:
from sklearn.model_selection import train_test_split

In [57]:
# Features are every column except the label; the label is kept as a
# one-column DataFrame rather than a Series.
# NOTE(review): `user_id` stays among the features -- confirm that downstream
# training either drops it or really intends to use it.
label_columns = ['class']
X = base_data_balance.drop(columns=label_columns)
y = base_data_balance[label_columns]

# Shuffled split holding out 33% of the rows for testing; the fixed seed makes
# the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [58]:
# Sanity check: the training features and labels should have the same number of rows.
X_train.shape, y_train.shape

((15095, 180), (15095, 1))

In [59]:
# Sanity check: the test features and labels should have the same number of rows.
X_test.shape, y_test.shape

((7435, 180), (7435, 1))

# Save dataset as CSV

In [60]:
# Persist the four splits next to the input data, without the row index.
# Paths are built from `data_path` (the same directory the input was read
# from) instead of repeating the directory string, so changing that one
# setting relocates both the input and these outputs.
splits_to_save = {
    'train_X': X_train,
    'train_y': y_train,
    'test_X': X_test,
    'test_y': y_test,
}
for file_stem, split_frame in splits_to_save.items():
    split_frame.to_csv(data_path / f"{file_stem}.csv", index=False)