# Preprocessing

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Directory containing the raw CSV that is read in the next cell; the path is
# relative, so it resolves against the notebook's working directory.
data_path = Path("../Dataset/")

In [3]:
# Load the raw evaluation CSV and discard its 'Unnamed: 0' column
# (presumably a row index that was written out with the file -- confirm).
raw_csv_path = data_path / "Candidate_tech_evaluation_candidate_copy_data science_fraud.csv"
training_data = pd.read_csv(raw_csv_path).drop(columns=['Unnamed: 0'])

## Time delta in minutes

In [7]:
# Parse both timestamp columns, then derive the time elapsed between sign-up
# and purchase. Later keyword arguments read the columns created by earlier ones.
training_data = training_data.assign(
    signup_time_dt=lambda df: pd.to_datetime(df['signup_time']),
    purchase_time_dt=lambda df: pd.to_datetime(df['purchase_time']),
    # Despite the name, this column holds the full Timedelta, not a day count.
    days_signup_purchase=lambda df: df['purchase_time_dt'] - df['signup_time_dt'],
    # Express the elapsed time as (fractional) minutes.
    minutes_signup_purchase=lambda df: df['days_signup_purchase'].dt.total_seconds() / 60,
)

## Device ID counting

In [8]:
# Map each device_id to the number of rows that share it. The counts are taken
# over the whole dataset, i.e. before any train/test split is made.
device_map = training_data['device_id'].value_counts().to_dict()
training_data = training_data.assign(
    device_count=lambda df: df['device_id'].map(device_map)
)

# Make Balanced dataset

In [9]:
# Keep only the identifier, the two engineered features and the label.
base_data = training_data.loc[
    :, ['user_id', 'minutes_signup_purchase', 'device_count', 'class']
]

In [15]:
# Display (rows, columns) of the reduced frame as a quick sanity check.
base_data.shape

(120000, 4)

In [14]:
# Fraud rows only (class == 1); the displayed shape gives their count.
base_data_1 = base_data[base_data['class'].eq(1)]
base_data_1.shape

(11265, 4)

Only 11,265 of the 120,000 transactions are fraud (about 9.4%), so the dataset is imbalanced.  
Training on such a dataset could lead to poor performance.

Sample as many non-fraud records as there are fraud records.

In [20]:
# Draw the negative sample from non-fraud rows only. Sampling from all of
# `base_data` would also pick up fraud rows, which would then appear twice once
# concatenated with `base_data_1` and could end up in both train and test.
# NOTE(review): assumes non-fraud rows are labelled class == 0 -- confirm.
base_data_0 = (
    base_data
    .loc[base_data['class'] == 0]
    .sample(n=base_data_1.shape[0], random_state=42)
)
base_data_0.shape

(11265, 4)

In [21]:
# Stack the sampled rows and the fraud rows into one frame; the original row
# labels are kept rather than renumbered.
base_data_balance = pd.concat(
    objs=[base_data_0, base_data_1],
    axis=0,
    ignore_index=False,
)
base_data_balance.shape

(22530, 4)

# Data split

In [10]:
from sklearn.model_selection import train_test_split

In [22]:
# Features and label are both kept as DataFrames (note the list selector for y).
# NOTE(review): user_id is carried into X as a column -- confirm it is dropped
# or ignored before a model is fitted.
X = base_data_balance.loc[:, ['user_id', 'minutes_signup_purchase', 'device_count']]
y = base_data_balance.loc[:, ['class']]

# Hold out 33% of the rows for testing; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

In [23]:
X_train.shape, y_train.shape

((15095, 3), (15095, 1))

In [24]:
X_test.shape, y_test.shape

((7435, 3), (7435, 1))

# Save dataset as CSV

In [26]:
# Write the four splits into the dataset directory. `data_path` is reused
# instead of repeating the '../Dataset/' literal so the output location always
# follows the directory the raw data was read from. Row labels are not written.
X_train.to_csv(data_path / 'train_X.csv', index=False)
y_train.to_csv(data_path / 'train_y.csv', index=False)
X_test.to_csv(data_path / 'test_X.csv', index=False)
y_test.to_csv(data_path / 'test_y.csv', index=False)