# Preprocessing

In [2]:
import pandas as pd
from pathlib import Path

In [3]:
data_path = Path("../Dataset/")

In [4]:
# training_data = pd.read_csv(data_path/"Candidate_tech_evaluation_candidate_copy_data science_fraud.csv").drop(columns=['Unnamed: 0'])
training_data = pd.read_csv(data_path/"training_data_with_ipCountry.csv")

## Time delta in minutes

In [5]:
training_data['signup_time_dt'] = pd.to_datetime(training_data['signup_time'])
training_data['purchase_time_dt'] = pd.to_datetime(training_data['purchase_time'])
training_data['days_signup_purchase'] = training_data['purchase_time_dt'] - training_data['signup_time_dt']
training_data['minutes_signup_purchase'] = training_data['days_signup_purchase'].dt.total_seconds()/60

## Device ID counting

In [6]:
device_map = training_data['device_id'].value_counts().to_dict()
training_data['device_count'] = training_data['device_id'].map(device_map)

## Country
To input country data into machine learning model, one hot encoding should be performed.

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
enc = OneHotEncoder()
country_one_hot = pd.DataFrame(enc.fit_transform(training_data[['ip_country']]).toarray())
country_one_hot = country_one_hot.add_prefix('country_')

In [30]:
country_one_hot.head()

Unnamed: 0,country_0,country_1,country_2,country_3,country_4,country_5,country_6,country_7,country_8,country_9,...,country_167,country_168,country_169,country_170,country_171,country_172,country_173,country_174,country_175,country_176
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Time range

In [12]:
def set_time_group(time):
    if time in range(6, 12):
        return 'morning'
    elif time in range(12, 18):
        return 'afternoon'
    elif time in range(18, 24):
        return 'evening'
    else:
        return 'night'

In [14]:
training_data['signup_time_h_dt'] = training_data['signup_time_dt'].dt.hour
training_data['purchase_time_h_dt'] = training_data['purchase_time_dt'].dt.hour

In [15]:
training_data['sign_time_range'] = training_data['signup_time_h_dt'].apply(set_time_group)
training_data['purchase_time_range'] = training_data['purchase_time_h_dt'].apply(set_time_group)

In [31]:
enc = OneHotEncoder()
sign_time_one_hot = pd.DataFrame(enc.fit_transform(training_data[['sign_time_range']]).toarray())
sign_time_one_hot = sign_time_one_hot.add_prefix('signTime_')

In [32]:
sign_time_one_hot.head()

Unnamed: 0,signTime_0,signTime_1,signTime_2,signTime_3
0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0


In [33]:
enc = OneHotEncoder()
purchase_time_one_hot = pd.DataFrame(enc.fit_transform(training_data[['sign_time_range']]).toarray())
purchase_time_one_hot = purchase_time_one_hot.add_prefix('purchaseTime_')

In [34]:
purchase_time_one_hot.head()

Unnamed: 0,purchaseTime_0,purchaseTime_1,purchaseTime_2,purchaseTime_3
0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0


## Source

In [35]:
enc = OneHotEncoder()
source_one_hot = pd.DataFrame(enc.fit_transform(training_data[['source']]).toarray())
source_one_hot = source_one_hot.add_prefix('source_')

In [36]:
source_one_hot.head()

Unnamed: 0,source_0,source_1,source_2
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


# Base dataset

In [37]:
base_data = training_data[['user_id', 'minutes_signup_purchase', 'device_count', 
                           'purchase_value', 'class']].join(sign_time_one_hot)
base_data = base_data.join(purchase_time_one_hot)
base_data = base_data.join(source_one_hot)
base_data = base_data.join(country_one_hot)

In [38]:
base_data.head()

Unnamed: 0,user_id,minutes_signup_purchase,device_count,purchase_value,class,signTime_0,signTime_1,signTime_2,signTime_3,purchaseTime_0,...,country_167,country_168,country_169,country_170,country_171,country_172,country_173,country_174,country_175,country_176
0,285108,82661.0,2,31,0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,131009,113304.0,1,31,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,328855,36689.0,1,16,0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,229053,2693.0,1,29,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,108439,85995.0,1,26,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
base_data.shape

(120000, 193)

# Make Balanced dataset

create synthetically similar dataset

In [41]:
from imblearn.over_sampling import SMOTE, ADASYN

In [43]:
X_data = base_data[[c for c in base_data.columns if c not in ['user_id', 'class']]]
y_data = base_data[['class']]

In [54]:
X_resampled, y_resampled = SMOTE(random_state=42, k_neighbors=3).fit_resample(X_data, y_data)

In [55]:
X_resampled.shape, y_resampled.shape

((217470, 191), (217470, 1))

In [56]:
y_resampled.value_counts()

class
0        108735
1        108735
Name: count, dtype: int64

# Data split

In [49]:
from sklearn.model_selection import train_test_split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.33, random_state=42, shuffle=True)

In [58]:
X_train.shape, y_train.shape

((145704, 191), (145704, 1))

In [59]:
X_test.shape, y_test.shape

((71766, 191), (71766, 1))

# Save dataset as CSV

In [60]:
X_train.to_csv('../Dataset/SMOTE_train_X.csv', index=False)
y_train.to_csv('../Dataset/SMOTE_train_y.csv', index=False)
X_test.to_csv('../Dataset/SMOTE_test_X.csv', index=False)
y_test.to_csv('../Dataset/SMOTE_test_y.csv', index=False)