In [1]:
# General
import pandas as pd
import numpy as np

# Scikit-learn split, scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Imbalanced-learn
from imblearn.over_sampling import SMOTE

# Optional for verifying balance
from collections import Counter


# __1. Load feature-engineered data__

In [2]:
import pandas as pd

# Load the feature-engineered fraud dataset from the processed-data folder.
df = pd.read_csv('../data/processed/feature_engineered_Fraud_Data.csv')

# Preview the first rows to sanity-check the columns that were read.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()


   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50              44   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class      ip_int  \
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0   732758368   
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0   350311387   
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  2621473820   
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0  3840542443   
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0   415583117   

         country  purchase_count  hour_of_day  day_of_week  time_since_signup  
0          Japan        

# __2. Drop or transform unneeded columns__

In [3]:
# Target vector: the `class` column of the loaded frame.
y = df['class']

# Feature matrix: everything except the target itself, the raw timestamp
# strings, the ID-like columns and the float `ip_address` column
# (the integer `ip_int` column is not listed here, so it stays in X).
excluded_columns = ['class', 'signup_time', 'purchase_time', 'user_id', 'device_id', 'ip_address']
X = df.drop(columns=excluded_columns)


# __3. Encode categoricals before split__

In [4]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each one so the dummies of a feature are not fully redundant.
# Note: X is reassigned in place, so re-running this cell without first
# re-running the previous one fails (the source columns are already gone).
categorical_columns = ['source', 'browser', 'sex', 'country']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)


# __4. Train-test split__

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed random_state makes the split
# reproducible across runs.
split_options = dict(test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity checks: split sizes and per-class counts in each split.
print(X_train.shape, X_test.shape)
print(Counter(y_train), Counter(y_test))


(105778, 195) (45334, 195)
Counter({0: 95872, 1: 9906}) Counter({0: 41089, 1: 4245})


# __5. Handle class imbalance on training only__

In [6]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Oversample using the training split only; X_test / y_test are not passed
# to the sampler, so the held-out set keeps its original class balance.
# NOTE(review): at this point the features are still unscaled and include
# one-hot dummy columns; SMOTE synthesises samples from nearest neighbours,
# so confirm that resampling in this raw feature space is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)

# Class counts before and after resampling, to verify the balance.
print(f'Before SMOTE: {Counter(y_train)}')
print(f'After SMOTE: {Counter(y_resampled)}')


Before SMOTE: Counter({0: 95872, 1: 9906})
After SMOTE: Counter({0: 95872, 1: 95872})


# __6. Scale numeric features__

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from the resampled training
# features only, then apply that same fitted scaler to both sets so the
# test data never influences the scaling parameters.
scaler = StandardScaler().fit(X_resampled)
X_train_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)


# __7. Save transformed sets__

In [8]:
import numpy as np

# Persist the model-ready arrays as .npy files. The training pair is the
# resampled + scaled data; the test pair is the scaled hold-out features
# with their original (non-resampled) labels.
artifacts = [
    ('../data/processed/X_train.npy', X_train_scaled),
    ('../data/processed/X_test.npy', X_test_scaled),
    ('../data/processed/y_train.npy', y_resampled),
    ('../data/processed/y_test.npy', y_test),
]
for destination, values in artifacts:
    np.save(destination, values)

print("done")

done
