### Oversampling with SMOTE
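1. Disadvantage of under-sampling: discarding majority-class rows loses useful information.
2. Disadvantage of over-sampling with replacement: duplicating minority-class rows encourages overfitting.
3. Advantage of SMOTE (Synthetic Minority Over-sampling TEchnique): it generates new synthetic minority samples by interpolating between existing ones instead of duplicating them, which reduces the risk of overfitting.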

In [3]:
import pandas as pd
import numpy as np
import math
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import random



In [4]:
# Read data (the CSV is expected in the notebook's working directory)
df = pd.read_csv('creditcard.csv')

# Data transformation
# 'Time' is rescaled by 1/3600 (seconds -> hours, assuming the raw column is in seconds).
df['Time'] = df['Time'] / 3600
# log(1 + Amount), computed in one vectorized call; np.log1p also stays accurate
# for amounts very close to 0, where log(x + 1) loses precision.
df['Amount'] = np.log1p(df['Amount'])

# Feature set: the first 21 columns of the file plus 'Amount'
# (presumably 'Time' followed by the first 20 PCs -- verify against the CSV header).
train_cols = list(df.columns.values[:21])
train_cols.append('Amount')

# Split train and test set: 30% held out, fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df[train_cols],
    df['Class'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((199364, 22), (199364,), (85443, 22), (85443,))

In [5]:
# Oversampling with SMOTE
# sampling_strategy = 1.0 requests a minority:majority class ratio of 1:1 after resampling.
smote = SMOTE(sampling_strategy=1.0, random_state=2)
# fit_resample is the supported name; fit_sample was only a deprecated alias
# and is not available in current imbalanced-learn releases.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train.values)

# Newer imbalanced-learn versions hand back a DataFrame when given one. Convert to
# plain arrays so the positional row indexing below behaves the same on every version.
X_train_smote = np.asarray(X_train_smote)
y_train_smote = np.asarray(y_train_smote)

# Shuffle data with a seeded generator so Restart & Run All reproduces the same order.
# The same permutation is applied to X and y to keep rows and labels aligned.
shuffle_order = np.random.RandomState(2).permutation(len(y_train_smote))
X_train_smote = X_train_smote[shuffle_order]
y_train_smote = y_train_smote[shuffle_order]

# Restore the feature names; y_train_smote stays a NumPy array.
X_train_smote = pd.DataFrame(X_train_smote, columns=train_cols)