In [None]:
from __future__ import division
from datetime import timedelta
from imblearn.over_sampling import SMOTE 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from time import time
import sys

from lib.data import load_data

import matplotlib
# Notebook-wide default figure size (width, height), applied to every figure created below.
matplotlib.rc('figure', figsize=[15.0, 10.0])

In [None]:
# Load X and y from 'data.csv' through the project loader and report how
# long the call took.
start = time()
X, y = load_data('data.csv')
end = time()
# Single-argument print(...) emits exactly the same text as the statement
# form on Python 2 and also parses on Python 3.
print('%s %s' % ('Data loading done in', timedelta(seconds=end - start)))

In [None]:
# Down-sample the majority class: keep every row labelled 1 and a random
# subset of at most `data_size` rows labelled 0, then rebind X / y to the
# reduced data set.
# NOTE(review): assumes X and y are NumPy arrays with matching first
# dimension (boolean-mask and integer-array indexing are used) -- confirm
# against lib.data.load_data.
data_size = 80000
start = time()

normal_mask = (y == 0)
fraud_mask = (y == 1)
X_normal, y_normal = X[normal_mask], y[normal_mask]
X_fraud, y_fraud = X[fraud_mask], y[fraud_mask]

# Dedicated seeded generator so Restart & Run All always selects the same
# subset (the seed matches the random_state=0 used in the other cells).
rng = np.random.RandomState(0)

# Sample WITHOUT replacement. The previous np.random.randint draw sampled
# with replacement, which duplicated rows; duplicates can later land on both
# sides of the train/test split and inflate the scores. Capping at the number
# of available rows also means that re-running this cell on the already
# reduced data only permutes it instead of resampling it.
n_keep = min(data_size, y_normal.size)
idx = rng.choice(y_normal.size, size=n_keep, replace=False)

X = np.concatenate((X_normal[idx], X_fraud), axis=0)
y = np.concatenate((y_normal[idx], y_fraud), axis=0)
end = time()
print('%s %s' % ('Data reduction done in', timedelta(seconds=end - start)))

In [None]:
# Balance the two classes by oversampling with SMOTE; the resampled data is
# stored in X_sm / y_sm and X / y are left untouched.
start = time()
sm = SMOTE(random_state=0)
# `fit_sample` was renamed to `fit_resample` in imbalanced-learn 0.4 and the
# old name was removed in 0.8. Prefer the current name and fall back to the
# legacy one so the cell runs on either side of that change.
smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_sm, y_sm = smote_resample(X, y)
end = time()
print('%s %s' % ('SMOTE done in', timedelta(seconds=end - start)))

In [None]:
# Standardise the features (zero mean, unit variance per column).
# The scaling statistics are fitted on X, the data before oversampling.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# The split and learning-curve cells consume X_sm, not X. X_sm was produced
# from the unscaled X, so without this line the standardisation above never
# reached the model. Apply the fitted scaler to the resampled data as well.
X_sm = scaler.transform(X_sm)
# NOTE(review): the scaler is fitted before the train/test split, so the
# held-out rows contribute to the scaling statistics -- confirm this is
# acceptable for the evaluation.

In [None]:
# Two-stage split of the resampled data into train / cross-validation / test.
# NOTE(review): the split happens after oversampling, so X_test / X_cv are
# drawn from the resampled data -- confirm that is intended.
start = time()

# Stage 1: hold out 10% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_sm, y_sm,
    test_size=0.1,
    random_state=0,
)

# Stage 2: carve 12% of the remaining rows off as the cross-validation set.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_temp, y_temp,
    test_size=0.12,
    random_state=0,
)

end = time()
print('%s %s' % ('Data splitting done in', timedelta(seconds=end - start)))

In [None]:
# Learning curve for logistic regression on the non-test portion of the data:
# accuracy under 10-fold cross-validation at 20 training-set fractions
# evenly spaced from 1% to 100%.
start = time()

# C is passed to LogisticRegression, where it is the inverse regularisation
# strength, so a large value means weak regularisation.
C = 1000
clf = LogisticRegression(C=C)

# On return, train_sizes holds the training-set sizes reported by
# learning_curve rather than the requested fractions.
train_sizes, train_scores, cv_scores = learning_curve(
    clf, X_temp, y_temp,
    train_sizes=np.linspace(0.01, 1, 20),
    cv=10,
    scoring='accuracy',
)

end = time()
print('%s %s' % ('Learning curve done in', timedelta(seconds=end - start)))

In [None]:
# Aggregate the scores along axis 1: one mean and one standard deviation per
# training-set size.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
cv_scores_mean, cv_scores_std = cv_scores.mean(axis=1), cv_scores.std(axis=1)

# Draw on the current axes. Each curve is a dotted line with markers plus a
# translucent band of one standard deviation either side of the mean.
ax = plt.gca()
curves = [
    (train_scores_mean, train_scores_std, 'r', ':or', 'Training score'),
    (cv_scores_mean, cv_scores_std, 'g', ':og', 'Cross-validation score'),
]
for curve_mean, curve_std, band_color, line_fmt, curve_label in curves:
    ax.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                    alpha=0.1, color=band_color)
    ax.plot(train_sizes, curve_mean, line_fmt, label=curve_label)

ax.set_xlabel('Number of training examples')
ax.set_ylabel('Score')
ax.legend()
plt.show()