#Support Vector Machine for Credit Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
from pprint import pprint

In [2]:
from scipy.stats import sem

def mean_score(text, scores):
    return (str(text) + ' {0:.8f} (+/-{1:.5f})').format(np.mean(scores), sem(scores))

In [3]:
def test_svm(train_size, svm, df, n_iter):
    auc_scores = []
    accs = []
    for i in range(0, n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=train_size, random_state=1)
        y = train['SeriousDlqin2yrs']
        X = train.drop('SeriousDlqin2yrs', axis=1)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        y_test = test['SeriousDlqin2yrs']
        X_test = test.drop('SeriousDlqin2yrs', 1)
        X_test = scaler.fit_transform(X_test)
        svm.fit(X, y)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, svm.predict_proba(X_test).T[1]))
        accs.append(svm.score(X_test, y_test))
    print(mean_score('roc auc score:', auc_scores))
    print(mean_score('accuracy:', accs))

In [4]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svc = SVC(kernel='rbf', C=1, gamma=0.001, probability=True)
test_svm(0.01, svc, df_clean, 10)

roc auc score: 0.64876678 (+/-0.04497)
accuracy: 0.93325388 (+/-0.00004)


while tweaking the parameters for svm, we noticed an increase of preciscion proportional to an increased number of training samples. However, processing 5% samples, already takes abaout 10 minutes on my machine.

Anyhow, this way of testing is not that efficient at all, we still don't know which params to use, to get a descent result. So we will try a GridSearch. We took the parameter range from a tutorial.

##Grid Search

In [5]:
def svm_grid_search(train_size, svc_params, df, n_iter):
    auc_scores = []
    accs = []
    for i in range(0, n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=train_size, random_state=1)
        X_train = train.drop('SeriousDlqin2yrs', axis=1)
        y_train = train.SeriousDlqin2yrs 
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        y_test = test['SeriousDlqin2yrs']
        X_test = test.drop('SeriousDlqin2yrs', 1)
        X_test = scaler.fit_transform(X_test)
        gs_svc = GridSearchCV(SVC(), svc_params, cv = 3, n_jobs=-1)
        gs_svc.fit(X_train, y_train)
        scv=gs_svc.best_estimator_
        svc.fit(X_train, y_train)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, svc.predict_proba(X_test).T[1]))
        accs.append(svc.score(X_test, y_test))
    print(mean_score('roc auc score:', auc_scores))
    print(mean_score('accuracy:', accs))

In [6]:
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
    'probability': [True]
}

##Cleaned Data

In [8]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_clean, 10)

CPU times: user 672 ms, sys: 39.7 ms, total: 712 ms
Wall time: 5.37 s
CPU times: user 304 ms, sys: 55.5 ms, total: 359 ms
Wall time: 4.62 s
CPU times: user 789 ms, sys: 62.2 ms, total: 851 ms
Wall time: 5.29 s
CPU times: user 390 ms, sys: 56.1 ms, total: 446 ms
Wall time: 5.1 s
CPU times: user 315 ms, sys: 56.4 ms, total: 372 ms
Wall time: 5.16 s
CPU times: user 415 ms, sys: 59.8 ms, total: 475 ms
Wall time: 3.38 s
CPU times: user 449 ms, sys: 50.8 ms, total: 500 ms
Wall time: 2.49 s
CPU times: user 471 ms, sys: 52.7 ms, total: 524 ms
Wall time: 3.12 s
CPU times: user 270 ms, sys: 54.3 ms, total: 324 ms
Wall time: 2.9 s
CPU times: user 242 ms, sys: 36.3 ms, total: 279 ms
Wall time: 2.65 s
roc auc score: 0.73867272 (+/-0.01546)
accuracy: 0.93326604 (+/-0.00003)


##Cropped Data

In [52]:
df_cropped = pd.read_csv('./data/cs-train_cropped.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_cropped, 10)

CPU times: user 310 ms, sys: 37 ms, total: 347 ms
Wall time: 2.03 s
roc auc score: 0.751016439669
Accuracy: 0.930694482938


##Oversampled Data

In [53]:
df_oversampled = pd.read_csv('./data/cs-train_oversampled.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_oversampled, 10)

CPU times: user 2.29 s, sys: 49.5 ms, total: 2.34 s
Wall time: 14.4 s
roc auc score: 0.652019125256
Accuracy: 0.602464872589


## Undersampled Data

In [54]:
df_undersampled = pd.read_csv('./data/cs-train_undersampled.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_undersampled, 10)

CPU times: user 174 ms, sys: 25.5 ms, total: 199 ms
Wall time: 272 ms
roc auc score: 0.715977678677
Accuracy: 0.49972804738
