# Support Vector Machine for Credit Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk
from pprint import pprint

In [2]:
from scipy.stats import sem

def mean_score(scores):
    """Summarise a collection of scores as a one-line string.

    The string contains the arithmetic mean of ``scores`` followed by the
    standard error of the mean (``scipy.stats.sem``), both formatted with
    three decimal places.
    """
    average = np.mean(scores)
    std_error = sem(scores)
    template = 'Mean score: {0:.3f} (+/-{1:.3f})'
    return template.format(average, std_error)

In [5]:
def test_svm(train_size, svm, df):
    train, test = sk.cross_validation.train_test_split(df, train_size=0.1)
    y = train['SeriousDlqin2yrs']
    X = train.drop('SeriousDlqin2yrs', axis=1)
    y_test = test['SeriousDlqin2yrs']
    X_test = test.drop('SeriousDlqin2yrs', 1)
    svm.fit(X, y)
    print 'roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, svm.predict_proba(X_test).T[1]))
    print 'Accuracy: ' + str(svm.score(X_test, y_test))

In [6]:
# Load the cleaned data set and drop its 'Unnamed: 0' column
# (presumably a stored row index -- confirm against the CSV).
df_clean = (
    pd.read_csv('./data/cs-train_clean.csv')
    .drop('Unnamed: 0', axis=1)
)

# Baseline RBF-kernel SVM; probability=True is needed because test_svm
# calls predict_proba on the fitted model.
svc = SVC(C=1, gamma=0.001, kernel='rbf', probability=True)

test_svm(train_size=0.01, svm=svc, df=df_clean)

roc auc score: 0.607887276563
Accuracy: 0.933420481892


While tweaking the parameters for the SVM, we noticed that precision increased as the number of training samples grew. However, processing just 5% of the samples already takes about 10 minutes on my machine.

Anyhow, this way of testing is not efficient at all: we still don't know which parameters to use to get a decent result. So we will try a grid search. We took the parameter range from a tutorial.

## Grid Search

In [22]:
def svm_grid_search(train_size, svc_params, df):
    train, test = sk.cross_validation.train_test_split(df, train_size=train_size)
    X_train = train.drop('SeriousDlqin2yrs', axis=1)
    y_train = train.SeriousDlqin2yrs 
    y_test = test['SeriousDlqin2yrs']
    X_test = test.drop('SeriousDlqin2yrs', 1)
    gs_svc = GridSearchCV(SVC(), svc_params, cv = 3, n_jobs=-1)
    %time gs_svc.fit(X_train, y_train)
    scv=gs_svc.best_estimator_
    svc.fit(X_train, y_train)
    print 'roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, svc.predict_proba(X_test).T[1]))
    print 'Accuracy: ' + str(svc.score(X_test, y_test))

In [35]:
# Parameter grid for the SVC grid search.
svc_params = dict(
    C=np.logspace(-1, 2, num=4),      # 0.1, 1, 10, 100
    gamma=np.logspace(-4, 0, num=5),  # 1e-4, 1e-3, 1e-2, 1e-1, 1
    probability=[True],               # single value: always fit with probability estimates
)

## Cleaned Data

In [40]:
# Load the cleaned data set (without its 'Unnamed: 0' column) and run the
# grid search with train_size=0.01.
df_clean = (
    pd.read_csv('./data/cs-train_clean.csv')
    .drop('Unnamed: 0', axis=1)
)
svm_grid_search(train_size=0.01, svc_params=svc_params, df=df_clean)

CPU times: user 431 ms, sys: 49.6 ms, total: 481 ms
Wall time: 4.78 s
roc auc score: 0.480006541237
Accuracy: 0.933200540176


## Cropped Data

In [41]:
# Load the cropped data set (without its 'Unnamed: 0' column) and run the
# grid search with train_size=0.01.
df_cropped = (
    pd.read_csv('./data/cs-train_cropped.csv')
    .drop('Unnamed: 0', axis=1)
)
svm_grid_search(train_size=0.01, svc_params=svc_params, df=df_cropped)

CPU times: user 346 ms, sys: 45.6 ms, total: 392 ms
Wall time: 2.94 s
roc auc score: 0.495694861476
Accuracy: 0.930467719855


## Oversampled Data

In [43]:
# Load the oversampled data set (without its 'Unnamed: 0' column) and run the
# grid search with train_size=0.01.
df_oversampled = (
    pd.read_csv('./data/cs-train_oversampled.csv')
    .drop('Unnamed: 0', axis=1)
)
svm_grid_search(train_size=0.01, svc_params=svc_params, df=df_oversampled)

CPU times: user 1min 22s, sys: 1.11 s, total: 1min 24s
Wall time: 8min 14s
roc auc score: 0.656666283208
Accuracy: 0.60728024207


## Undersampled Data

In [44]:
# Load the undersampled data set (without its 'Unnamed: 0' column) and run the
# grid search with train_size=0.01.
df_undersampled = (
    pd.read_csv('./data/cs-train_undersampled.csv')
    .drop('Unnamed: 0', axis=1)
)
svm_grid_search(train_size=0.01, svc_params=svc_params, df=df_undersampled)

CPU times: user 227 ms, sys: 45.3 ms, total: 273 ms
Wall time: 387 ms
roc auc score: 0.511922787372
Accuracy: 0.519731673415
