#Support Vector Machine for Credit Data

In [45]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
from pprint import pprint

In [46]:
from scipy.stats import sem

def mean_score(scores):
    return ('Mean score: {0:.3f} (+/-{1:.3f})').format(np.mean(scores), sem(scores))

In [47]:
def test_svm(train_size, svm, df):
    train, test = sk.cross_validation.train_test_split(df, train_size=0.1)
    y = train['SeriousDlqin2yrs']
    X = train.drop('SeriousDlqin2yrs', axis=1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y_test = test['SeriousDlqin2yrs']
    X_test = test.drop('SeriousDlqin2yrs', 1)
    X_test = scaler.fit_transform(X_test)
    svm.fit(X, y)
    print 'roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, svm.predict_proba(X_test).T[1]))
    print 'Accuracy: ' + str(svm.score(X_test, y_test))

In [48]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svc = SVC(kernel='rbf', C=1, gamma=0.001, probability=True)
test_svm(0.01, svc, df_clean)

roc auc score: 0.687750916462
Accuracy: 0.9335096111


while tweaking the parameters for svm, we noticed an increase of preciscion proportional to an increased number of training samples. However, processing 5% samples, already takes abaout 10 minutes on my machine.

Anyhow, this way of testing is not that efficient at all, we still don't know which params to use, to get a descent result. So we will try a GridSearch. We took the parameter range from a tutorial.

##Grid Search

In [49]:
def svm_grid_search(train_size, svc_params, df):
    train, test = sk.cross_validation.train_test_split(df, train_size=train_size)
    X_train = train.drop('SeriousDlqin2yrs', axis=1)
    y_train = train.SeriousDlqin2yrs 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    y_test = test['SeriousDlqin2yrs']
    X_test = test.drop('SeriousDlqin2yrs', 1)
    X_test = scaler.fit_transform(X_test)
    gs_svc = GridSearchCV(SVC(), svc_params, cv = 3, n_jobs=-1)
    %time gs_svc.fit(X_train, y_train)
    scv=gs_svc.best_estimator_
    svc.fit(X_train, y_train)
    print 'roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, svc.predict_proba(X_test).T[1]))
    print 'Accuracy: ' + str(svc.score(X_test, y_test))

In [50]:
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
    'probability': [True]
}

##Cleaned Data

In [51]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_clean)

CPU times: user 248 ms, sys: 45.1 ms, total: 294 ms
Wall time: 2.61 s
roc auc score: 0.382724914732
Accuracy: 0.933220796759


##Cropped Data

In [52]:
df_cropped = pd.read_csv('./data/cs-train_cropped.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_cropped)

CPU times: user 310 ms, sys: 37 ms, total: 347 ms
Wall time: 2.03 s
roc auc score: 0.751016439669
Accuracy: 0.930694482938


##Oversampled Data

In [53]:
df_oversampled = pd.read_csv('./data/cs-train_oversampled.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_oversampled)

CPU times: user 2.29 s, sys: 49.5 ms, total: 2.34 s
Wall time: 14.4 s
roc auc score: 0.652019125256
Accuracy: 0.602464872589


## Undersampled Data

In [54]:
df_undersampled = pd.read_csv('./data/cs-train_undersampled.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_undersampled)

CPU times: user 174 ms, sys: 25.5 ms, total: 199 ms
Wall time: 272 ms
roc auc score: 0.715977678677
Accuracy: 0.49972804738
