#Support Vector Machine for Credit Data

In [9]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
from pprint import pprint

In [10]:
from scipy.stats import sem

def mean_score(text, scores):
    return (str(text) + ' {0:.8f} (+/-{1:.5f})').format(np.mean(scores), sem(scores))

In [11]:
def test_svm(train_size, svm, df, n_iter):
    auc_scores = []
    accs = []
    for i in range(0, n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=train_size, random_state=1)
        y = train['SeriousDlqin2yrs']
        X = train.drop('SeriousDlqin2yrs', axis=1)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        y_test = test['SeriousDlqin2yrs']
        X_test = test.drop('SeriousDlqin2yrs', 1)
        X_test = scaler.fit_transform(X_test)
        svm.fit(X, y)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, svm.predict_proba(X_test).T[1]))
        accs.append(svm.score(X_test, y_test))
    print(mean_score('roc auc score:', auc_scores))
    print(mean_score('accuracy:', accs))

In [12]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svc = SVC(kernel='rbf', C=1, gamma=0.001, probability=True)
test_svm(0.01, svc, df_clean, 10)

roc auc score: 0.68119558 (+/-0.00031)
accuracy: 0.93310601 (+/-0.00000)


while tweaking the parameters for svm, we noticed an increase of preciscion proportional to an increased number of training samples. However, processing 5% samples, already takes abaout 10 minutes on my machine.

Anyhow, this way of testing is not that efficient at all, we still don't know which params to use, to get a descent result. So we will try a GridSearch. We took the parameter range from a tutorial.

##Grid Search

In [13]:
def svm_grid_search(train_size, svc_params, df, n_iter):
    auc_scores = []
    accs = []
    for i in range(0, n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=train_size, random_state=1)
        X_train = train.drop('SeriousDlqin2yrs', axis=1)
        y_train = train.SeriousDlqin2yrs 
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        y_test = test['SeriousDlqin2yrs']
        X_test = test.drop('SeriousDlqin2yrs', 1)
        X_test = scaler.fit_transform(X_test)
        gs_svc = GridSearchCV(SVC(), svc_params, cv = 3, n_jobs=-1)
        gs_svc.fit(X_train, y_train)
        scv=gs_svc.best_estimator_
        svc.fit(X_train, y_train)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, svc.predict_proba(X_test).T[1]))
        accs.append(svc.score(X_test, y_test))
    print(mean_score('roc auc score:', auc_scores))
    print(mean_score('accuracy:', accs))

In [14]:
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
    'probability': [True]
}

##Cleaned Data

In [15]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_clean, 10)

roc auc score: 0.68088391 (+/-0.00030)
accuracy: 0.93310601 (+/-0.00000)


##Cropped Data

In [16]:
df_cropped = pd.read_csv('./data/cs-train_cropped.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_cropped, 10)

roc auc score: 0.49841584 (+/-0.01949)
accuracy: 0.93050522 (+/-0.00000)


##Oversampled Data

In [17]:
df_oversampled = pd.read_csv('./data/cs-train_oversampled.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_oversampled, 10)

roc auc score: 0.66289212 (+/-0.00000)
accuracy: 0.61334848 (+/-0.00000)


## Undersampled Data

In [18]:
df_undersampled = pd.read_csv('./data/cs-train_undersampled.csv').drop('Unnamed: 0', axis=1)
svm_grid_search(0.01, svc_params, df_undersampled, 10)

roc auc score: 0.72650176 (+/-0.00001)
accuracy: 0.50634576 (+/-0.00000)
