#Support Vector Machine for Credit Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
from pprint import pprint
from print_scores import print_scores

In [2]:
def svm_cv(df, n_iter, train_size):
    smaller = sk.cross_validation.train_test_split(df, train_size=train_size, random_state=1)[0]
    X = smaller.drop('SeriousDlqin2yrs', axis=1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = smaller.SeriousDlqin2yrs
    auc_scores = cross_val_score(SVC(kernel='rbf', C=1, gamma=0.001), X, y, scoring='roc_auc', cv=n_iter, n_jobs=-1)
    acc_scores = cross_val_score(SVC(kernel='rbf', C=1, gamma=0.001), X, y, scoring='accuracy', cv=n_iter, n_jobs=-1)
    print print_scores("ROC AUC: " , auc_scores)
    print print_scores("Accuracy: ", acc_scores)    

In [3]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svm_cv(df_clean, 5, 0.05)

ROC AUC:  0.77698143 (+/-0.02288)
Accuracy:  0.93595398 (+/-0.00089)


while tweaking the parameters for svm, we noticed an increase of preciscion proportional to an increased number of training samples. However, processing 5% samples, already takes abaout 10 minutes on my machine.

Anyhow, this way of testing is not that efficient at all, we still don't know which params to use, to get a descent result. So we will try a GridSearch. We took the parameter range from a tutorial.

##Grid Search

In [4]:
def svm_gridsearch_cv(df, n_iter, train_size, svc_params):
    smaller = sk.cross_validation.train_test_split(df, train_size=train_size, random_state=1)[0]
    X = smaller.drop('SeriousDlqin2yrs', axis=1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = smaller.SeriousDlqin2yrs
    gs_svc = GridSearchCV(SVC(), svc_params, cv = 3, n_jobs=-1)
    gs_svc.fit(X, y)
    svc=gs_svc.best_estimator_
    auc_scores = cross_val_score(svc, X, y, scoring='roc_auc', cv=n_iter, n_jobs=-1)
    acc_scores = cross_val_score(svc, X, y, scoring='accuracy', cv=n_iter, n_jobs=-1)
    print print_scores("ROC AUC: " , auc_scores)
    print print_scores("Accuracy: ", acc_scores)    

In [5]:
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
    'probability': [True]
}

##Cleaned Data

In [6]:
df_clean = pd.read_csv('./data/cs-train_clean.csv').drop('Unnamed: 0', axis=1)
svm_gridsearch_cv(df_clean, 10, 0.05, svc_params)

ROC AUC:  0.76570476 (+/-0.04041)
Accuracy:  0.93595460 (+/-0.00191)


##Cropped Data

In [7]:
df_cropped = pd.read_csv('./data/cs-train_cropped.csv').drop('Unnamed: 0', axis=1)
svm_gridsearch_cv(df_cropped, 10, 0.05, svc_params)

ROC AUC:  0.68749805 (+/-0.04511)
Accuracy:  0.93775118 (+/-0.00332)


##Oversampled Data

In [8]:
df_oversampled = pd.read_csv('./data/cs-train_oversampled.csv').drop('Unnamed: 0', axis=1)
svm_gridsearch_cv(df_oversampled, 10, 0.05, svc_params)

ROC AUC:  0.85795559 (+/-0.01131)
Accuracy:  0.78555541 (+/-0.01007)


## Undersampled Data

In [12]:
df_undersampled = pd.read_csv('./data/cs-train_undersampled.csv').drop('Unnamed: 0', axis=1)
svm_gridsearch_cv(df_undersampled, 10, 0.05, svc_params)

ROC AUC:  0.85431278 (+/-0.02827)
Accuracy:  0.77054545 (+/-0.03273)
