#Support Vector Machine for Credit Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk
from pprint import pprint

In [2]:
from scipy.stats import sem

def mean_score(scores):
    return ('Mean score: {0:.3f} (+/-{1:.3f})').format(np.mean(scores), sem(scores))

In [3]:
df = pd.read_csv('data/cs-train_clean.csv')
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'NumberOfDependents'], axis=1)
X = df.drop('SeriousDlqin2yrs', axis=1)
X = sk.preprocessing.scale(X)
y = df.SeriousDlqin2yrs

In [4]:
svc = SVC(kernel='rbf', C=1, gamma=0.001)
cv_svm = ShuffleSplit(10000, n_iter=10, test_size=0.25, random_state=0)

In [5]:
test_scores_svm = cross_val_score(svc, X, y, cv=cv_svm, n_jobs=-1)
test_scores_svm
print mean_score(test_scores_svm)

Mean score: 0.937 (+/-0.001)


while tweaking the parameters for svm, I noticed an increase of preciscion proportional to an increased number of samples. However, processing 10000 samples, already takes abaout 10 minutes on my machine.

##Grid Search

In [6]:
train, test = sk.cross_validation.train_test_split(df, train_size=0.75)


In [7]:
svc_params = {
    'C': np.logspace(-1, 2, 4),
    'gamma': np.logspace(-4, 0, 5),
}

In [8]:
pprint(svc_params)

{'C': array([   0.1,    1. ,   10. ,  100. ]),
 'gamma': array([  1.00000000e-04,   1.00000000e-03,   1.00000000e-02,
         1.00000000e-01,   1.00000000e+00])}


In [9]:
X_train = train.drop('SeriousDlqin2yrs', axis=1)
y_train = train.SeriousDlqin2yrs
n_subsamples = 10000
X_small_train, y_small_train = X_train[:n_subsamples], y_train[:n_subsamples] 

In [10]:
gs_svc = GridSearchCV(SVC(), svc_params, cv = 3, n_jobs=-1)
%time gs_svc.fit(X_small_train, y_small_train)

CPU times: user 2.92 s, sys: 212 ms, total: 3.13 s
Wall time: 47.5 s


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'C': array([   0.1,    1. ,   10. ,  100. ]), 'gamma': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [11]:
gs_svc.best_params_, gs_svc.best_score_

({'C': 1.0, 'gamma': 0.001}, 0.93379999999999996)

In [12]:
X_test = test.drop('SeriousDlqin2yrs', axis=1)
y_test = test.SeriousDlqin2yrs

In [13]:
gs_svc.score(X_test, y_test)

0.93368806652584291

In [16]:
gs_svc = SVC(kernel='rbf', C=1.0, gamma=0.0001)
gs_cv_svm = ShuffleSplit(10000, n_iter=10, test_size=0.25, random_state=0)

In [17]:
test_scores_svm = cross_val_score(gs_svc, X, y, cv=gs_cv_svm, n_jobs=-1)
test_scores_svm
print mean_score(test_scores_svm)

Mean score: 0.937 (+/-0.001)
