In [95]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hwutils as util
import tqdm

from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from collections import namedtuple, defaultdict

%matplotlib inline

In [2]:
# Development aid: reload the already-imported hwutils module (bound to
# `util`) so that edits to it take effect without restarting the kernel.
import importlib
importlib.reload(util)

<module 'hwutils' from '/home/elama/Projects/cs1156x/hwutils.py'>

In [21]:
# Relative paths of the training and test feature files loaded via read_data.
train_data_path = './data/features.train'
test_data_path = './data/features.test'

def read_data(path):
    """Load a whitespace-delimited, header-less feature file into a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the three columns named
        ``'digit'``, ``'intensity'`` and ``'symmetry'`` in that order.
    """
    # Raw string literal: in a normal string the backslash-s pair is an invalid
    # escape sequence, which recent Python versions warn about.
    return pd.read_csv(path, delimiter=r'\s+', header=None,
                       names=['digit', 'intensity', 'symmetry'])

# Read both splits with the shared reader, training file first.
digits_train, digits_test = (
    read_data(split_path) for split_path in (train_data_path, test_data_path)
)

# Names of the two input columns used as model features.
features = ['intensity', 'symmetry']

## SVM with Soft Margins

### Polynomial Kernels

In [23]:
# Column names of the ten one-vs-all targets: '0-vs-all' ... '9-vs-all'.
labels = [str(d) + '-vs-all' for d in range(10)]

# Add the binary target columns to both splits (the frames are modified in place).
for df in [digits_train, digits_test]:
    for digit in range(10):
        # +1 where the row's digit equals `digit`, -1 everywhere else.
        # A dict lookup via Series.map replaces the per-element Python lambda.
        df[labels[digit]] = (df['digit'] == digit).map({True: 1, False: -1})
    # +1 for digit 1, -1 for digit 5. Any other digit has no entry in the
    # mapping and becomes NaN, which the later cells filter out with notnull().
    df['1-vs-5'] = df['digit'].map({1: 1, 5: -1})

In [36]:
# Degree-2 polynomial-kernel SVM reused for every fit in this cell.
# NOTE(review): gamma and coef0 are left at the library defaults here, whereas
# the later polynomial-kernel cells pass gamma=1, coef0=1 — confirm intended.
svc = SVC(kernel='poly', degree=2, C=.01)

X = digits_train[features]
print('In-sample error')
# Odd digits only: 1, 3, 5, 7, 9.
for digit in range(1, 10, 2):
    y = digits_train[labels[digit]]
    predicted = svc.fit(X, y).predict(X)
    # Fraction of training rows whose prediction differs from the target.
    Ein = (predicted != y).mean()
    print('{}: {:.3f}'.format(labels[digit], Ein))

In-sample error
1-vs-all: 0.016
3-vs-all: 0.090
5-vs-all: 0.076
7-vs-all: 0.088
9-vs-all: 0.088


In [43]:
def _support_vector_count(label):
    """Refit the shared `svc` on `X` against column `label`; return the sum of `n_support_`."""
    svc.fit(X, digits_train[label])
    return svc.n_support_.sum()


# Call order is kept so that `svc` is left fitted on '1-vs-all' afterwards.
n_sv_0vsall = _support_vector_count('0-vs-all')
n_sv_1vsall = _support_vector_count('1-vs-all')
n_sv_0vsall - n_sv_1vsall

1854

In [75]:
# Keep only the rows whose '1-vs-5' target is not null in each split.
train_idx = digits_train['1-vs-5'].notnull()
test_idx = digits_test['1-vs-5'].notnull()

X_train, y_train = digits_train.loc[train_idx, features], digits_train.loc[train_idx, '1-vs-5']
X_test, y_test = digits_test.loc[test_idx, features], digits_test.loc[test_idx, '1-vs-5']

# One row per (Q, C) pair: summed n_support_ plus the misclassification
# rate on the training rows (Ein) and on the test rows (Eout).
rows = []
for Q, C in itertools.product([2, 5], [.001, .01, .1, 1.]):
    svc = SVC(kernel='poly', degree=Q, C=C, gamma=1, coef0=1)
    svc.fit(X_train, y_train)
    Ein = (svc.predict(X_train) != y_train).mean()
    Eout = (svc.predict(X_test) != y_test).mean()
    rows.append((Q, C, svc.n_support_.sum(), Ein, Eout))

result = pd.DataFrame(data=rows, columns=['Q', 'C', 'n_sv', 'Ein', 'Eout'])

In [83]:
# Display the (Q, C) grid table built in the previous cell.
result

Unnamed: 0,Q,C,n_sv,Ein,Eout
0,2,0.001,76,0.004484,0.016509
1,2,0.01,34,0.004484,0.018868
2,2,0.1,24,0.004484,0.018868
3,2,1.0,24,0.003203,0.018868
4,5,0.001,25,0.004484,0.021226
5,5,0.01,23,0.003844,0.021226
6,5,0.1,25,0.003203,0.018868
7,5,1.0,21,0.003203,0.021226


### Cross Validation

In [111]:
# Experiment settings, named so they can be tuned in one place.
CV_SEED = 0      # base seed; run i shuffles its folds with CV_SEED + i
N_RUNS = 100     # number of independent cross-validation repetitions
N_FOLDS = 10     # folds per repetition
C_CANDIDATES = [0.0001, 0.001, 0.01, 0.1, 1]

# Restrict to the rows that have a '1-vs-5' target and switch to plain arrays
# so they can be indexed with the integer fold indices below.
idx = digits_train['1-vs-5'].notnull()
X = digits_train.loc[idx, features].values
y = digits_train.loc[idx, '1-vs-5'].values

# total_results[C] counts the runs in which C achieved the lowest mean
# validation error (ties go to the smaller C).
total_results = defaultdict(int)
for run in tqdm.tqdm(range(N_RUNS)):
    # Seeding the shuffle makes the whole experiment repeatable while still
    # giving each run its own partition; without it the counts change on
    # every execution.
    # NOTE(review): this is the legacy sklearn.cross_validation call signature
    # (labels passed to the constructor, `n_folds`); newer scikit-learn releases
    # expose StratifiedKFold from sklearn.model_selection with a different API.
    skf = StratifiedKFold(y, n_folds=N_FOLDS, shuffle=True,
                          random_state=CV_SEED + run)

    run_results = []
    for C in C_CANDIDATES:
        svc = SVC(kernel='poly', degree=2, gamma=1, coef0=1, C=C)

        err_val = []
        # NOTE: these loop variables rebind train_idx / X_train / y_train,
        # which the 1-vs-5 grid cell above also assigns.
        for train_idx, val_idx in skf:
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            svc.fit(X_train, y_train)
            # Fraction of validation points that are misclassified.
            err_val.append(np.mean(svc.predict(X_val) != y_val))

        run_results.append((C, np.mean(err_val)))

    run_results = pd.DataFrame(data=run_results, columns=['C', 'Ecv'])
    # Lowest cross-validation error wins; equal errors resolve to the smaller C.
    selected_C = run_results.sort_values(by=['Ecv', 'C'], ascending=[True, True]).iloc[0]['C']

    total_results[selected_C] += 1

total_results

100%|██████████| 100/100 [00:21<00:00,  4.62it/s]


defaultdict(int, {0.001: 55, 0.01: 17, 0.10000000000000001: 13, 1.0: 15})