In [1]:
import numpy as np
from IPython.display import display, clear_output
arr = np.genfromtxt('data/chips.csv', delimiter=',')
xs, ys = arr[:,:2].copy(), np.int32(arr[:, 2:].ravel())
ys = 2 * ys - 1

In [2]:
def f1_score(predicted, true):
    tp = ((predicted == true) * (predicted == 1)).sum()
    tn = ((predicted == true) * (predicted == -1)).sum()
    fp = ((predicted != true) * (predicted == 1)).sum()
    fn = ((predicted != true) * (predicted == -1)).sum()
    p = tp*1.0/max(1, tp+fp)
    r = tp*1.0/max(1, tp+fn)
    return (2*p*r/max(1, p+r), tp, fn, fp, tn)

In [3]:
def shuffle(X, y):
    p = np.arange(0, X.shape[0])
    np.random.shuffle(p)
    return (X[p], y[p])
xs, ys = shuffle(xs, ys)

In [4]:
def split(X, y, part=10):
    p = X.shape[0] // part
    return X[p:], y[p:], X[:p], y[:p]

In [5]:
from sklearn.cross_validation import KFold
def kf_cross_validation(regressor, xs, ys, n_fold=10, kf=None, **params):
    measure = lambda x, y: f1_score(x, y)[0]
    kf_sum = 0
    f1_scores = []
    if kf is None:
        kf = KFold(len(xs), n_fold, True, 0)
    fold_num = 0
    for train_i, test_i in kf:
        regressor.fit(xs[train_i], ys[train_i], **params)
        predicted = regressor.predict(xs[test_i])
        meas = measure(predicted, ys[test_i])
        f1_scores.append(meas)
        kf_sum += meas
        fold_num += 1
        print("Fold {} done, measure = {}".format(fold_num, meas))
    return (kf_sum/len(kf), np.array(f1_scores))

In [6]:
class SVM_SGD:
    def __init__(self, phi=None):
        self.phi = phi
        
    def fit(self, X, Y, C, lr=0.01, eps=1e-5, iters=1000):#K(x1, x2)=<phi(x1), phi(x2)>
        if self.phi is not None:
            X = np.apply_along_axis(self.phi, 1, X)
        teta = 1.0 / (2 * C)
        n = X.shape[0]
        dim = X.shape[1]

        last_obj= -np.inf
        t = lr
        lr = 1.

        #init value
        w = np.zeros(dim)
        w0 = 0
        for it in range(iters):
            for x,y in zip(X, Y):
                margin = y * (np.dot(x, w) - w0)
                sl = np.maximum(1 - margin, 0)
                w = w - lr * (-sl * y * x + 2 * teta * w)
                w0 = w0 - lr * (sl * y)
            margin = (np.dot(X, w) - w0) * Y
            cur_obj = np.maximum(1 - margin, 0).sum() + teta * np.dot(w, w)
            if abs(last_obj - cur_obj) < eps:
                break
            last_obj = cur_obj

            lr = lr*(1+lr*t*it)**-1
        self.w, self.w0 = w, w0
    
    def predict(self, x):
        if self.phi is not None:
            x = np.apply_along_axis(self.phi, 1, x)
        return np.ndarray.astype(np.sign(np.dot(x, self.w) - self.w0), np.int32)

In [7]:
def print_f1Nmatrix(t):
    print("confusion matrix")
    print(t[1], t[2])
    print(t[3], t[4])
    print("f1_score = ", t[0])

In [8]:
print("No kernel, C = 100")
svm = SVM_SGD()
svm.fit(xs, ys, 100)
pr = svm.predict(xs)
print_f1Nmatrix(f1_score(pr, ys))

No kernel, C = 100
confusion matrix
29 29
25 35
f1_score =  0.517857142857


In [19]:
print("phi (x, y, x**2+y**2, 2xy)")
svm = SVM_SGD(lambda x: np.array([x[0], x[1], x[0]**2+x[1]**2, x[0]**2+2*x[0]*x[1]+x[1]**2]))
for C in [2, 10, 100, 1000]:
    print("C =", C)
    print("f1_score =", kf_cross_validation(svm, xs, ys, C=C)[0])
    print("--------------------------------------------------")

phi (x, y, x**2+y**2, 2xy)
C = 2
Fold 1 done, measure = 0.6666666666666666
Fold 2 done, measure = 0.6666666666666666
Fold 3 done, measure = 0.5882352941176471
Fold 4 done, measure = 0.5882352941176471
Fold 5 done, measure = 0.6666666666666666
Fold 6 done, measure = 0.8
Fold 7 done, measure = 0.5882352941176471
Fold 8 done, measure = 0.5882352941176471
Fold 9 done, measure = 0.625
Fold 10 done, measure = 0.7777777777777778
f1_score = 0.655571895425
--------------------------------------------------
C = 10
Fold 1 done, measure = 1.0
Fold 2 done, measure = 0.8571428571428571
Fold 3 done, measure = 0.7272727272727272
Fold 4 done, measure = 0.888888888888889
Fold 5 done, measure = 0.8333333333333334
Fold 6 done, measure = 0.8571428571428571
Fold 7 done, measure = 0.9090909090909091
Fold 8 done, measure = 0.7142857142857143
Fold 9 done, measure = 0.6153846153846154
Fold 10 done, measure = 0.7142857142857143
f1_score = 0.811682761683
--------------------------------------------------
C = 100


In [10]:
print("Full dataset")
svm.fit(xs, ys, 1000)
prediction_svm = svm.predict(xs)
print_f1Nmatrix(f1_score(prediction_svm, ys))
print("--------------------------------------------------")

Full dataset
confusion matrix
46 12
9 51
f1_score =  0.814159292035
--------------------------------------------------


In [17]:
def wilcoxon(x, y):
    assert len(x) == len(y)
    d = x - y
    d = np.compress(np.not_equal(d, 0), d, axis=-1)
    count = len(d)  
    r = np.sort(abs(d))
    nr = len(r)
    for i in range(0, nr):
        j = i
        while j < nr and r[i] == r[j]:
            j += 1
        r[i:j] = (j*(j-1)/2 - i*(i-1)/2) / (j - i)
    W = sum(r * np.sign(d))
    z_score = W / np.sqrt(nr*(nr+1)*(2*nr+1)/6)
    return W, z_score

def weight1(x):
    return x/x

In [20]:
print("Compare KNN and SVM")
import knn
n_fold = 10
kf = KFold(len(xs), n_fold, True, 0)

knn = knn.KNN()
f1_knn, f1_scoresKnn = kf_cross_validation(knn, xs, ys, kf=kf, k=3, weight_f=weight1, metric_f='l2')
print("KNN f1_score:", f1_knn)
f1_svm, f1_scoresSvm = kf_cross_validation(svm, xs, ys, kf=kf, C=1000)
print("SVM f1_score:", f1_svm)
clear_output()

In [21]:
print("Wilcoxon")
wilcoxon(f1_scoresSvm, f1_scoresKnn)

Wilcoxon


(9.0, 0.94345635304972653)