In [67]:
import numpy as np
from IPython.display import display, clear_output
arr = np.genfromtxt('data/chips.csv', delimiter=',')
xs, ys = arr[:,:2].copy(), np.int32(arr[:, 2:].ravel())
ys = 2 * ys - 1

In [68]:
def f1_score(predicted, true):
    tp = ((predicted == true) * (predicted == 1)).sum()
    tn = ((predicted == true) * (predicted == -1)).sum()
    fp = ((predicted != true) * (predicted == 1)).sum()
    fn = ((predicted != true) * (predicted == -1)).sum()
    p = tp*1.0/max(1, tp+fp)
    r = tp*1.0/max(1, tp+fn)
    return (2*p*r/max(1, p+r), tp, fn, fp, tn)

In [69]:
def shuffle(X, y):
    p = np.arange(0, X.shape[0])
    np.random.shuffle(p)
    return (X[p], y[p])
xs, ys = shuffle(xs, ys)

In [70]:
def split(X, y, part=10):
    p = X.shape[0] // part
    return X[p:], y[p:], X[:p], y[:p]

In [85]:
from sklearn.cross_validation import KFold
def kf_cross_validation(regressor, xs, ys, n_fold=10, kf=None, **params):
    measure = lambda x, y: f1_score(x, y)[0]
    kf_sum = 0
    f1_scores = []
    if kf is None:
        kf = KFold(len(xs), n_fold, True, 0)
    fold_num = 0
    for train_i, test_i in kf:
        regressor.fit(xs[train_i], ys[train_i], **params)
        predicted = regressor.predict(xs[test_i])
        meas = measure(predicted, ys[test_i])
        f1_scores.append(meas)
        kf_sum += meas
        fold_num += 1
        print("Fold {} done, measure = {}".format(fold_num, meas))
    return (kf_sum/len(kf), np.array(f1_scores)*100)

In [72]:
class SVM_SGD:
    def __init__(self, phi=None):
        self.phi = phi
        
    def fit(self, X, Y, C, lr=0.01, eps=1e-5, iters=1000):#K(x1, x2)=<phi(x1), phi(x2)>
        if self.phi is not None:
            X = np.apply_along_axis(self.phi, 1, X)
        teta = 1.0 / (2 * C)
        n = X.shape[0]
        dim = X.shape[1]

        last_obj= -np.inf
        t = lr
        lr = 1.

        #init value
        w = np.zeros(dim)
        w0 = 0
        for it in range(iters):
            for x,y in zip(X, Y):
                margin = y * (np.dot(x, w) - w0)
                sl = np.maximum(1 - margin, 0)
                w = w - lr * (-sl * y * x + 2 * teta * w)
                w0 = w0 - lr * (sl * y)
            margin = (np.dot(X, w) - w0) * Y
            cur_obj = np.maximum(1 - margin, 0).sum() + teta * np.dot(w, w)
            if abs(last_obj - cur_obj) < eps:
                break
            last_obj = cur_obj

            lr = lr*(1+lr*t*it)**-1
        self.w, self.w0 = w, w0
    
    def predict(self, x):
        if self.phi is not None:
            x = np.apply_along_axis(self.phi, 1, x)
        return np.ndarray.astype(np.sign(np.dot(x, self.w) - self.w0), np.int32)

In [73]:
def print_f1Nmatrix(t):
    print("confusion matrix")
    print(t[1], t[2])
    print(t[3], t[4])
    print("f1_score = ", t[0])

In [74]:
print("No kernel, C = 100")
svm = SVM_SGD()
svm.fit(xs, ys, 100)
pr = svm.predict(xs)
print_f1Nmatrix(f1_score(pr, ys))

No kernel, C = 100
confusion matrix
29 29
25 35
f1_score =  0.517857142857


In [75]:
print("phi (x, y, x**2+y**2, 2xy)")
svm = SVM_SGD(lambda x: np.array([x[0], x[1], x[0]**2+x[1]**2, x[0]**2+2*x[0]*x[1]+x[1]**2]))
for C in [2, 10, 100, 1000]:
    print("C =", C)
    print("f1_score =", kf_cross_validation(svm, xs, ys, C=C)[0])
    print("--------------------------------------------------")

phi (x, y, x**2+y**2, 2xy)
C = 2
Fold 1 done, measure = 0.0
Fold 2 done, measure = 0.6666666666666666
Fold 3 done, measure = 0.4
Fold 4 done, measure = 0.0
Fold 5 done, measure = 0.0
Fold 6 done, measure = 0.0
Fold 7 done, measure = 0.0
Fold 8 done, measure = 0.0
Fold 9 done, measure = 0.0
Fold 10 done, measure = 0.0
f1_score = 0.106666666667
--------------------------------------------------
C = 10
Fold 1 done, measure = 0.823529411764706
Fold 2 done, measure = 0.7142857142857143
Fold 3 done, measure = 0.6666666666666666
Fold 4 done, measure = 0.888888888888889
Fold 5 done, measure = 0.8750000000000001
Fold 6 done, measure = 0.923076923076923
Fold 7 done, measure = 0.9333333333333333
Fold 8 done, measure = 0.7692307692307692
Fold 9 done, measure = 0.8571428571428571
Fold 10 done, measure = 0.5714285714285715
f1_score = 0.802258313582
--------------------------------------------------
C = 100
Fold 1 done, measure = 0.8750000000000001
Fold 2 done, measure = 0.8
Fold 3 done, measure = 0.

In [76]:
print("Full dataset")
svm.fit(xs, ys, 1000)
prediction_svm = svm.predict(xs)
print_f1Nmatrix(f1_score(prediction_svm, ys))
print("--------------------------------------------------")

Full dataset
confusion matrix
48 10
9 51
f1_score =  0.834782608696
--------------------------------------------------


In [86]:
def wilcoxon(x, y):
    assert len(x) == len(y)
    d = x - y
    d = np.compress(np.not_equal(d, 0), d, axis=-1)
    count = len(d)  
    r = np.sort(abs(d))
    nr = len(r)
    for i in range(0, nr):
        j = i
        while j < nr and r[i] == r[j]:
            j += 1
        r[i:j] = (j*(j-1)/2 - i*(i-1)/2) / (j - i)
    W = sum(r * np.sign(d))
    z_score = W / np.sqrt(nr*(nr+1)*(2*nr+1)/6)
    return W, z_score

def weight1(x):
    return x/x

In [90]:
import knn
n_fold = 7
kf = KFold(len(xs), n_fold, True, 0)

knn = knn.KNN()
f1_knn, f1_scoresKnn = kf_cross_validation(knn, xs, ys, kf=kf, k=3, weight_f=weight1, metric_f='l2')
print("KNN f1_score:", f1_knn)
f1_svm, f1_scoresSvm = kf_cross_validation(svm, xs, ys, kf=kf, C=1000)
print("SVM f1_score:", f1_svm)

Fold 1 done, measure = 0.8571428571428572
Fold 2 done, measure = 0.631578947368421
Fold 3 done, measure = 0.6666666666666666
Fold 4 done, measure = 0.9090909090909091
Fold 5 done, measure = 0.8
Fold 6 done, measure = 0.625
Fold 7 done, measure = 0.5454545454545454
KNN f1_score: 0.719276275103
Fold 1 done, measure = 0.9
Fold 2 done, measure = 0.7777777777777778
Fold 3 done, measure = 0.75
Fold 4 done, measure = 0.9523809523809523
Fold 5 done, measure = 0.823529411764706
Fold 6 done, measure = 0.7999999999999999
Fold 7 done, measure = 0.8
SVM f1_score: 0.829098305989


In [91]:
print("Wilcoxon")
wilcoxon(f1_scoresSvm, f1_scoresKnn)

Wilcoxon


(21.0, 1.7748239349298849)

In [92]:
pValueTable = [
[0, 0.995, 0.975, 0.20, 0.10, 0.05, 0.025, 0.02, 0.01, 0.005, 0.002, 0.001],
[1, 0.0000393, 0.000982, 1.642, 2.706, 3.841, 5.024, 5.412, 6.635, 7.879, 9.550, 10.828],
[2, 0.0100, 0.0506, 3.219, 4.605, 5.991, 7.378, 7.824, 9.210, 10.597, 12.429, 13.816],
[3, 0.0717, 0.216, 4.642, 6.251, 7.815, 9.348, 9.837, 11.345, 12.838, 14.796, 16.266],
[4, 0.207, 0.484, 5.989, 7.779, 9.488, 11.143, 11.668, 13.277, 14.860, 16.924, 18.467],
[5, 0.412, 0.831, 7.289, 9.236, 11.070, 12.833, 13.388, 15.086, 16.750, 18.907, 20.515],
[6, 0.676, 1.237, 8.558, 10.645, 12.592, 14.449, 15.033, 16.812, 18.548, 20.791, 22.458],
[7, 0.989, 1.690, 9.803, 12.017, 14.067, 16.013, 16.622, 18.475, 20.278, 22.601, 24.322],
[8, 1.344, 2.180, 11.030, 13.362, 15.507, 17.535, 18.168, 20.090, 21.955, 24.352, 26.124],
[9, 1.735, 2.700, 12.242, 14.684, 16.919, 19.023, 19.679, 21.666, 23.589, 26.056, 27.877],
[10, 2.156, 3.247, 13.442, 15.987, 18.307, 20.483, 21.161, 23.209, 25.188, 27.722, 29.588]
]

def chiSquare(exp, obs):
    x2 = ((obs-exp)**2/exp).sum()
    print("x^2 =", x2)
    r = len(exp) - 1
    j = 1
    while x2 > pValueTable[r][j]:
        j += 1
    return pValueTable[0][j]
print("pValue")
print(chiSquare(f1_scoresKnn, f1_scoresSvm))

pValue
x^2 = 21.6943196607
0.001
