In [6]:
import numpy as np
arr = np.genfromtxt('data/chips.csv', delimiter=',')
xs, ys = arr[:,:2].copy(), np.int32(arr[:, 2:].ravel())
ys = 2 * ys - 1

In [7]:
def f1_score(predicted, true):
    tp = ((predicted == true) * (predicted == 1)).sum()
    tn = ((predicted == true) * (predicted == -1)).sum()
    fp = ((predicted != true) * (predicted == 1)).sum()
    fn = ((predicted != true) * (predicted == -1)).sum()
    p = tp*1.0/max(1, tp+fp)
    r = tp*1.0/max(1, tp+fn)
    return (2*p*r/max(1, p+r), tp, fn, fp, tn)

In [22]:
def shuffle(X, y):
    p = np.arange(0, X.shape[0])
    np.random.shuffle(p)
    return (X[p], y[p])
xs, ys = shuffle(xs, ys)

In [9]:
def split(X, y, part=10):
    p = X.shape[0] // part
    return X[p:], y[p:], X[:p], y[:p]

In [10]:
from sklearn.cross_validation import KFold
def kf_cross_validation(regressor, xs, ys, n_fold=10, times=1, **params):
    measure = lambda x, y: f1_score(x, y)[0]
    res = 0
    for _ in range(times):
        kf = KFold(len(xs), n_fold, True, 0)
        kf_sum = 0
        fold_num = 0
        for train_i, test_i in kf:
            regressor.fit(xs[train_i], ys[train_i], **params)
            predicted = regressor.predict(xs[test_i])
            meas = measure(predicted, ys[test_i])
            fold_num += 1
            kf_sum += meas
            print(ys[test_i])
            print("Time {}, fold {} done, measure = {}".format(_, fold_num, meas))
        res += kf_sum/n_fold
    return res/times

In [11]:
class SVM_SGD:
    def __init__(self, phi=None):
        self.phi = phi
        
    def fit(self, X, Y, C, lr=0.01, eps=1e-5, iters=1000):#K(x1, x2)=<phi(x1), phi(x2)>
        if self.phi is not None:
            X = np.apply_along_axis(self.phi, 1, X)
        teta = 1.0 / (2 * C)
        n = X.shape[0]
        dim = X.shape[1]

        last_obj= -np.inf
        t = lr
        lr = 1.

        #init value
        w = np.zeros(dim)
        w0 = 0
        for it in range(iters):
            for x,y in zip(X, Y):
                margin = y * (np.dot(x, w) - w0)
                sl = np.maximum(1 - margin, 0)
                w = w - lr * (-sl * y * x + 2 * teta * w)
                w0 = w0 - lr * (sl * y)
            margin = (np.dot(X, w) - w0) * Y
            cur_obj = np.maximum(1 - margin, 0).sum() + teta * np.dot(w, w)
            if abs(last_obj - cur_obj) < eps:
                break
            last_obj = cur_obj

            lr = lr*(1+lr*t*it)**-1
        self.w, self.w0 = w, w0
        #print("w = {}, w0 = {}".format(w, w0))
    
    def predict(self, x):
        if self.phi is not None:
            x = np.apply_along_axis(self.phi, 1, x)
        return np.ndarray.astype(np.sign(np.dot(x, self.w) - self.w0), np.int32)

In [12]:
def print_f1Nmatrix(t):
    print("confusion matrix")
    print(t[1], t[2])
    print(t[3], t[4])
    print("f1_score = ", t[0])

In [13]:
print("No kernel, C = 100")
svm = SVM_SGD()
svm.fit(xs, ys, 100)
pr = svm.predict(xs)
print_f1Nmatrix(f1_score(pr, ys))

No kernel, C = 100
confusion matrix
30 28
26 34
f1_score =  0.526315789474


In [23]:
print("phi (x, y, x**2+y**2)")
svm = SVM_SGD(lambda x: np.array([x[0], x[1], x[0]**2+x[1]**2]))
for C in [1, 10, 100, 1000, 10000]:
    print("C =", C)
    print("f1_score =", kf_cross_validation(svm, xs, ys, C=C))
    print("--------------------------------------------------")

phi (x, y, x**2+y**2)
C = 1
[ 1 -1 -1  1  1  1  1  1  1 -1 -1 -1]
Time 0, fold 1 done, measure = 0.0
[ 1 -1  1  1  1  1 -1 -1  1  1  1 -1]
Time 0, fold 2 done, measure = 0.0
[-1  1  1  1  1 -1 -1  1 -1  1  1  1]
Time 0, fold 3 done, measure = 0.0
[ 1  1 -1  1 -1 -1 -1  1 -1 -1  1 -1]
Time 0, fold 4 done, measure = 0.0
[ 1  1  1 -1 -1  1  1 -1 -1  1 -1  1]
Time 0, fold 5 done, measure = 0.0
[-1 -1 -1 -1  1  1  1  1  1 -1 -1 -1]
Time 0, fold 6 done, measure = 0.0
[-1  1  1  1  1 -1 -1 -1  1 -1  1 -1]
Time 0, fold 7 done, measure = 0.0
[ 1 -1 -1 -1 -1 -1  1  1 -1 -1  1 -1]
Time 0, fold 8 done, measure = 0.0
[ 1 -1  1 -1 -1  1 -1  1 -1  1 -1]
Time 0, fold 9 done, measure = 0.0
[-1 -1 -1 -1  1 -1 -1  1  1 -1 -1]
Time 0, fold 10 done, measure = 0.0
f1_score = 0.0
--------------------------------------------------
C = 10
[ 1 -1 -1  1  1  1  1  1  1 -1 -1 -1]
Time 0, fold 1 done, measure = 0.7692307692307692
[ 1 -1  1  1  1  1 -1 -1  1  1  1 -1]
Time 0, fold 2 done, measure = 0.799999999999999

In [26]:
print("Full dataset")
svm.fit(xs, ys, 1000)
prediction_svm = svm.predict(xs)
print_f1Nmatrix(f1_score(prediction_svm, ys))
print("--------------------------------------------------")

Full dataset
confusion matrix
46 12
11 49
f1_score =  0.8
--------------------------------------------------


In [27]:
import knn
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def wilcoxon(x, y):
    assert len(x) == len(y)
    d = x - y
    d = np.compress(np.not_equal(d, 0), d, axis=-1)
    count = len(d)
    assert count >= 10    
    r = np.sort(abs(d))
    nr = len(r)
    for i in range(0, nr):
        j = i
        while j < nr and r[i] == r[j]:
            j += 1
        r[i:j] = (j*(j-1)/2 - i*(i-1)/2) / (j - i)
    W = sum(r * np.sign(d))
    z_score = W / np.sqrt(nr*(nr+1)*(2*nr+1)/6)
    return W, z_score

def weight1(x):
    return x/x

print("Wilcoxon")
X_train, y_train, X_test, y_test = xs, ys, xs, ys #split(xs, ys, 10)
scaler=StandardScaler(copy=True, with_mean=True, with_std=True)
prediction_knn = knn.batchKnn(X_train, y_train, 3, weight_f=weight1, metric_f='l2')
print(wilcoxon(prediction_knn, prediction_svm))

Wilcoxon
(106, 1.5142857142857142)
