In [1]:
import numpy as np
from sklearn import svm
import time

In [2]:
# Locations of the three Fashion-MNIST CSV splits.
train_file = "C:/IITD/sem5/col774-ml/datasets/fmnist_data/fashion_mnist/train.csv"
test_file = "C:/IITD/sem5/col774-ml/datasets/fmnist_data/fashion_mnist/test.csv"
val_file = "C:/IITD/sem5/col774-ml/datasets/fmnist_data/fashion_mnist/val.csv"

# Parse every split as a comma-separated numeric table, in the order
# train -> test -> val. The generator's loop variable stays local to the
# expression, so no extra notebook-level name is created.
train_data, test_data, val_data = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (train_file, test_file, val_file)
)

In [3]:
    # Split the training table into labels (last column) and features
    # (every other column).
    y = train_data[:, -1]  # labels
    x = train_data[:, :-1]  # features; a slice view, so it shares memory with train_data
    # Divide by the largest feature value so the maximum becomes 1.
    # The division is in place and therefore also rewrites train_data.
    x /= x.max()

In [4]:
    # Test split: labels are the last column, features are all the others.
    y_test = test_data[:, -1]  # labels
    x_test = test_data[:, :-1]  # features; a slice view into test_data
    # In-place division by the split's own maximum (also rewrites test_data).
    # NOTE(review): each split is normalised by its own maximum rather than
    # the training maximum - confirm all splits share the same value range.
    x_test /= x_test.max()
    m_test = len(y_test)  # number of test samples

    # Validation split, handled exactly like the test split.
    y_val = val_data[:, -1]  # labels
    x_val = val_data[:, :-1]  # features; a slice view into val_data
    x_val /= x_val.max()  # in-place, also rewrites val_data
    m_val = len(y_val)  # number of validation samples

In [5]:
# Fit an RBF-kernel SVM on the scaled training features and print how many
# seconds the construct-and-fit step took.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; unlike time.time() it cannot be skewed by a system clock
# adjustment while the fit is running.
t0 = time.perf_counter()
# verbose=True is what produces the "[LibSVM]" prefix in the cell output;
# max_iter=3000 is passed to SVC to bound the solver's iterations.
clf = svm.SVC(C=1, gamma=0.05, kernel='rbf', verbose=True, max_iter=3000)
clf.fit(x, y)
t1 = time.perf_counter()
print(t1-t0)

[LibSVM]217.61480712890625


In [7]:
def indicator(exp):
    """Indicator function: return 1 when ``exp`` is truthy, otherwise 0."""
    if exp:
        return 1
    return 0

In [8]:
# test set accuracy

# Predict the whole test set with a single batched call. The previous version
# called clf.predict once per sample and allocated the result with `np.int`,
# an alias that has been removed from NumPy (use the builtin `int` instead).
# astype(int) keeps test_predictions an integer array so it can still be used
# for indexing (e.g. when building a confusion matrix).
test_predictions = clf.predict(x_test).astype(int)

# Number of samples whose predicted label equals the true label.
test_count = int((test_predictions == y_test).sum())

print("Test set accuracy", (test_count / m_test) * 100)

Test set accuracy 88.08


In [9]:
# validation set accuracy

# Fraction of validation samples classified correctly, computed from one
# batched predict() call instead of one call per sample. Taking the mean of
# the element-wise comparison keeps the denominator tied to the arrays that
# are actually evaluated.
val_acc = float((clf.predict(x_val) == y_val).mean())
print(val_acc * 100)

87.92


In [41]:
# confusion matrix
# Orientation: confusion_matrix[p, a] counts the test samples that were
# predicted as class p while their true label is class a
# (rows = predicted class, columns = actual class).
total_classes = 10
# `np.int` has been removed from NumPy; the builtin `int` selects the same
# default integer dtype.
confusion_matrix = np.zeros((total_classes, total_classes), int)

# The labels are stored as floats, so cast them once to integers before
# using them as column indices.
for predicted, actual in zip(test_predictions, y_test.astype(int)):
    confusion_matrix[predicted, actual] += 1

print(confusion_matrix)

[[433   1   5  12   3   0  80   0   1   0]
 [  0 482   0   0   1   0   0   0   0   0]
 [  5   4 411   3  41   0  55   0   1   0]
 [ 11   9   7 457  13   0   9   0   1   0]
 [  3   0  37   9 399   0  34   0   2   0]
 [  0   0   0   0   0 473   0  14   2  11]
 [ 38   4  32  14  38   0 315   0   2   0]
 [  0   0   0   0   0  16   0 471   2  14]
 [ 10   0   8   5   5   5   7   1 489   1]
 [  0   0   0   0   0   6   0  14   0 474]]


In [7]:
# K-fold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

# Every candidate C is scored on each of the 5 folds of the training data.
# The held-out fold, the per-fold model and the per-fold score use their own
# names (x_fold_val, y_fold_val, fold_clf, fold_acc) so that running this cell
# does not overwrite the notebook-level clf, x_val, y_val and val_acc that
# the other cells define and read.
for C in [1e-3, 1, 5]:
    print("C", C)
    for train_idx, val_idx in kf.split(x):
        print("%s %s" % (train_idx, val_idx))
        x_train, y_train = x[train_idx], y[train_idx]
        x_fold_val, y_fold_val = x[val_idx], y[val_idx]
        fold_clf = svm.SVC(C=C, gamma=0.05, kernel='rbf', verbose=True, max_iter=200)
        fold_clf.fit(x_train, y_train)
        # Score the held-out fold with one batched predict() call rather than
        # one call per sample.
        fold_acc = float((fold_clf.predict(x_fold_val) == y_fold_val).mean())
        print("acc", C, fold_acc)

C 0.001
[ 4500  4501  4502 ... 22497 22498 22499] [   0    1    2 ... 4497 4498 4499]
[LibSVM]acc 0.001 0.6804444444444444
[    0     1     2 ... 22497 22498 22499] [4500 4501 4502 ... 8997 8998 8999]
[LibSVM]acc 0.001 0.6466666666666666
[    0     1     2 ... 22497 22498 22499] [ 9000  9001  9002 ... 13497 13498 13499]
[LibSVM]