In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

In [2]:
# Load the dataset
train = pd.read_csv('./arcene_train.data',header=None,sep=' ')
del train[10000] ## Some read problem wasr=there
train=train.as_matrix() ## Convert to numpy array

In [3]:
## Read Lables
lines = open('./arcene_train.labels').read().splitlines()
train_lables=lines

In [4]:
########## HELPER FUNCTIONS
def rbfkernel(gamma, distance):
    return np.exp(-gamma * distance)

def inverse_squareform(matrix):
    inv_sqfrm = []
    for i in range(len(matrix)):
        for j in range(i+1, len(matrix[i])):
            inv_sqfrm.append(matrix[i][j])
    inv_sqfrm = np.array(inv_sqfrm)
    return inv_sqfrm

def find_distance_matrix(data):
    euclid_distance = []
    for i in data:
        distance = []
        for j in data:
            distance.append(np.linalg.norm(i - j) * np.linalg.norm(i - j))
        distance = np.array(distance)
        euclid_distance.append(distance)
    euclid_distance = np.array(euclid_distance)
    return euclid_distance

In [5]:
# ## Build POLYNOMIAL KERNEL Function
# K=np.zeros(shape=(100,100))
# for i in range(0,len(train)):
#     for j in range(0,len(train)):
#         k = 1+np.dot(train[i].T,train[j]) ## K(i,j) = ( 1 + x(i).T . x(j) )^p
#         k=math.pow(k,2)
#         K[i][j]=k

## Build Kernel Matrix
# calculate euclidean distance matrix
distance_matrix = find_distance_matrix(train)

# find variance of one dimensional distance list
variance = np.var(inverse_squareform(distance_matrix))

# calculate kernel (using rbfkernel)
gamma = 1/(2*variance)
K = rbfkernel(gamma, distance_matrix)

In [6]:
## Split train data according to classes
#Add class lables to data
df=pd.DataFrame(train)
tmp=pd.Series(train_lables)
df.loc[:,10000]=tmp
#Now split as per lables
df0=df[df[10000]=='-1'] 
df1=df[df[10000]=='1']  

## Remove class lables now
del df0[10000] ## DF containing class -1 
del df1[10000] ## DF containing class +1
## convert to numpy arrays
class0=df0.as_matrix()
class1=df1.as_matrix()

In [7]:
# calculate indexes of data points of two class
index1 = []
index2 = []
for i in range(len(train_lables)):
    if train_lables[i] == '-1':
        index1.append(i)
    else:
        index2.append(i)
    
# calculate class based K1 and K2 for calculation of N (Within class scatter)
K1 = []
K2 = []
for i in K:
    temp1 = []
    temp2 = []
    for j in index1:
        temp1.append(i[j])
    for j in index2:
        temp2.append(i[j])
    K1.append(np.array(temp1))
    K2.append(np.array(temp2))

K1 = np.array(K1)
K2 = np.array(K2)

l1=len(class0)
l2=len(class1)
    
# calculate A = I - 1lj for calc of N 
A1 = np.identity(l1) - ((1/float(l1)) * np.ones((l1, l1)))
A2 = np.identity(l2) - ((1/float(l2)) * np.ones((l2, l2)))

# calculate within class scatter matrix N
N1 = np.dot(A1, K1.T)
N1 = np.dot(K1, N1)

N2 = np.dot(A2, K2.T)
N2 = np.dot(K2, N2)

N = N1 + N2

# calculate N inverse for alpha calculation
N_inv = np.linalg.inv(N)

# calculate M1 and M2
M1 = []
M2 = []
for i in range(len(K1)):
    M1.append(np.sum(K1[i])/float(l1))
for i in range(len(K2)):
    M2.append(np.sum(K2[i])/float(l2))
M1 = np.array(M1)
M2 = np.array(M2)

# calculating alpha
M_diff = M2 - M1
alpha = np.dot(N_inv, M_diff)

### Calculate Projection
Y = []
for i in K:
    temp = 0
    for j in range(len(i)):
        temp += alpha[j] * i[j]
    Y.append(temp)
Y = np.array(Y)

In [8]:
import matplotlib.pyplot as plt
# class1 = plt.plot([Data[i] for i in range(len(Data)) if labels[i] == -1], 'ro')
# class2 = plt.plot([Data[i] for i in range(len(Data)) if labels[i] == 1], 'ro')
# plt.setp(class1, color='red')
# plt.setp(class2, color='blue')

C1 = [Y[i] for i in range(len(Y)) if train_lables[i] == '-1']
C2 = [Y[i] for i in range(len(Y)) if train_lables[i] == '1']
plt.scatter(C1, len(C1)*[0], color ='red')
plt.scatter(C2, len(C2)*[0], color ='blue')
# plt.scatter(Y, len(Y)*[0], color ='red') ## One colour
plt.show()

In [9]:
######### NOW DO FOR TEST DATA ################
## Load the dataset
test = pd.read_csv('./arcene_valid.data',header=None,sep=' ')
del test[10000] ## Some read problem wasthere
test=test.as_matrix() ## Convert to numpy array

## Read Lables
lines = open('./arcene_valid.labels').read().splitlines()
test_lables=lines


In [10]:
# Ktest=np.zeros(shape=(100,100))
# for i in range(0,len(test)):
#     for j in range(0,len(train)):
#         ktest = 1+np.dot(test[i].T,train[j]) ## K(i,j) = ( 1 + x(i).T . x(j) )^p
#         ktest=math.pow(ktest,2)
#         Ktest[i][j]=ktest

## By RBF Kernel
Ktest= []
for i in test:
        dist = np.array([np.sum((i - row)**2) for row in train])
        k = np.exp(-gamma * dist)
        Ktest.append(k)

Ytest = []
for i in Ktest:
        temp = 0
        for j in range(len(i)):
            temp += alpha[j] * i[j]
        Ytest.append(temp)
Ytest= np.array(Ytest)

In [11]:
import matplotlib.pyplot as plt

C1 = [Ytest[i] for i in range(len(Ytest)) if test_lables[i] == '-1']
C2 = [Ytest[i] for i in range(len(Ytest)) if test_lables[i] == '1']
plt.scatter(C1, len(C1)*[0], color ='red')
plt.scatter(C2, len(C2)*[0], color ='blue')
# plt.scatter(Ytest, len(Y)*[0], color ='red') ## One colour
plt.show()

In [10]:
############# APPLYING SVM ####################
from sklearn import svm
clf = svm.SVC(kernel='rbf',gamma=0.01) ## use linear too
# clf = svm.SVC(kernel='linear') ## use linear too
clf.fit( Y.reshape(100,1),train_lables) ## Y is the transformed data
clf.score(Ytest.reshape(100,1),test_lables) ## Ytest is transformed test data

0.56000000000000005