In [1]:

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale

# Load the letter-recognition table from a CSV file addressed by a relative
# path, so the file is looked up relative to the current working directory.
letters = pd.read_csv("letter-recognition.csv")


def hellinger_kernel(X, Y):
    """Hellinger (Bhattacharyya) kernel between the rows of ``X`` and ``Y``.

    Computes ``K[i, j] = sum_d sqrt(X[i, d] * Y[j, d])``, which is a linear
    kernel applied to the element-wise square roots of the inputs.

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_features)
    Y : array-like of shape (n_samples_Y, n_features)
        Both inputs are converted to float arrays, so numeric strings are
        accepted. Entries must be non-negative; a negative entry yields NaN
        because its square root is undefined.

    Returns
    -------
    numpy.ndarray of shape (n_samples_X, n_samples_Y)
        The Gram matrix, in the layout expected from a callable SVC kernel.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    # The square root has to be taken per feature *before* the dot product;
    # taking sqrt of the finished dot product is a different function.
    return np.dot(np.sqrt(X), np.sqrt(Y).T)

def chi_sq_kernel(X, Y):
    """Additive chi-squared kernel between the rows of ``X`` and ``Y``.

    Computes ``K[i, j] = sum_d 2 * X[i, d] * Y[j, d] / (X[i, d] + Y[j, d])``.
    A feature whose two values sum to zero contributes 0 instead of
    triggering a division by zero.

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_features)
    Y : array-like of shape (n_samples_Y, n_features)
        Both inputs are converted to float arrays. The kernel is intended for
        non-negative (histogram-like) features.

    Returns
    -------
    numpy.ndarray of shape (n_samples_X, n_samples_Y)
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    gram = np.zeros((X.shape[0], Y.shape[0]))
    # Accumulate one feature at a time so that only (n_samples_X, n_samples_Y)
    # temporaries are alive, never an (n_X, n_Y, n_features) cube.
    for d in range(X.shape[1]):
        x_col = X[:, d][:, np.newaxis]   # (n_samples_X, 1)
        y_row = Y[:, d][np.newaxis, :]   # (1, n_samples_Y)
        numer = 2.0 * x_col * y_row
        denom = x_col + y_row
        # `where` leaves the pre-filled zeros in place wherever denom == 0.
        gram += np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    return gram

def intersection_kernel(X, Y):
    """Histogram-intersection kernel between the rows of ``X`` and ``Y``.

    Computes ``K[i, j] = sum_d min(X[i, d], Y[j, d])``.

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_features)
    Y : array-like of shape (n_samples_Y, n_features)
        Both inputs are converted to float arrays. The kernel is intended for
        non-negative (histogram-like) features.

    Returns
    -------
    numpy.ndarray of shape (n_samples_X, n_samples_Y)
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    gram = np.zeros((X.shape[0], Y.shape[0]))
    # Per-feature accumulation keeps the temporaries at (n_X, n_Y). A plain
    # inner product here would just be the linear kernel, not an intersection.
    for d in range(X.shape[1]):
        gram += np.minimum.outer(X[:, d], Y[:, d])
    return gram

# Quick look at the raw table: shape, per-column info and the first rows.
# (Bare expressions such as `letters.head()` are only rendered by Jupyter
# when they are the last statement of a cell.)
print("Dimensions: ", letters.shape, "\n")
print(letters.info())
letters.head()

# Replace the file's column headers with uniform names.
letters.columns = [
    'letter',
    'xbox', 'ybox', 'width', 'height', 'onpix',
    'xbar', 'ybar', 'x2bar', 'y2bar', 'xybar',
    'x2ybar', 'xy2bar', 'xedge', 'xedgey', 'yedge', 'yedgex',
]
print(letters.columns)

# Restrict the data to the two classes 'A' and 'B'.
letters = letters[letters['letter'].isin(['A', 'B'])]

# Sorted list of the class labels that remain.
order = sorted(letters['letter'].unique())
print(order)

# Per-class and overall feature means.
letter_means = letters.groupby('letter').mean()
letter_means.head()
letters.drop(columns='letter').mean().round(2)

# Feature matrix and label vector.
X = letters.drop(columns="letter")
y = letters['letter']

# Scale each feature to unit variance without centring, then hold out 30 %
# of the rows for testing.
X_scaled = scale(X, with_mean=False)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=101,
)

import random
# Sizes of the two splits.
print("Number of training samples = ", len(X_train))
print("Number of testing samples = ", len(X_test))

# --- Linear kernel -----------------------------------------------------------
model_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred = model_linear.predict(X_test)

print("Accuracy linear kernel without Markov Sampling : ", metrics.accuracy_score(y_test, y_pred), "\n")
print(confusion_matrix(y_test, y_pred))

# --- RBF kernel --------------------------------------------------------------
non_linear_model = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = non_linear_model.predict(X_test)

print("Accuracy rbf kernel without Markov Sampling : ", metrics.accuracy_score(y_test, y_pred), "\n")
print(confusion_matrix(y_test, y_pred))

# --- Polynomial kernel (rebinds `non_linear_model`) --------------------------
non_linear_model = SVC(kernel='poly').fit(X_train, y_train)
y_pred = non_linear_model.predict(X_test)

print("Accuracy polynomial kernel without Markov Sampling : ", metrics.accuracy_score(y_test, y_pred), "\n")
print(confusion_matrix(y_test, y_pred))

# --- Custom kernel passed as a callable: hellinger_kernel --------------------
model_hell = SVC(kernel=hellinger_kernel).fit(X_train, y_train)
y_pred = model_hell.predict(X_test)

print("Accuracy Hellinger kernel without Markov Sampling : ", metrics.accuracy_score(y_test, y_pred), "\n")
print(confusion_matrix(y_test, y_pred))

# Chi Squred SVM model

from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import AdditiveChi2Sampler

# Approximate the additive chi-squared kernel with an explicit feature map and
# train a linear SGD classifier on the mapped training features.
chi2sampler = AdditiveChi2Sampler(sample_steps=2)
X_transformed = chi2sampler.fit_transform(X_train, y_train)
clf = SGDClassifier(max_iter=100, random_state=0, tol=1e-3)
clf.fit(X_transformed, y_train)

# Score on the held-out split, like every other model in this notebook, and
# actually print the number: a bare `clf.score(...)` in the middle of a cell is
# evaluated and then thrown away, leaving the label with nothing after it.
X_test_transformed = chi2sampler.transform(X_test)
y_pred = clf.predict(X_test_transformed)

print("Accuracy Chi Squared kernel without Markov Sampling : ", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

# SVM with the callable `intersection_kernel`. The model is stored under the
# existing name `model_hell`, which therefore no longer refers to the
# Hellinger model after this point.
model_hell = SVC(kernel=intersection_kernel).fit(X_train, y_train)
y_pred = model_hell.predict(X_test)

print("Accuracy Intersection kernel without Markov Sampling : ", metrics.accuracy_score(y_test, y_pred), "\n")
print(confusion_matrix(y_test, y_pred))

import math
def markov_samp(X_train, Y_train, k = 5, q = 1.2, n_init = 800):
    """Select training rows with a Markov-chain style accept/reject walk.

    A random pool is drawn from the training data, a linear SVM is fitted on
    the full training data, and candidates drawn from the pool are appended to
    the result according to acceptance ratios built from the SVM's
    right/wrong predictions. The label ``'A'`` is treated as the positive
    class and every other label as the negative class.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature rows.
    Y_train : pandas.Series
        Labels aligned with ``X_train`` (``.to_numpy()`` is called on it).
    k : int, default 5
        Whenever the number of collected samples equals ``k`` at the check
        point of an iteration, ``Pdd = q * P`` is recorded and the current
        candidate is appended unconditionally.
    q : float, default 1.2
        Multiplier used to build ``Pdd``.
    n_init : int, default 800
        Size of the random pool. It is capped at the number of training rows
        so that small training sets no longer make ``random.sample`` raise.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array per collected sample: the feature values followed by the
        label in the last position. A candidate can be appended more than once
        in a single iteration, so duplicates are possible.

    Raises
    ------
    ValueError
        If the pool is empty or holds only one of the two classes; the
        sampling loop could never reach its stopping condition in that case.
    """
    # Glue features and labels into one table with the label in the last column.
    # NOTE(review): mixing float features with string labels appears to make
    # NumPy store every cell as text; the feature cells are therefore cast back
    # to float before they are handed to the classifier below.
    Init_sampl = np.concatenate((np.vstack(X_train), np.vstack(Y_train.to_numpy())), axis = 1)

    # Never request more rows than exist.
    Dtr = random.sample(list(Init_sampl), min(n_init, len(Init_sampl)))

    m = len(Dtr)
    print("M : " , m)
    if m == 0:
        raise ValueError("markov_samp needs a non-empty sampling pool")
    Dtra = np.array(Dtr)

    pool_labels = Dtra[:, -1]
    if not (np.any(pool_labels == 'A') and np.any(pool_labels != 'A')):
        # The loop below stops only once both class counters reach m/2.
        raise ValueError("markov_samp needs both 'A' and non-'A' rows in the sampling pool")

    # Counters of visited negative / positive samples.
    mneg = 0
    mplus = 0

    # Random starting point of the chain.
    index = np.random.choice(len(Dtr), 1, replace=False)
    zt = Dtra[index][0]

    print("zt : ", zt)
    # NOTE(review): the starting sample is only counted when m is even --
    # confirm that this is intended.
    if m%2 == 0:
        if zt[-1] == 'A':
            mplus += 1
        else:
            mneg += 1

    # Collected samples.
    samp = []

    # Reference classifier used to score candidates.
    model_linear = SVC(kernel='linear')
    model_linear.fit(X_train, Y_train)

    def _predict_and_loss(row):
        """Return (predicted label, loss level) for one pool row.

        The loss level is 1 when the prediction matches the row's label and 2
        otherwise.
        """
        features = np.array(row)[:-1].astype(float).reshape(1, -1)
        predicted = model_linear.predict(features)[0]
        level = 1 if predicted == row[-1] else 2
        return predicted, level

    # Loss of the starting sample.
    # NOTE(review): `fxy1` and `lzt` are computed once here and never refreshed
    # although `zt` advances on every iteration -- confirm against the
    # reference algorithm.
    _, fxy1 = _predict_and_loss(zt)
    lzt = math.exp(0-fxy1)

    # NOTE(review): Pd and Pdd keep their last value across iterations, so once
    # either exceeds 1 the final check of the loop appends every later
    # candidate -- confirm that this is intended.
    Pd = 0
    Pdd = 0

    while(mplus < m/2 or mneg < m/2 ):
        # Draw a candidate and score it.
        zstar = Dtra[np.random.choice(len(Dtr), 1, replace=False)][0]
        ystar, fxy = _predict_and_loss(zstar)
        lzstar = math.exp(0-fxy)
        P = math.exp(lzt-lzstar)

        # Remember the previous label, then move the chain to the candidate
        # (this happens whether or not the candidate ends up being collected).
        yt = zt[-1]
        zt = zstar

        # Accept / reject.
        if P == 1:
            if zt[-1] == yt:
                r = random.uniform(0.001, 1.0)
                if r <= P:
                    samp.append(zstar)
            else:
                ct = 1 if yt=='A' else -1
                cst = 1 if ystar=='A' else -1
                Pd = math.exp(ct*fxy1-cst*fxy)
                r = random.uniform(0.001, 1.0)
                if r <= Pd:
                    samp.append(zstar)

        if len(samp) == k:
            Pdd = q*P
            samp.append(zstar)

        # Count the label of the sample the chain just left.
        if yt == 'A':
            mplus += 1
        else:
            mneg += 1

        if P > 1 or Pd > 1 or Pdd > 1:
            samp.append(zstar)
    return samp


# Seed both random generators used by markov_samp so that the sampled training
# set, and every accuracy reported below, is reproducible on a re-run.
MARKOV_SEED = 101
random.seed(MARKOV_SEED)
np.random.seed(MARKOV_SEED)

# Each returned row holds the feature values followed by the class label.
nsamp = np.array(markov_samp(X_train, y_train))

# markov_samp concatenates numeric features with string labels, so the feature
# cells may come back as text; cast them to float explicitly so that callable
# kernels receive numbers.
X_train = nsamp[:, :-1].astype(float)
y_train = nsamp[:, -1]
print("Shape of Markov Sample features : ", X_train.shape)
print("Shape of Markov Sample labels : ", y_train.shape)

# Encode the sampled single-character labels as their code points for sklearn.
Y_train = [ord(label) for label in y_train]

print("", X_test.shape)

# Encode the held-out labels the same way so they are comparable with the
# predictions of models trained on Y_train.
Y_test = [ord(label) for label in y_test]


# Same four SVMs as before, now fitted on the Markov-sampled rows and scored
# against the code-point encoded test labels.

# --- Linear kernel -----------------------------------------------------------
model_linear = SVC(kernel='linear').fit(X_train, Y_train)
y_pred = model_linear.predict(X_test)

print("Accuracy Linear kernel with Markov Sampling : ", metrics.accuracy_score(Y_test, y_pred), "\n")
print(confusion_matrix(Y_test, y_pred))

# --- RBF kernel --------------------------------------------------------------
non_linear_model = SVC(kernel='rbf').fit(X_train, Y_train)
y_pred = non_linear_model.predict(X_test)

print("Accuracy rbf kernel with Markov Sampling : ", metrics.accuracy_score(Y_test, y_pred), "\n")
print(confusion_matrix(Y_test, y_pred))

# --- Polynomial kernel (rebinds `non_linear_model`) --------------------------
non_linear_model = SVC(kernel='poly').fit(X_train, Y_train)
y_pred = non_linear_model.predict(X_test)

print("Accuracy polynomial kernel with Markov Sampling : ", metrics.accuracy_score(Y_test, y_pred), "\n")
print(confusion_matrix(Y_test, y_pred))

# --- Custom kernel passed as a callable: hellinger_kernel --------------------
model_hell = SVC(kernel=hellinger_kernel).fit(X_train, Y_train)
y_pred = model_hell.predict(X_test)

print("Accuracy Hellinger kernel with Markov Sampling : ", metrics.accuracy_score(Y_test, y_pred), "\n")
print(confusion_matrix(Y_test, y_pred))

# Chi-squared feature map + linear SGD classifier on the Markov-sampled rows.
# The builtin `float` replaces the `np.float` alias, which no longer exists in
# current NumPy releases.
chi2sampler = AdditiveChi2Sampler(sample_steps=2)
X_transformed = chi2sampler.fit_transform(X_train.astype(float), pd.Series(y_train))
clf = SGDClassifier(max_iter=100, random_state=0, tol=1e-3)
clf.fit(X_transformed, pd.Series(y_train))

# Score on the held-out split and print the value; a bare `clf.score(...)` in
# the middle of a cell is evaluated and then thrown away. The classifier was
# trained on the character labels, so it is compared against `y_test`.
X_test_transformed = chi2sampler.transform(X_test)
y_pred = clf.predict(X_test_transformed)

print("Accuracy Chi Squared kernel with Markov Sampling : ", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

# SVM with the callable `intersection_kernel` on the Markov-sampled rows. The
# model is stored under the existing name `model_hell`, replacing the Hellinger
# model bound to it earlier.
model_hell = SVC(kernel=intersection_kernel).fit(X_train, Y_train)
y_pred = model_hell.predict(X_test)

print("Accuracy Intersection kernel with Markov Sampling : ", metrics.accuracy_score(Y_test, y_pred), "\n")
print(confusion_matrix(Y_test, y_pred))

Dimensions:  (20000, 17) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lettr   20000 non-null  object
 1   x-box   20000 non-null  int64 
 2   y-box   20000 non-null  int64 
 3   width   20000 non-null  int64 
 4   high    20000 non-null  int64 
 5   onpix   20000 non-null  int64 
 6   x-bar   20000 non-null  int64 
 7   y-bar   20000 non-null  int64 
 8   x2bar   20000 non-null  int64 
 9   y2bar   20000 non-null  int64 
 10  xybar   20000 non-null  int64 
 11  x2ybr   20000 non-null  int64 
 12  xy2br   20000 non-null  int64 
 13  x-ege   20000 non-null  int64 
 14  xegvy   20000 non-null  int64 
 15  y-ege   20000 non-null  int64 
 16  yegvx   20000 non-null  int64 
dtypes: int64(16), object(1)
memory usage: 2.6+ MB
None
Index(['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar',
       'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy