#K-Markov Sampling for Letters dataset

In [None]:
# Import all dependencies
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale

In [None]:
# mount drive for easy import and export of data
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the letter-recognition dataset and name its 17 columns:
# the target 'letter' followed by 16 feature columns.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of the file is consumed as a header row - confirm the CSV
# actually has one, otherwise the first record is lost.
letters = pd.read_csv("letter-recognition.csv")
letters.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar','ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge','xedgey', 'yedge', 'yedgex']

Step-I

In [None]:
# Set up the sampling state shared by the cells below
markov = pd.DataFrame(columns=letters.columns)        # empty frame with the dataset's columns
uniqChar = list(np.sort(letters['letter'].unique()))  # sorted distinct class labels
classCNT = len(uniqChar)
limit = 100                                           # per-class sample count checked by exist()
m = classCNT * limit
# class label -> its position in the sorted label list
charNo = {label: position for position, label in enumerate(uniqChar)}
# class label -> number of samples taken so far
mAZ = dict.fromkeys(uniqChar, 0)

In [None]:
# Choose parameters for markov sampling
# k:   value `acc` is compared against in the sampling loop; when acc==k the
#      q-scaled probability is used and acc is reset
# q:   factor applied to the ratio p in that acc==k case
# acc: running counter of accepted samples since the last reset
k=5
q=1.2
acc=0

In [None]:
# Train a linear model on an N-sized [here 2000 = 10% of the rows] train set
X = letters.drop("letter", axis = 1)
y = letters['letter']

# Standardised copy of the features. It is kept so the name stays defined,
# but the model below is deliberately NOT fitted on it: every later
# model_linear.predict(...) call in this notebook passes raw rows taken
# straight from `letters`, and a model fitted on standardised features
# cannot score raw feature values meaningfully.
X_scaled = scale(X)

# train test split on the raw features (same feature space as train() and
# the final SVM section); a plain array is used because the later predict()
# calls also pass plain arrays
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y, test_size = 0.9, random_state = 101)
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

# predict on the held-out 90%
y_pred = model_linear.predict(X_test)

In [None]:
y_pred

array(['P', 'E', 'M', ..., 'Q', 'F', 'X'], dtype=object)

Step-II

In [None]:
# Choosing a random sample as the first state of the markov chain
i=np.random.randint(letters.shape[0])
z0=letters.iloc[i]
# the 'letter' entry is dropped so only the feature values reach predict()
y0=model_linear.predict(np.array([z0.drop('letter')]))[0]
# m was set to classCNT*limit, so this condition always holds and the
# counter of the sampled class is always incremented
if m%classCNT==0:
    mAZ[z0['letter']]+=1

In [None]:
# Copy the initial sample field by field into a plain dict, printing each field
d={}
for i,val in z0.items():
    print(i,val)
    d[i]=val
# NOTE(review): the value returned by append() is not assigned, so `markov`
# itself is not modified by this line (the frame displayed below is empty).
markov.append(d,ignore_index=True)
markov

letter K
xbox 8
ybox 13
width 8
height 7
onpix 4
xbar 9
ybar 7
x2bar 3
y2bar 9
xybar 7
x2ybar 3
xy2bar 6
xedge 5
xedgey 4
yedge 8
yedgex 8


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex


In [None]:
# Collects one [true letter, predicted letter, probability] entry per accepted sample
predProb=[]

In [None]:
# Utility Function for loop condition
def exist(dic,limit):
    """Return True while at least one value in ``dic`` is still below ``limit``.

    Used as the loop condition of the sampler: ``dic`` maps each class to the
    number of samples taken so far, and sampling continues until every class
    has reached ``limit``. An empty ``dic`` yields False.
    """
    # Only the counts matter, so iterate the values and let any() short-circuit
    return any(count<limit for count in dic.values())

In [None]:
# Utility loss Function
def lossF(actual,pred):
    """Return 1.0 when ``pred`` equals ``actual``, otherwise ``exp(-2)``."""
    return 1.0 if actual==pred else np.exp(-2)

In [None]:
# Utility Function for getting class index
def getNo(ch):
    """Return the integer stored for class label ``ch`` in the global ``charNo`` map.

    Raises KeyError if ``ch`` is not a key of ``charNo``.
    """
    return charNo[ch]

In [None]:
# Utility function for training subsequent models
def train(data):
    """Fit and return a linear-kernel SVC on a fixed split of ``data``.

    ``data`` must have a "letter" column, used as the target; every other
    column is used as a feature. The frame is split with ``test_size=0.8``
    and ``random_state=101``; only the training part of that split is used
    for fitting, the hold-out part is discarded.
    """
    features = data.drop("letter", axis = 1)
    labels = data['letter']

    # Keep just the training halves of the split; the test halves are unused
    fit_X, _, fit_y, _ = train_test_split(features, labels, test_size = 0.8, random_state = 101)

    # SVC.fit returns the fitted estimator itself
    return SVC(kernel='linear').fit(fit_X, fit_y)

In [None]:
# Row numbers of the samples accepted into the current markov chain
lst=[]

Step-III TO Step-VII

In [None]:
# Generate two markov chains back to back: the first is sampled with the
# current model, the second with a model retrained on the first chain.
for km in range(2):
    # Reset the per-chain state; the model is refitted on the chain just built
    if km!=0:
        predProb=[]
        model_linear=train(markov)
        mAZ={i:0 for i in uniqChar}
    lst=[]
    drawn=set()  # same row numbers as lst, kept for O(1) "already sampled" checks

    # Choosing a random sample as the first state of the markov chain
    i=np.random.randint(letters.shape[0])
    z0=letters.iloc[i]
    y0=model_linear.predict(np.array([z0.drop('letter')]))[0]
    # m is classCNT*limit, so this condition always holds
    if m%classCNT==0:
        mAZ[z0['letter']]+=1
    print("Entering...")
    while exist(mAZ,limit):
        # Draw a random candidate that is not yet part of this chain
        i=np.random.randint(letters.shape[0])
        while i in drawn:
            i=np.random.randint(letters.shape[0])
        z1=letters.iloc[i]
        y1=model_linear.predict(np.array([z1.drop('letter')]))[0]
        # Ratio of the candidate's loss term to the current state's loss term
        p=lossF(z1['letter'],y1)/lossF(z0['letter'],y0)

        # Decide whether the candidate is accepted and which probability is
        # recorded for it
        if acc==k:
            # every k-th acceptance uses the q-scaled ratio, capped at 1
            acc=0
            accept_p=min(q*p,1)
        elif p==1 and z0['letter']==z1['letter']:
            # tie within the same class: fall back to the class-index based ratio
            n=np.exp(-getNo(y1)*getNo(z1['letter']))
            d=np.exp(-getNo(y0)*getNo(z0['letter']))
            accept_p=min(n/d,1)
        elif p<=1:
            # p<1, or p==1 with a different class: record p itself
            accept_p=p
        else:
            # NOTE(review): candidates with p>1 are skipped, exactly as in the
            # previous control flow (no branch handled them). A Metropolis-style
            # rule would normally accept these - confirm the intent.
            continue

        predProb.append([z1['letter'],y1,accept_p])
        # The candidate becomes the current state. Its prediction must move
        # with it, otherwise the next ratio compares the new state's letter
        # against the prediction made for the very first sample.
        z0,y0=z1,y1
        mAZ[z1['letter']]+=1
        acc+=1
        lst.append(i)
        drawn.add(i)

    # Build the chain frame once from the accepted row numbers (same rows, same
    # order and index labels as appending them one by one). DataFrame.append
    # no longer exists in pandas 2.x and copied the whole frame on every call.
    markov=letters.iloc[lst].copy()

Entering...
Entering...


In [None]:
markov

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
17428,O,6,10,7,7,5,8,6,8,10,6,6,9,4,5,8,5
8681,Z,4,6,5,4,3,6,9,3,11,8,9,6,2,6,9,5
1092,K,8,15,8,8,5,3,8,4,10,6,11,11,5,4,11,7
16779,R,3,10,4,7,3,6,9,11,7,5,5,8,3,5,8,10
1947,E,4,7,6,5,4,6,8,3,11,7,8,8,3,4,8,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14120,K,5,11,7,8,9,7,6,3,6,5,6,8,7,7,7,7
8158,S,7,10,8,7,4,8,8,5,11,10,2,8,2,5,5,10
3836,X,4,4,6,3,3,5,8,2,11,9,10,9,3,3,7,6
18434,A,3,4,5,3,2,10,2,3,9,1,2,9,2,2,6,8


In [None]:
predProb

[['O', 'D', 1.0],
 ['Z', 'S', 1.0],
 ['K', 'C', 1.0],
 ['R', 'H', 1.0],
 ['E', 'E', 1],
 ['B', 'I', 1.0],
 ['B', 'B', 1.0],
 ['W', 'W', 1.0],
 ['B', 'R', 1.0],
 ['O', 'O', 1],
 ['O', 'Q', 6.282880511239462e-92],
 ['K', 'A', 1.0],
 ['M', 'U', 1.0],
 ['N', 'D', 1.0],
 ['W', 'W', 1],
 ['Q', 'O', 1.0],
 ['P', 'F', 1.0],
 ['X', 'R', 1.0],
 ['L', 'Q', 1.0],
 ['S', 'X', 1],
 ['X', 'B', 1.0],
 ['O', 'H', 1.0],
 ['S', 'Z', 1.0],
 ['J', 'I', 1.0],
 ['R', 'R', 1],
 ['H', 'K', 1.0],
 ['B', 'V', 1.0],
 ['A', 'A', 1.0],
 ['P', 'F', 1.0],
 ['W', 'W', 1],
 ['Q', 'O', 1.0],
 ['J', 'F', 1.0],
 ['F', 'P', 1.0],
 ['D', 'N', 1.0],
 ['W', 'W', 1],
 ['H', 'R', 1.0],
 ['R', 'N', 1.0],
 ['O', 'P', 1.0],
 ['S', 'I', 1.0],
 ['W', 'W', 1],
 ['F', 'I', 1.0],
 ['U', 'N', 1.0],
 ['H', 'J', 1.0],
 ['Y', 'V', 1.0],
 ['O', 'O', 1],
 ['Q', 'E', 1.0],
 ['S', 'J', 1.0],
 ['P', 'G', 1.0],
 ['K', 'R', 1.0],
 ['J', 'J', 1],
 ['I', 'D', 1.0],
 ['L', 'I', 1.0],
 ['M', 'U', 1.0],
 ['Y', 'V', 1.0],
 ['C', 'C', 1],
 ['X', 'K', 1.

Save data from generated markov chain

In [None]:
# Write the sampled chain (index included, as per to_csv defaults) to disk
markov.to_csv("KmarkovSamplesLetters.csv")

In [None]:
# Pull the recorded probability (third field) out of every predProb entry
prob=[entry[2] for entry in predProb]

In [None]:
# Attach the recorded probabilities as a new column; prob must have exactly
# one entry per row of markov
markov['probability']=prob

In [None]:
# Write the chain again, now including the 'probability' column
markov.to_csv("KmarkovSamplesLettersProbability.csv")

In [None]:
# Remove every sampled row from the dataset in one call (dropping them one at
# a time copies the whole frame for each row) and save what is left.
letters=letters.drop(lst)
letters.to_csv('KmarkovLettersRemaining.csv')

# SVM

In [None]:
train = pd.read_csv("KmarkovSamplesLetters.csv")
test = pd.read_csv("KmarkovLettersRemaining.csv")

In [None]:
train = train.drop(train.columns[[0]], axis=1)
test = test.drop(test.columns[[0]], axis=1)
train.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar','ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge','xedgey', 'yedge', 'yedgex']
test.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar','ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge','xedgey', 'yedge', 'yedgex']

In [None]:
X_train = train.drop("letter", axis = 1)
y_train = train["letter"]

X_test = test.drop("letter", axis = 1)
y_test = test["letter"]

Linear kernel

In [None]:
# Fit an SVC with a linear kernel on the sampled chain
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

# predict on the rows that were not sampled and report the accuracy

y_pred = model_linear.predict(X_test)
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

accuracy: 0.8121266619564519 



RBF kernel

In [None]:
# Fit an SVC with an RBF kernel (the variable keeps the name model_linear
# even though the kernel is not linear here)
model_linear = SVC(kernel='rbf')
model_linear.fit(X_train, y_train)

# predict on the rows that were not sampled and report the accuracy
y_pred = model_linear.predict(X_test)
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

accuracy: 0.8694842314856446 



Chi-squared kernel

In [None]:
from sklearn.metrics.pairwise import chi2_kernel

# Fit an SVC that uses scikit-learn's chi2_kernel function as a callable kernel
model_linear = SVC(kernel=chi2_kernel)
model_linear.fit(X_train, y_train)

# predict on the rows that were not sampled and report the accuracy
y_pred = model_linear.predict(X_test)
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

accuracy: 0.8495728691630805 



Hellinger kernel

In [None]:
def hellinger(X1, X2):
    """Hellinger kernel between the rows of ``X1`` and the rows of ``X2``.

    K(x, y) = sum_i sqrt(x_i * y_i), i.e. the dot product of the element-wise
    square roots. ``X1`` is (n1, m) and ``X2`` is (n2, m); the result is the
    (n1, n2) kernel matrix. All entries must be non-negative.

    The previous version returned sqrt(dot(X1, X2.T)), i.e. the square root of
    the linear kernel, which is a different function.
    """
    # Take the square roots first, then a single matrix product gives the sum
    # over features for every pair of rows.
    return np.dot(np.sqrt(np.asarray(X1, dtype=float)), np.sqrt(np.asarray(X2, dtype=float)).T)
   

# Fit an SVC that uses the hellinger() function above as a callable kernel
model_linear = SVC(kernel=hellinger)
model_linear.fit(X_train, y_train)

# predict on the rows that were not sampled and report the accuracy
y_pred = model_linear.predict(X_test)
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

accuracy: 0.709229879889524 



Intersection kernel

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

def intersection(X1,X2):
    """Histogram-intersection kernel between the rows of ``X1`` and ``X2``.

    K(x, y) = sum_k min(x_k, y_k).

    X1 is (n1, m), X2 is (n2, m); the result is the (n1, n2) kernel matrix.

    The previous version took the minimum over k of the products x_k * y_k,
    which is not the intersection kernel, and did so in a triple Python loop.
    """
    X1 = np.asarray(X1, dtype=float)
    X2 = np.asarray(X2, dtype=float)
    result = np.zeros((X1.shape[0], X2.shape[0]))

    # Accumulate one feature at a time: each step is a vectorised (n1, n2)
    # pairwise minimum, so no (n1, n2, m) intermediate is ever materialised.
    for feature in range(X1.shape[1]):
        result += np.minimum.outer(X1[:, feature], X2[:, feature])

    return result
 

# Fit an SVC that uses the intersection() function above as a callable kernel
model_linear = SVC(kernel=intersection)
model_linear.fit(X_train, y_train)

# predict on the rows that were not sampled and report the accuracy
y_pred = model_linear.predict(X_test)
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

# Taking too much time.

accuracy: 0.01920483011111825 

