In [24]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
from numpy.linalg import inv

In [25]:
os.chdir('/Users/sreelakshmirajendrakumar/Downloads')

dataset = pandas.read_csv('LDA_1.csv')
y_data=dataset.iloc[:,0]
y_data=y_data.to_numpy() # shape (4999,)
X_data=dataset.iloc[:,1:]
X_data=X_data.to_numpy() # shape (4999,10)

#appending the missing row from the dataset
firstValues=list(dataset.columns.values)
for i in range(0,11):
    firstValues[i]=float(firstValues[i])
X_data=np.concatenate(([firstValues[1:]],X_data),axis=0) # shape (5000,10)
y_data=np.append(firstValues[0],y_data) #shape (5000,)
y_data=y_data.astype(int)


In [26]:
# when N=20
X_test20 = X_data[0:5000-20] # shape (4980,10)
X_train20 = X_data[5000-20:] # shape (20,10)
y_test20 = y_data[0:5000-20] # shape (4980,)
y_train20 = y_data[5000-20:] # shape (20,)

# when N=50
X_test50 = X_data[0:5000-50] # shape (4950,10)
X_train50 = X_data[5000-50:] # shape (50,10)
y_test50 = y_data[0:5000-50] # shape (4950,)
y_train50 = y_data[5000-50:] # shape (50,)

# when N=100
X_test100 = X_data[0:5000-100] # shape (4900,10)
X_train100 = X_data[5000-100:] # shape (100,10)
y_test100 = y_data[0:5000-100] # shape (4900,)
y_train100 = y_data[5000-100:] # shape (100,)

# when N=200
X_test200 = X_data[0:5000-200] # shape (4800,10)
X_train200 = X_data[5000-200:] # shape (200,10)
y_test200 = y_data[0:5000-200] # shape (4800,)
y_train200 = y_data[5000-200:] # shape (200,)

# when N=1000
X_test1000 = X_data[0:5000-1000] # shape (4000,10)
X_train1000 = X_data[5000-1000:] # shape (1000,10)
y_test1000 = y_data[0:5000-1000] # shape (4000,)
y_train1000 = y_data[5000-1000:] # shape (1000,)

# when N=2500
X_test2500 = X_data[0:5000-2500] # shape (2500,10)
X_train2500 = X_data[5000-2500:] # shape (2500,10)
y_test2500 = y_data[0:5000-2500] # shape (2500,)
y_train2500 = y_data[5000-2500:] # shape (2500,)



In [27]:
def meanX(X): # X is an Nxd (or N_ixd) matrix
    # finding the mean over the samples in the array given as input
    X=np.array(X)
    mean=[]
    (N,d)=np.shape(X)
    for i in range(0,d):
        temp=((np.sum(X[:,i]))/N)
        mean.append([temp])
    return mean # dx1 matrix

def separateClass(X,y):
    # separates the main array into classes so that the length of this array
    #will be the number of classes, shape of each element in this array will 
    #be (N_i,d) where N_i is the number of samples in  i^th class
    (N,d)=np.shape(X)
    q=np.max(y)-np.min(y)+1 # the number of classes
    x_separated=[]
    for i  in range(0,q):
        x_separated.append([])
    for c in range(1,q+1):
        for i in range(0,N):
            if y[i]==c:
                x_separated[c-1].append(X[i])
    return x_separated

def meanDiff(X,y):
    #gets the matrix (mu_g-mu)
    X=np.array(X)
    (N,d)=np.shape(X)
    q=np.max(y)-np.min(y)+1
    x_separated=separateClass(X,y)
    theMean=meanX(X)
    theMean=np.array(theMean).T[0]
    means=[] # to collect the mean for each class
    for i in range(0,q):
        temp=np.array(meanX(x_separated[i])).T
        means.append(temp[0])
    meanDifference=means-theMean
    xMeanDifference=np.array(X)-theMean
        
    return meanDifference.T,xMeanDifference.T # shapes (d,q), (d,N) ==> this will go as the input for finding the gamma values
    
def withinClassScatter(X,y): # input: x_separated
    (N,d)=np.shape(X)
    q=np.max(y)-np.min(y)+1
    x_separated=separateClass(X,y)
    withinScatter=np.zeros((d,d))
    for i in range(0,q):
        x_separated[i]=np.array(x_separated[i]) # 1xd matrix
        temp=x_separated[i].T # dx1 matrix
        temp1=meanX(x_separated[i]) #dx1 matrix
        temp2=np.subtract(temp,temp1)
        temp3=np.matmul(temp2,temp2.T) # dxd matrix
        withinScatter+=temp3
    return withinScatter/N

def betweenClassScatter(X,y):
    (N,d)=np.shape(X)
    q=np.max(y)-np.min(y)+1
    x_separated=separateClass(X,y)
    x_mean=meanX(X) # dx1 matrix
    x_separatedMeans=[]
    theCount=[]
    betweenScatter=np.zeros((d,d))
    for i in range(0,q):
        count=0
        x_separatedMeans.append(meanX(x_separated[i])) # this will be a matrix with length q when the for loop ends
        for j in range(0,len(x_separated[i])):
            count+=1
        theCount.append(count) # array that has the number of elements of each class
    
    for i in range(0,q):
        temp=np.subtract(x_separatedMeans[i],x_mean)
        temp2=np.matmul(temp,temp.T)
        betweenScatter+=temp2
    return betweenScatter/N # dxd matrix


def LDATrain(X,y):
    (N,d)=np.shape(X)
    globalAverage=meanX(X)
    q=np.max(y)-np.min(y)+1
    r=q-1
    withinScatter=withinClassScatter(X,y)
    betweenScatter=betweenClassScatter(X,y)
    temp=np.linalg.inv(withinScatter)
    temp1=np.matmul(temp,betweenScatter)
    tempWeight,tempVec=np.linalg.eig(temp1) # tempVec[:,i] is the eigenvector for tempWeight[i]
    # sorting step
    tempWeightSorted=sorted(tempWeight,reverse=True)
    tempVector=tempVec.T
    tempVecSorted=[]
    for i in range(0,len(tempWeight)):
        for j in range(0,len(tempWeight)):
            if tempWeightSorted[i]==tempWeight[j]:
                tempVecSorted.append(tempVector[j])
    eigenVectors=tempVecSorted[0:r] # first r eigen vectors
    eigenVectors=np.array(eigenVectors).T 
    # computing (mu_g-mu).T x e_j
    difference,x_difference=meanDiff(X,y) # dxq and dxN
    difference=np.array(difference).T
    x_difference=np.array(x_difference).T
    m_c=[] # here the first row (for both xm_c and m_c) will have the multiplication by the first eigen vector, second will have that by the second vector, and so on    
    for i in range(0,r):
        m_c.append([])
        for g in range(0,q):
            temp3=np.matmul(difference[g],eigenVectors.T[i])
            m_c[i].append(temp3)
        #m_c=np.matmul(difference.T,eigenVectors)
    
    return eigenVectors,globalAverage,m_c # dxr ,dx1, rxq, rxN


def LDAClassify(X,y,X_test,y_test):
    predictedY=[]
    (N,d)=np.shape(X)
    (N_test,d_test)=np.shape(X_test)
    q=np.max(y)-np.min(y)+1
    r=q-1
    x_separated=separateClass(X,y)
    eigenVectors,globalAverage,m_c=LDATrain(X,y)
    pred=[]
    xm_c=[]
    difference,x_difference=meanDiff(X_test,y_test) # dxq and dxN
    difference=np.array(difference).T
    x_difference=np.array(x_difference).T
    for i in range(0,r):
        xm_c.append([])
        for j in range(0,N_test):
            temp2=np.matmul(x_difference[j],eigenVectors.T[i])
            xm_c[i].append(temp2) # xm_c is now an rxN matrix
    for i in range(0,N_test):
        toPred=[]
        for g in range(0,q):
            pi=len(x_separated[g]) # getting the value of pi
            temp=0
            temp1=0
            for j in range(0,r):
                temp+=xm_c[j][i]*m_c[j][g]
                temp1+=m_c[j][g]**2
            temp1/=2
            temp2=temp-temp1+np.log(pi)
            toPred.append(temp2)
        ind=np.argmax(toPred)
        pred.append(ind)
    return np.array(pred)+1           



In [28]:
'''Applying LDA when N=20'''
yPredict_20=LDAClassify(X_train20,y_train20,X_test20,y_test20)

'''Applying LDA when N=50'''
yPredict_50=LDAClassify(X_train50,y_train50,X_test50,y_test50)

'''Applying LDA when N=100'''
yPredict_100=LDAClassify(X_train100,y_train100,X_test100,y_test100)

'''Applying LDA when N=200'''
yPredict_200=LDAClassify(X_train200,y_train200,X_test200,y_test200)

'''Applying LDA when N=1000'''
yPredict_1000=LDAClassify(X_train1000,y_train1000,X_test1000,y_test1000)

'''Applying LDA when N=2500'''
yPredict_2500=LDAClassify(X_train2500,y_train2500,X_test2500,y_test2500)
# Provide classification results (in the test set and the training set) when 
#the training set contains the N first samples in the file, and the test set 
#contains the 5000 − N remaining ones, with N = 20, 50, 100, 200, 1000, 2500

In [29]:
print('The predicted classes for the test set when N=20 are:\n')
print(yPredict_20)
print('The true classes for the test set when N=20 are:\n')
print(y_test20)
print('The difference is given by:\n')
print(yPredict_20-y_test20)
print('\nThere are %d misclassified sample(s) here'%len(np.nonzero((yPredict_20-y_test20))[0]))

The predicted classes for the test set when N=20 are:

[1 1 2 ... 3 3 1]
The true classes for the test set when N=20 are:

[1 1 2 ... 2 3 1]
The difference is given by:

[0 0 0 ... 1 0 0]

There are 715 misclassified sample(s) here


In [30]:
print('The predicted classes for the test set when N=50 are:\n')
print(yPredict_50)
print('The true classes for the test set when N=50 are:\n')
print(y_test50)
print('The difference is given by:\n')
print(yPredict_50-y_test50)
print('\nThere are %d misclassified sample(s) here'%len(np.nonzero((yPredict_50-y_test50))[0]))

The predicted classes for the test set when N=50 are:

[1 1 2 ... 3 2 2]
The true classes for the test set when N=50 are:

[1 1 2 ... 3 2 2]
The difference is given by:

[0 0 0 ... 0 0 0]

There are 289 misclassified sample(s) here


In [31]:
print('The predicted classes for the test set when N=100 are:\n')
print(yPredict_100)
print('The true classes for the test set when N=100 are:\n')
print(y_test100)
print('The difference is given by:\n')
print(yPredict_100-y_test100)

print('\nThere are %d misclassified sample(s) here'%len(np.nonzero(abs(yPredict_100-y_test100))[0]))

The predicted classes for the test set when N=100 are:

[1 1 2 ... 3 2 2]
The true classes for the test set when N=100 are:

[1 1 2 ... 3 2 2]
The difference is given by:

[0 0 0 ... 0 0 0]

There are 178 misclassified sample(s) here


In [32]:
print('The predicted classes for the test set when N=200 are:\n')
print(yPredict_200)
print('The true classes for the test set when N=200 are:\n')
print(y_test200)
print('The difference is given by:\n')
print(yPredict_200-y_test200)
print('\nThere are %d misclassified sample(s) here'%len(np.nonzero((yPredict_200-y_test200))[0]))

The predicted classes for the test set when N=200 are:

[1 1 2 ... 3 1 2]
The true classes for the test set when N=200 are:

[1 1 2 ... 3 1 2]
The difference is given by:

[0 0 0 ... 0 0 0]

There are 121 misclassified sample(s) here


In [33]:
print('The predicted classes for the test set when N=1000 are:\n')
print(yPredict_1000)
print('The true classes for the test set when N=1000 are:\n')
print(y_test1000)
print('The difference is given by:\n')
print(yPredict_1000-y_test1000)
print('\nThere are %d misclassified sample(s) here'%len(np.nonzero((yPredict_1000-y_test1000))[0]))

The predicted classes for the test set when N=1000 are:

[1 1 2 ... 1 3 3]
The true classes for the test set when N=1000 are:

[1 1 2 ... 1 3 3]
The difference is given by:

[0 0 0 ... 0 0 0]

There are 84 misclassified sample(s) here


In [36]:
print('The predicted classes for the test set when N=2500 are:\n')
print(yPredict_2500)
print('The true classes for the test set when N=2500 are:\n')
print(y_test2500)
print('The difference is given by:\n')
print(yPredict_2500-y_test2500)
print('\nThere are %d misclassified sample(s) here'%len(np.nonzero((yPredict_2500-y_test2500))[0]))

The predicted classes for the test set when N=2500 are:

[1 1 2 ... 1 2 3]
The true classes for the test set when N=2500 are:

[1 1 2 ... 1 2 3]
The difference is given by:

[0 0 0 ... 0 0 0]

There are 47 misclassified sample(s) here
