In [2]:
from sklearn.cluster import KMeans
import numpy as np
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt

In [3]:
maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03 #Lambda value (one of the hyper parameter)
TrainingPercent = 80 #80% of the data is taken as the training dataset
ValidationPercent = 10  #10% of the data is taken as the validation dataset
TestPercent = 10 #The remaining 10% of the data is taken as the testing dataset
M = 13 #no of clusters to be formed
PHI = [] #The basis function matrix
IsSynthetic = False

In [4]:
def GetTargetVector(filePath):
    #this function retrieves the target value vector (x) from the .csv file
    t = []
    with open(filePath, 'rU') as f:
        reader = csv.reader(f)
        for row in reader:  
            t.append(int(row[0]))
    #print("Raw Training Generated..")
    return t

def GenerateRawData(filePath, IsSynthetic):  
     #this function retrieves the input values (x) from the .csv file   
    dataMatrix = [] 
    with open(filePath, 'rU') as fi:
        reader = csv.reader(fi)
        for row in reader:
            dataRow = []
            for column in row:
                dataRow.append(float(column))
            dataMatrix.append(dataRow)   
    
    if IsSynthetic == False : #issynthetic is a variable that is created by the compiler that doesnt involve in the actual code
        dataMatrix = np.delete(dataMatrix, [5,6,7,8,9], axis=1)
    dataMatrix = np.transpose(dataMatrix)     
    #print ("Data Matrix Generated..")
    return dataMatrix

def GenerateTrainingTarget(rawTraining,TrainingPercent = 80):
    #this function divides the target value vector by 80% to the training target vector
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    #print(str(TrainingPercent) + "% Training Target Generated..")
    return t

def GenerateTrainingDataMatrix(rawData, TrainingPercent = 80):
     #this function divides the input values x by 80% to the training data
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    d2 = rawData[:,0:T_len]
    #print(str(TrainingPercent) + "% Training Data Generated..")
    return d2

def GenerateValData(rawData, ValPercent, TrainingCount): 
    #this function divides the input values x by 10% to the validation data
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Data Generated..")  
    return dataMatrix

def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    #this function divides the target vector t by 10% to the validation target vector
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize
    t =rawData[TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Target Data Generated..")
    return t

def GenerateBigSigma(Data, MuMatrix,TrainingPercent,IsSynthetic):
    # this function computes the Σ(x−μ) part of the basis function which is basically the variance computation
    BigSigma    = np.zeros((len(Data),len(Data)))
    DataT       = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01)) #converting percentage to integer 
    #taking the length of the dataset (80% for training, 10% for validation, 10% for the testing)
    varVect     = []
    for i in range(0,len(DataT[0])):
        vct = []
        for j in range(0,int(TrainingLen)):
            vct.append(Data[i][j])    
        varVect.append(np.var(vct))
    
    for j in range(len(Data)):
        BigSigma[j][j] = varVect[j]
        #the bigsigma matrix consists of the variance values only on the diagonal of the matrix where the remaining values are 0
    if IsSynthetic == True:
        BigSigma = np.dot(3,BigSigma)
    else:
        BigSigma = np.dot(200,BigSigma)
    ##print ("BigSigma Generated..")
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):  
    #this function computes the  (x−μ)TΣ-1(x−μ) part of the basis function by subtracting the mean from the value x and taking the transpose of the resultant row 
    # and multiplying with the row of the inverse of bigsigma i.e. the result from the GenerateBigSigma function
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    #this function returns the value of the gradiant radial basis function
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    #this function creates as PHI-Matrix which is a matrix consisting of the basis function values as its elements
    DataT = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))         
    PHI = np.zeros((int(TrainingLen),len(MuMatrix))) 
    BigSigInv = np.linalg.inv(BigSigma)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv) #φ(x) calculation
    #print ("PHI Generated..")
    return PHI

def GetWeightsClosedForm(PHI, T, Lambda):
    #this function calculates the W = (λI+ØT Ø)-1 ØTt equation

    Lambda_I = np.identity(len(PHI[0])) # lambds identity matrix (λI)
    for i in range(0,len(PHI[0])):
        Lambda_I[i][i] = Lambda
    PHI_T       = np.transpose(PHI)    #transpose of Ø
    PHI_SQR     = np.dot(PHI_T,PHI)    # equation ØT Ø
    PHI_SQR_LI  = np.add(Lambda_I,PHI_SQR)#λI+ØT Ø
    PHI_SQR_INV = np.linalg.inv(PHI_SQR_LI) # inverse of λI+ØT Ø
    INTER       = np.dot(PHI_SQR_INV, PHI_T) # (λI+ØT Ø)-1 ØT
    W           = np.dot(INTER, T) #final W value
    ##print ("Training Weights Generated..")
    return W

def GetValTest(VAL_PHI,W):
    #y(x,w) = wT * φ(x) calculation which is the linear regression function
    Y = np.dot(W,np.transpose(VAL_PHI))    # wT * φ(x) calculation
    ##print ("Test Out Generated..")
    return Y

def GetErms(VAL_TEST_OUT,ValDataAct):
    #calculation of the root mean square error defined as
    # ERMS = 􏰆pow(2E(w∗)/NV,2)
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

## Fetch and Prepare Dataset

In [5]:
RawTarget = GetTargetVector('Querylevelnorm_t.csv')
RawData   = GenerateRawData('Querylevelnorm_X.csv',IsSynthetic)
print(RawData)

  after removing the cwd from sys.path.


FileNotFoundError: [Errno 2] No such file or directory: 'Querylevelnorm_t.csv'

## Prepare Training Data

In [16]:
TrainingTarget = np.array(GenerateTrainingTarget(RawTarget,TrainingPercent))
TrainingData   = GenerateTrainingDataMatrix(RawData,TrainingPercent)
print(TrainingTarget.shape)
print(TrainingData.shape)

(55699,)
(41, 55699)


## Prepare Validation Data

In [17]:
ValDataAct = np.array(GenerateValTargetVector(RawTarget,ValidationPercent, (len(TrainingTarget))))
ValData    = GenerateValData(RawData,ValidationPercent, (len(TrainingTarget)))
print(ValDataAct.shape)
print(ValData.shape)

(6962,)
(41, 6962)


## Prepare Test Data

In [18]:
TestDataAct = np.array(GenerateValTargetVector(RawTarget,TestPercent, (len(TrainingTarget)+len(ValDataAct))))
TestData = GenerateValData(RawData,TestPercent, (len(TrainingTarget)+len(ValDataAct)))
print(ValDataAct.shape)
print(ValData.shape)

(6962,)
(41, 6962)


## Closed Form Solution [Finding Weights using Moore- Penrose pseudo- Inverse Matrix]

In [19]:
ErmsArr = []
AccuracyArr = []
#clustering and mean calculation using the KMeans algorithm from the training data
kmeans = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData))
Mu = kmeans.cluster_centers_

BigSigma     = GenerateBigSigma(RawData, Mu, TrainingPercent,IsSynthetic) #bigsigma function call on the raw data for the training dataset 
TRAINING_PHI = GetPhiMatrix(RawData, Mu, BigSigma, TrainingPercent) #the PHI-Matrix calculation
W            = GetWeightsClosedForm(TRAINING_PHI,TrainingTarget,(C_Lambda)) #weights calculation
TEST_PHI     = GetPhiMatrix(TestData, Mu, BigSigma, 100) #phi- matrix for the testing dataset
VAL_PHI      = GetPhiMatrix(ValData, Mu, BigSigma, 100) #phi-matrix for the validation dataset

In [20]:
print(Mu.shape)
print(BigSigma.shape)
print(BigSigma)
print(TRAINING_PHI.shape)
print(TRAINING_PHI)
print(W.shape)
print(VAL_PHI.shape)
print(TEST_PHI.shape)

(13, 41)
(41, 41)
[[1.10140157e+01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.30326118e+01 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.33558239e+01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.41323671e+01
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.23814043e+01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 3.59066427e-03]]
(55699, 13)
[[0.79372611 0.73109705 0.81817934 ... 0.80850518 0.97551599 0.80082838]
 [0.84232616 0.83462383 0.88485286 ... 0.78494303 0.73028441 0.84267422]
 [0.93354531 0.86331906 0.89669267 ... 0.92019345 0.85876407 0.9509906 ]
 ...
 [0.94027817 0.91949963 0.86474938 ... 0.89159314 0.83309511 0.94718168]
 [0.77112054 0.87069143 0.76303297 ... 0.6831929  0.59703667 0.79689493]


## Finding Erms on training, validation and test set 

In [21]:
TR_TEST_OUT  = GetValTest(TRAINING_PHI,W) #linear regression calculation on the training data
VAL_TEST_OUT = GetValTest(VAL_PHI,W)#linear regression calculation on the validation data
TEST_OUT     = GetValTest(TEST_PHI,W)#linear regression calculation on the test data

TrainingAccuracy   = str(GetErms(TR_TEST_OUT,TrainingTarget))
ValidationAccuracy = str(GetErms(VAL_TEST_OUT,ValDataAct))
TestAccuracy       = str(GetErms(TEST_OUT,TestDataAct))

In [22]:
print ('UBITname      = saikalya')
print ('Person Number = 50292522')
print ('----------------------------------------------------')
print ("------------------LeToR Data------------------------")
print ('----------------------------------------------------')
print ("-------Closed Form with Radial Basis Function-------")
print ('----------------------------------------------------')
print ("M = "+str(M)+" \nLambda = "+str(C_Lambda))
print ("E_rms Training   = " + str(float(TrainingAccuracy.split(',')[1])))
print ("E_rms Validation = " + str(float(ValidationAccuracy.split(',')[1])))
print ("E_rms Testing    = " + str(float(TestAccuracy.split(',')[1])))

UBITname      = saikalya
Person Number = 50292522
----------------------------------------------------
------------------LeToR Data------------------------
----------------------------------------------------
-------Closed Form with Radial Basis Function-------
----------------------------------------------------
M = 13 
Lambda = 0.03
E_rms Training   = 0.5474660225183801
E_rms Validation = 0.5379257255209654
E_rms Testing    = 0.627720731200836


## Gradient Descent solution for Linear Regression

In [261]:
print ('----------------------------------------------------')
print ('--------------Please Wait for 2 mins!----------------')
print ('----------------------------------------------------')

----------------------------------------------------
--------------Please Wait for 2 mins!----------------
----------------------------------------------------


In [23]:
W_Now        = np.dot(220, W)
La           = 2 #lambda value for the stochastic gradient descent solution
learningRate = 0.01 
L_Erms_Val   = []
L_Erms_TR    = []
L_Erms_Test  = []
W_Mat        = []

for i in range(0,400):
    #∇E = ∇ED + λ∇EW computation
    #print ('---------Iteration: ' + str(i) + '--------------')
    Delta_E_D     = -np.dot((TrainingTarget[i] - np.dot(np.transpose(W_Now),TRAINING_PHI[i])),TRAINING_PHI[i])
    # ∇ED calculation where ∇ED =−(t −w⊤φ(x))φ(x)
    La_Delta_E_W  = np.dot(La,W_Now) #λ∇EW
    Delta_E       = np.add(Delta_E_D,La_Delta_E_W)    
    Delta_W       = -np.dot(learningRate,Delta_E) #∆w = −η∇E 
    W_T_Next      = W_Now + Delta_W
    W_Now         = W_T_Next
    
    #Erms values for the training, validation and the testing data
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next) 
    Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
    L_Erms_TR.append(float(Erms_TR.split(',')[1]))
    
    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next) 
    Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
    L_Erms_Val.append(float(Erms_Val.split(',')[1]))
    
    #-----------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(TEST_PHI,W_T_Next) 
    Erms_Test = GetErms(TEST_OUT,TestDataAct)
    L_Erms_Test.append(float(Erms_Test.split(',')[1]))

In [24]:
print ('----------Gradient Descent Solution--------------------')
print ("M = "+str(M)+" \nLambda  = "+str(La)+"\neta=0.01")
print ("E_rms Training   = " + str(np.around(min(L_Erms_TR),5)))
print ("E_rms Validation = " + str(np.around(min(L_Erms_Val),5)))
print ("E_rms Testing    = " + str(np.around(min(L_Erms_Test),5)))

----------Gradient Descent Solution--------------------
M = 13 
Lambda  = 2
eta=0.01
E_rms Training   = 0.54757
E_rms Validation = 0.53805
E_rms Testing    = 0.62369
