In [406]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd 
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt

In [407]:
'''
Reading the data set which has both target data and the features, and then removing the synthetic
data features
''' 
data = pd.read_csv("combined.csv")
data = data.drop(["X_6","X_7","X_8","X_9","X_10"],axis=1)

In [408]:
'''
First clustering the data and predicting the class labels for each of the instance
'''
M = 10
kmeans = KMeans(n_clusters=M, random_state=421)
kmeans_data = kmeans.fit_predict(data.iloc[:,1:])
data = data.join(pd.DataFrame(kmeans_data,columns=["kmean_cluster_number"]))

'''
2D stratified sampling on the target value and the cluster number so that the algorithm which we will 
implement will have fair chances of learning all types of data.
'''
train,test_val = train_test_split(data,test_size = 0.2,stratify=data[["Target","kmean_cluster_number"]],random_state=42)
val,test = train_test_split(test_val,test_size = 0.5,stratify=test_val[["Target","kmean_cluster_number"]],random_state=24)

'''
Cluster number is not required now
'''
train = train.drop(["kmean_cluster_number"],axis=1)
test = test.drop(["kmean_cluster_number"],axis=1)
val = val.drop(["kmean_cluster_number"],axis=1)

mu = kmeans.cluster_centers_

In [359]:
'''
Splitting the labels form the data to train the data
'''
train_lab = train.iloc[:,0]
trainData = train.iloc[:,1:]
test_lab = test.iloc[:,0]
testData = test.iloc[:,1:]
val_lab = val.iloc[:,0]
valData = val.iloc[:,1:]


In [360]:
num_basis = len(mu)

def covar(trainData,num_basis):
    ''' 
    Getting the covar over the training data based on number of basics we have implemented
    Changed the spread for Gaussian radial basis function
    '''
    train_transpose = np.transpose(trainData)
    iden = np.identity(np.shape(train_transpose)[0])
    holdResult = []
    for i in range(0,np.shape(train_transpose)[0]):
        holdRow = []
        for j in range(0,len(trainData)):
            holdRow.append(train_transpose.iloc[i,j])
        iden[i] = np.dot(iden[i],np.dot(np.dot(200,i),np.var(holdRow)))
        #iden[i] = np.dot(iden[i],np.var(holdRow))
    return iden

covarMat = covar(trainData,num_basis)

In [361]:
def genPhi(train,covarMat,num_basis):
    '''
    Getting the Phi based on the covariance and number of basis
    '''
    phiMat = np.zeros((len(train),int(num_basis))) 
    covarMatInv = np.linalg.pinv(covarMat)
    for i in range(0,num_basis):
        for j in range(0,len(train)):
            subsResult = (np.subtract(train.iloc[j,],mu[i,]))
            L = np.dot(np.transpose(subsResult),covarMatInv)
            R = np.dot(L,subsResult)
            phiMat[j][i] = math.exp(-np.dot(0.5,R))
    return phiMat


In [362]:
print("Computing Phi will take lot of time. Please wait...")

Computing Phi will take lot of time. Please wait...


In [363]:
# Gen Phi Data
phiMat = genPhi(trainData,covarMat,num_basis)
test_phi = genPhi(testData,covarMat,num_basis)
val_phi = genPhi(valData,covarMat,num_basis)

In [364]:
print("Calculated Phi Matrix")

Calculated Phi Matrix


In [381]:
lam =0.01

def getWeights(train_lab,phiMat,lam):
    '''
    Getting the weights based on the Lambda and phi matrix
    '''
    iden = np.identity(len(phiMat[0]))
    ft = np.dot(iden,lam)
    st = np.dot(np.transpose(phiMat),phiMat)
    fmt = np.add(ft,st)
    fmt = np.linalg.pinv(fmt)
    smt = np.dot(np.transpose(phiMat),np.asarray(train_lab))
    wReg = np.dot(fmt,smt)
    return wReg

weights = getWeights(train_lab,phiMat,lam)

In [382]:
# Using same functions as the TA code had to check accuracy
def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y



In [383]:
TR_TEST_OUT  = GetValTest(phiMat,weights)
VAL_TEST_OUT = GetValTest(val_phi,weights)
TEST_OUT     = GetValTest(test_phi,weights)

TrainingAccuracy   = str(GetErms(TR_TEST_OUT,np.asarray(train_lab)))
ValidationAccuracy = str(GetErms(VAL_TEST_OUT,np.asarray(val_lab)))
TestAccuracy       = str(GetErms(TEST_OUT,np.asarray(test_lab)))

In [384]:
print ('UBITname      = XXXXXXXX')
print ('Person Number = YYYYYYYY')
print ('----------------------------------------------------')
print ("------------------LeToR Data------------------------")
print ('----------------------------------------------------')
print ("-------Closed Form with Radial Basis Function-------")
print ('----------------------------------------------------')
print ("M = 10 \nLambda = 0.9")
print ("E_rms Training   = " + str(float(TrainingAccuracy.split(',')[1])))
print ("E_rms Validation = " + str(float(ValidationAccuracy.split(',')[1])))
print ("E_rms Testing    = " + str(float(TestAccuracy.split(',')[1])))

UBITname      = XXXXXXXX
Person Number = YYYYYYYY
----------------------------------------------------
------------------LeToR Data------------------------
----------------------------------------------------
-------Closed Form with Radial Basis Function-------
----------------------------------------------------
M = 10 
Lambda = 0.9
E_rms Training   = 0.5561354877189318
E_rms Validation = 0.5576905260983049
E_rms Testing    = 0.553910623837981


In [403]:
np.random.seed(589)

alpha = 0.001
lam = 0.03
'''
Random initilization of weights
'''

def updateWeights(weights,phiMat,train_lab,alpha,lam):  
    midT = np.dot(np.transpose(prev_weight),phiMat)
    deltaL = -(np.subtract(train_lab,midT))
    deltaD = np.dot(deltaL,phiMat)
    deltaE = deltaD + np.dot(lam,prev_weight)

    delta = np.dot(-alpha,deltaE)
    new_weight = prev_weight + np.dot(delta,prev_weight)
    return new_weight

In [404]:
print("Output images are stored in un_sgd_10_basic_outputs folder")
train_lab = np.asarray(train_lab)
log_erms_val = []
log_erms_train = []
log_erms_test = []
prev_weight = np.random.rand(np.shape(weights)[0])
for i in range(0,400):
    print("Iteration: "+str(i))
    prev_weight = updateWeights(prev_weight,phiMat[i],train_lab[i],alpha,lam)
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(phiMat,prev_weight) 
    Erms_TR       = GetErms(TR_TEST_OUT,np.asarray(train_lab))
    log_erms_train.append(float(Erms_TR.split(',')[1]))
    print ('---------TrainingData Accuracy: ' + Erms_TR + '--------------')

    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(val_phi,prev_weight) 
    Erms_Val      = GetErms(VAL_TEST_OUT,np.asarray(val_lab))
    log_erms_val.append(float(Erms_Val.split(',')[1]))
    print ('---------ValidationData Accuracy: ' + Erms_Val + '--------------')

    #-----------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(test_phi,prev_weight) 
    Erms_Test = GetErms(TEST_OUT,np.asarray(test_lab))
    log_erms_test.append(float(Erms_Test.split(',')[1]))

    

Output images are stored in un_sgd_10_basic_outputs folder
Iteration: 0
---------TrainingData Accuracy: 0.001795396603109627,8.310911328137365--------------
---------ValidationData Accuracy: 0.0,8.309422694630255--------------
Iteration: 1
---------TrainingData Accuracy: 0.001795396603109627,6.695881353308438--------------
---------ValidationData Accuracy: 0.0,6.694442797281568--------------
Iteration: 2
---------TrainingData Accuracy: 0.001795396603109627,5.937250435607256--------------
---------ValidationData Accuracy: 0.0,5.935841976265347--------------
Iteration: 3
---------TrainingData Accuracy: 0.001795396603109627,5.238545016172257--------------
---------ValidationData Accuracy: 0.0,5.237170262241668--------------
Iteration: 4
---------TrainingData Accuracy: 0.001795396603109627,4.695878884191559--------------
---------ValidationData Accuracy: 0.0,4.694536031343548--------------
Iteration: 5
---------TrainingData Accuracy: 0.001795396603109627,4.173603661864502--------------
---

Iteration: 46
---------TrainingData Accuracy: 20.295163201551222,0.9826309877609457--------------
---------ValidationData Accuracy: 20.281528296466533,0.9824828377503587--------------
Iteration: 47
---------TrainingData Accuracy: 20.295163201551222,0.9603777720934008--------------
---------ValidationData Accuracy: 20.281528296466533,0.9602707469191898--------------
Iteration: 48
---------TrainingData Accuracy: 20.295163201551222,0.9393463082328715--------------
---------ValidationData Accuracy: 20.281528296466533,0.9392804567792011--------------
Iteration: 49
---------TrainingData Accuracy: 20.295163201551222,0.91962409686538--------------
---------ValidationData Accuracy: 20.281528296466533,0.9195990838337618--------------
Iteration: 50
---------TrainingData Accuracy: 20.295163201551222,0.9012412242366333--------------
---------ValidationData Accuracy: 20.281528296466533,0.9012563953625654--------------
Iteration: 51
---------TrainingData Accuracy: 20.295163201551222,0.883705870714638

Iteration: 91
---------TrainingData Accuracy: 20.295163201551222,0.6369196033271272--------------
---------ValidationData Accuracy: 20.281528296466533,0.6379494821003239--------------
Iteration: 92
---------TrainingData Accuracy: 20.295163201551222,0.6330902548186905--------------
---------ValidationData Accuracy: 20.281528296466533,0.6341476398365437--------------
Iteration: 93
---------TrainingData Accuracy: 20.295163201551222,0.6349888021016372--------------
---------ValidationData Accuracy: 20.281528296466533,0.6360324416757018--------------
Iteration: 94
---------TrainingData Accuracy: 20.295163201551222,0.6314000508057264--------------
---------ValidationData Accuracy: 20.281528296466533,0.632469858617784--------------
Iteration: 95
---------TrainingData Accuracy: 20.295163201551222,0.6279056285174576--------------
---------ValidationData Accuracy: 20.281528296466533,0.6290017029985848--------------
Iteration: 96
---------TrainingData Accuracy: 20.295163201551222,0.62445975025494

Iteration: 136
---------TrainingData Accuracy: 74.15885669144313,0.5798742986110335--------------
---------ValidationData Accuracy: 74.08790577420281,0.5814834872622222--------------
Iteration: 137
---------TrainingData Accuracy: 74.15885669144313,0.5790704438947233--------------
---------ValidationData Accuracy: 74.08790577420281,0.5806941735222645--------------
Iteration: 138
---------TrainingData Accuracy: 74.15885669144313,0.5783370795147659--------------
---------ValidationData Accuracy: 74.08790577420281,0.5799746247019072--------------
Iteration: 139
---------TrainingData Accuracy: 74.15885669144313,0.5790526471697779--------------
---------ValidationData Accuracy: 74.08790577420281,0.580676705557368--------------
Iteration: 140
---------TrainingData Accuracy: 74.15885669144313,0.5814088187378632--------------
---------ValidationData Accuracy: 74.08790577420281,0.5829917329575406--------------
Iteration: 141
---------TrainingData Accuracy: 74.15885669144313,0.5805127560901735---

Iteration: 181
---------TrainingData Accuracy: 74.15885669144313,0.573384182042684--------------
---------ValidationData Accuracy: 74.08790577420281,0.5751383135060022--------------
Iteration: 182
---------TrainingData Accuracy: 74.15885669144313,0.5730192934556549--------------
---------ValidationData Accuracy: 74.08790577420281,0.5747850140845551--------------
Iteration: 183
---------TrainingData Accuracy: 74.15885669144313,0.5726767148703271--------------
---------ValidationData Accuracy: 74.08790577420281,0.5744541021851226--------------
Iteration: 184
---------TrainingData Accuracy: 74.15885669144313,0.5723879612911604--------------
---------ValidationData Accuracy: 74.08790577420281,0.5741759258786323--------------
Iteration: 185
---------TrainingData Accuracy: 74.15885669144313,0.572113766616534--------------
---------ValidationData Accuracy: 74.08790577420281,0.5739125792852614--------------
Iteration: 186
---------TrainingData Accuracy: 74.15885669144313,0.5718781650525623----

Iteration: 226
---------TrainingData Accuracy: 74.15885669144313,0.5707385359936689--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726306992716529--------------
Iteration: 227
---------TrainingData Accuracy: 74.15885669144313,0.5707085005396221--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726088541642027--------------
Iteration: 228
---------TrainingData Accuracy: 74.15885669144313,0.5707505782507071--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726402858061641--------------
Iteration: 229
---------TrainingData Accuracy: 74.15885669144313,0.5707157420933047--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726136759266963--------------
Iteration: 230
---------TrainingData Accuracy: 74.15885669144313,0.5706967075632715--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726029140127233--------------
Iteration: 231
---------TrainingData Accuracy: 74.15885669144313,0.5707223838692689--

Iteration: 271
---------TrainingData Accuracy: 74.15885669144313,0.5709794631625347--------------
---------ValidationData Accuracy: 74.08790577420281,0.5728415788320879--------------
Iteration: 272
---------TrainingData Accuracy: 74.15885669144313,0.5708874186275611--------------
---------ValidationData Accuracy: 74.08790577420281,0.5727584053494079--------------
Iteration: 273
---------TrainingData Accuracy: 74.15885669144313,0.570814031976169--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726937161880435--------------
Iteration: 274
---------TrainingData Accuracy: 74.15885669144313,0.5707589594069963--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726471050785156--------------
Iteration: 275
---------TrainingData Accuracy: 74.15885669144313,0.5707207903677991--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726172601490838--------------
Iteration: 276
---------TrainingData Accuracy: 74.15885669144313,0.5706990227959376---

Iteration: 316
---------TrainingData Accuracy: 74.15885669144313,0.5708442081611624--------------
---------ValidationData Accuracy: 74.08790577420281,0.5727200699624138--------------
Iteration: 317
---------TrainingData Accuracy: 74.15885669144313,0.5707791023386459--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726638530799555--------------
Iteration: 318
---------TrainingData Accuracy: 74.15885669144313,0.5707345481683761--------------
---------ValidationData Accuracy: 74.08790577420281,0.5726275940649643--------------
Iteration: 319
---------TrainingData Accuracy: 74.15885669144313,0.570705141800678--------------
---------ValidationData Accuracy: 74.08790577420281,0.572606811171533--------------
Iteration: 320
---------TrainingData Accuracy: 74.15885669144313,0.570870418818638--------------
---------ValidationData Accuracy: 74.08790577420281,0.5727432535220074--------------
Iteration: 321
---------TrainingData Accuracy: 74.15885669144313,0.5707980042377625-----

Iteration: 361
---------TrainingData Accuracy: 74.15885669144313,0.5709999691878136--------------
---------ValidationData Accuracy: 74.08790577420281,0.5729609393640078--------------
Iteration: 362
---------TrainingData Accuracy: 74.15885669144313,0.5710970205241896--------------
---------ValidationData Accuracy: 74.08790577420281,0.5730650676543806--------------
Iteration: 363
---------TrainingData Accuracy: 74.15885669144313,0.5712031319181297--------------
---------ValidationData Accuracy: 74.08790577420281,0.573177953208187--------------
Iteration: 364
---------TrainingData Accuracy: 74.15885669144313,0.5713189465792406--------------
---------ValidationData Accuracy: 74.08790577420281,0.5733003543432528--------------
Iteration: 365
---------TrainingData Accuracy: 74.15885669144313,0.5714447540649614--------------
---------ValidationData Accuracy: 74.08790577420281,0.573432615193018--------------
Iteration: 366
---------TrainingData Accuracy: 74.15885669144313,0.5715806305531422----

KeyboardInterrupt: 

In [405]:

df = pd.DataFrame(log_erms_train)
ax = df.plot(figsize=(10,15))
ax.ticklabel_format(useOffset=False)

plt.savefig('./un_sgd_22_high_var_outputs/log_erms_train.png',bbox_inches='tight')

df = pd.DataFrame(log_erms_val)
ax = df.plot(figsize=(10,15))
ax.ticklabel_format(useOffset=False)

plt.savefig('./un_sgd_22_high_var_outputs/log_erms_val.png',bbox_inches='tight')
plt.close("all")

In [409]:
print ('----------Gradient Descent Solution--------------------')
print ("M = 10 \nLambda  = 0.001\neta=0.01")
print ("E_rms Training   = " + str(np.around(min(log_erms_train),5)))
print ("E_rms Validation = " + str(np.around(min(log_erms_val),5)))
print ("E_rms Testing    = " + str(np.around(min(log_erms_test),5)))

----------Gradient Descent Solution--------------------
M = 10 
Lambda  = 0.001
eta=0.01
E_rms Training   = 0.57069
E_rms Validation = 0.5726
E_rms Testing    = 0.56878


In [410]:
M

10

----------Gradient Descent Solution--------------------
M = 15 
Lambda  = 0.0001
eta=0.01
E_rms Training   = 0.64909
E_rms Validation = 0.6495
E_rms Testing    = 0.64725
