In [290]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd 
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt

In [291]:
'''
Reading the data set which has both target data and the features, and then removing the synthetic
data features
''' 
data = pd.read_csv("combined.csv")
data = data.drop(["X_6","X_7","X_8","X_9","X_10"],axis=1)

In [292]:
'''
First clustering the data and predicting the class labels for each of the instance
'''
M = 10
kmeans = KMeans(n_clusters=M, random_state=421)
kmeans_data = kmeans.fit_predict(data.iloc[:,1:])
data = data.join(pd.DataFrame(kmeans_data,columns=["kmean_cluster_number"]))

'''
2D stratified sampling on the target value and the cluster number so that the algorithm which we will 
implement will have fair chances of learning all types of data.
'''
train,test_val = train_test_split(data,test_size = 0.2,stratify=data[["Target","kmean_cluster_number"]],random_state=42)
val,test = train_test_split(test_val,test_size = 0.5,stratify=test_val[["Target","kmean_cluster_number"]],random_state=24)

'''
Cluster number is not required now
'''
train = train.drop(["kmean_cluster_number"],axis=1)
test = test.drop(["kmean_cluster_number"],axis=1)
val = val.drop(["kmean_cluster_number"],axis=1)

mu = kmeans.cluster_centers_

In [293]:
'''
Splitting the labels form the data to train the data
'''
train_lab = train.iloc[:,0]
trainData = train.iloc[:,1:]
test_lab = test.iloc[:,0]
testData = test.iloc[:,1:]
val_lab = val.iloc[:,0]
valData = val.iloc[:,1:]


In [294]:
num_basis = len(mu)

def covar(trainData,num_basis):
    ''' 
    Getting the covar over the training data based on number of basics we have implemented
    Changed the spread for Gaussian radial basis function
    '''
    train_transpose = np.transpose(trainData)
    iden = np.identity(np.shape(train_transpose)[0])
    holdResult = []
    for i in range(0,np.shape(train_transpose)[0]):
        holdRow = []
        for j in range(0,len(trainData)):
            holdRow.append(train_transpose.iloc[i,j])
        iden[i] = np.dot(iden[i],np.dot(np.dot(0.1,i),np.var(holdRow)))
        #iden[i] = np.dot(iden[i],np.var(holdRow))
    return iden

covarMat = covar(trainData,num_basis)

In [295]:
def genPhi(train,covarMat,num_basis):
    '''
    Getting the Phi based on the covariance and number of basis
    '''
    phiMat = np.zeros((len(train),int(num_basis))) 
    covarMatInv = np.linalg.pinv(covarMat)
    for i in range(0,num_basis):
        for j in range(0,len(train)):
            subsResult = (np.subtract(train.iloc[j,],mu[i,]))
            L = np.dot(np.transpose(subsResult),covarMatInv)
            R = np.dot(L,subsResult)
            phiMat[j][i] = math.exp(-np.dot(0.5,R))
    return phiMat


In [296]:
print("Computing Phi will take lot of time. Please wait...")

Computing Phi will take lot of time. Please wait...


In [297]:
# Gen Phi Data
phiMat = genPhi(trainData,covarMat,num_basis)
test_phi = genPhi(testData,covarMat,num_basis)
val_phi = genPhi(valData,covarMat,num_basis)

In [298]:
print("Calculated Phi Matrix")

Calculated Phi Matrix


In [299]:
lam =0.9

def getWeights(train_lab,phiMat,lam):
    '''
    Getting the weights based on the Lambda and phi matrix
    '''
    iden = np.identity(len(phiMat[0]))
    ft = np.dot(iden,lam)
    st = np.dot(np.transpose(phiMat),phiMat)
    fmt = np.add(ft,st)
    fmt = np.linalg.pinv(fmt)
    smt = np.dot(np.transpose(phiMat),np.asarray(train_lab))
    wReg = np.dot(fmt,smt)
    return wReg

weights = getWeights(train_lab,phiMat,lam)

In [300]:
# Using same functions as the TA code had to check accuracy
def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y



In [301]:
TR_TEST_OUT  = GetValTest(phiMat,weights)
VAL_TEST_OUT = GetValTest(val_phi,weights)
TEST_OUT     = GetValTest(test_phi,weights)

TrainingAccuracy   = str(GetErms(TR_TEST_OUT,np.asarray(train_lab)))
ValidationAccuracy = str(GetErms(VAL_TEST_OUT,np.asarray(val_lab)))
TestAccuracy       = str(GetErms(TEST_OUT,np.asarray(test_lab)))

In [302]:
print ('UBITname      = XXXXXXXX')
print ('Person Number = YYYYYYYY')
print ('----------------------------------------------------')
print ("------------------LeToR Data------------------------")
print ('----------------------------------------------------')
print ("-------Closed Form with Radial Basis Function-------")
print ('----------------------------------------------------')
print ("M = 10 \nLambda = 0.9")
print ("E_rms Training   = " + str(float(TrainingAccuracy.split(',')[1])))
print ("E_rms Validation = " + str(float(ValidationAccuracy.split(',')[1])))
print ("E_rms Testing    = " + str(float(TestAccuracy.split(',')[1])))

UBITname      = XXXXXXXX
Person Number = YYYYYYYY
----------------------------------------------------
------------------LeToR Data------------------------
----------------------------------------------------
-------Closed Form with Radial Basis Function-------
----------------------------------------------------
M = 10 
Lambda = 0.9
E_rms Training   = 0.6367624141821467
E_rms Validation = 0.6381572115772233
E_rms Testing    = 0.6332651852997802


In [303]:
np.random.seed(1741)

alpha = 0.01
lam = 0.0001
'''
Random initilization of weights
'''

def updateWeights(weights,phiMat,train_lab,alpha,lam):  
    midT = np.dot(np.transpose(prev_weight),phiMat)
    deltaL = -(np.subtract(train_lab,midT))
    deltaD = np.dot(deltaL,phiMat)
    deltaE = deltaD + np.dot(lam,prev_weight)

    delta = np.dot(-alpha,deltaE)
    new_weight = prev_weight + np.dot(delta,prev_weight)
    return new_weight

In [304]:
print("Output images are stored in un_sgd_10_basic_outputs folder")
train_lab = np.asarray(train_lab)
log_erms_val = []
log_erms_train = []
log_erms_test = []
prev_weight = np.random.rand(np.shape(weights)[0])
for i in range(0,150):
    print("Iteration: "+str(i))
    prev_weight = updateWeights(prev_weight,phiMat[i],train_lab[i],alpha,lam)
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(phiMat,prev_weight) 
    Erms_TR       = GetErms(TR_TEST_OUT,np.asarray(train_lab))
    log_erms_train.append(float(Erms_TR.split(',')[1]))
    print ('---------TrainingData Accuracy: ' + Erms_TR + '--------------')

    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(val_phi,prev_weight) 
    Erms_Val      = GetErms(VAL_TEST_OUT,np.asarray(val_lab))
    log_erms_val.append(float(Erms_Val.split(',')[1]))
    print ('---------ValidationData Accuracy: ' + Erms_Val + '--------------')

    #-----------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(test_phi,prev_weight) 
    Erms_Test = GetErms(TEST_OUT,np.asarray(test_lab))
    log_erms_test.append(float(Erms_Test.split(',')[1]))

    

Output images are stored in cl_sgd_10_basic_outputs folder
Iteration: 0
---------TrainingData Accuracy: 74.15526589823692,0.6454470081978807--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460655719009716--------------
Iteration: 1
---------TrainingData Accuracy: 74.15526589823692,0.6454474907468907--------------
---------ValidationData Accuracy: 74.14536052858374,0.646066069599656--------------
Iteration: 2
---------TrainingData Accuracy: 74.15526589823692,0.6454479444053669--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460665375002164--------------
Iteration: 3
---------TrainingData Accuracy: 74.15526589823692,0.6454484268786878--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460670351196102--------------
Iteration: 4
---------TrainingData Accuracy: 74.15526589823692,0.645448909313147--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460675326982926--------------
Iteration: 5
---------TrainingData Acc

Iteration: 45
---------TrainingData Accuracy: 74.15526589823692,0.6454282363469936--------------
---------ValidationData Accuracy: 74.14536052858374,0.64604621021451--------------
Iteration: 46
---------TrainingData Accuracy: 74.15526589823692,0.645424799607241--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460426653840439--------------
Iteration: 47
---------TrainingData Accuracy: 74.15526589823692,0.645421194604785--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460389469641111--------------
Iteration: 48
---------TrainingData Accuracy: 74.15526589823692,0.6454216775428865--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460394450980506--------------
Iteration: 49
---------TrainingData Accuracy: 74.15526589823692,0.6454221705391845--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460399536060266--------------
Iteration: 50
---------TrainingData Accuracy: 74.15526589823692,0.6454226549964375------------

---------TrainingData Accuracy: 74.15526589823692,0.6454236862619127--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460415170153956--------------
Iteration: 91
---------TrainingData Accuracy: 74.15526589823692,0.6454241706007661--------------
---------ValidationData Accuracy: 74.14536052858374,0.646042016590867--------------
Iteration: 92
---------TrainingData Accuracy: 74.15526589823692,0.6454246549023162--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460425161272223--------------
Iteration: 93
---------TrainingData Accuracy: 74.15526589823692,0.6454251391665843--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460430156244866--------------
Iteration: 94
---------TrainingData Accuracy: 74.15526589823692,0.6454256233935477--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460435150826302--------------
Iteration: 95
---------TrainingData Accuracy: 74.15526589823692,0.6454261169265813--------------
--------

Iteration: 135
---------TrainingData Accuracy: 74.15526589823692,0.6454093430893815--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460267223361588--------------
Iteration: 136
---------TrainingData Accuracy: 74.15526589823692,0.645409829460971--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460272240273924--------------
Iteration: 137
---------TrainingData Accuracy: 74.15526589823692,0.6454106077706057--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460280268507312--------------
Iteration: 138
---------TrainingData Accuracy: 74.15526589823692,0.6454110931169919--------------
---------ValidationData Accuracy: 74.14536052858374,0.646028527482788--------------
Iteration: 139
---------TrainingData Accuracy: 74.15526589823692,0.6454115891610064--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460290391487262--------------
Iteration: 140
---------TrainingData Accuracy: 74.15526589823692,0.6454121451264336----

Iteration: 180
---------TrainingData Accuracy: 74.15526589823692,0.6454320181866973--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460501109616216--------------
Iteration: 181
---------TrainingData Accuracy: 74.15526589823692,0.6454325018839733--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460506098643465--------------
Iteration: 182
---------TrainingData Accuracy: 74.15526589823692,0.6454329855579964--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460511087424465--------------
Iteration: 183
---------TrainingData Accuracy: 74.15526589823692,0.645433469180804--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460516075670785--------------
Iteration: 184
---------TrainingData Accuracy: 74.15526589823692,0.6454339472195542--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460521006315076--------------
Iteration: 185
---------TrainingData Accuracy: 74.15526589823692,0.6454344307683617---

Iteration: 225
---------TrainingData Accuracy: 74.15526589823692,0.6454522651982575--------------
---------ValidationData Accuracy: 74.14536052858374,0.646070993911365--------------
Iteration: 226
---------TrainingData Accuracy: 74.15526589823692,0.645452787223674--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460715323183512--------------
Iteration: 227
---------TrainingData Accuracy: 74.15526589823692,0.6454532693227123--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460720295453817--------------
Iteration: 228
---------TrainingData Accuracy: 74.15526589823692,0.6454537513846671--------------
---------ValidationData Accuracy: 74.14536052858374,0.646072526733536--------------
Iteration: 229
---------TrainingData Accuracy: 74.15526589823692,0.6454470691062211--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460656347215734--------------
Iteration: 230
---------TrainingData Accuracy: 74.15526589823692,0.6454475517105875-----

Iteration: 270
---------TrainingData Accuracy: 74.15526589823692,0.6454504184633563--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460690892185356--------------
Iteration: 271
---------TrainingData Accuracy: 74.15526589823692,0.6454512785896256--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460699763401594--------------
Iteration: 272
---------TrainingData Accuracy: 74.15526589823692,0.6454517608047091--------------
---------ValidationData Accuracy: 74.14536052858374,0.646070473688849--------------
Iteration: 273
---------TrainingData Accuracy: 74.15526589823692,0.6454522429827005--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460709709986504--------------
Iteration: 274
---------TrainingData Accuracy: 74.15526589823692,0.645452725123598--------------
---------ValidationData Accuracy: 74.14536052858374,0.646071468269568--------------
Iteration: 275
---------TrainingData Accuracy: 74.15526589823692,0.6454532072275656-----

Iteration: 315
---------TrainingData Accuracy: 74.15526589823692,0.6454705954464237--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460898989352621--------------
Iteration: 316
---------TrainingData Accuracy: 74.15526589823692,0.6454710879170358--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460904068355862--------------
Iteration: 317
---------TrainingData Accuracy: 74.15526589823692,0.6454715686090008--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460909025875964--------------
Iteration: 318
---------TrainingData Accuracy: 74.15526589823692,0.6454737071806179--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460931081529382--------------
Iteration: 319
---------TrainingData Accuracy: 74.15526589823692,0.6454741876726116--------------
---------ValidationData Accuracy: 74.14536052858374,0.6460936036953148--------------
Iteration: 320
---------TrainingData Accuracy: 74.15526589823692,0.6454746681263674--

Iteration: 360
---------TrainingData Accuracy: 74.15526589823692,0.6454881065676664--------------
---------ValidationData Accuracy: 74.14536052858374,0.6461079582999111--------------
Iteration: 361
---------TrainingData Accuracy: 74.15526589823692,0.645488845431101--------------
---------ValidationData Accuracy: 74.14536052858374,0.6461087202777585--------------
Iteration: 362
---------TrainingData Accuracy: 74.15526589823692,0.6454897556282864--------------
---------ValidationData Accuracy: 74.14536052858374,0.6461096589472956--------------
Iteration: 363
---------TrainingData Accuracy: 74.15526589823692,0.6454902348865942--------------
---------ValidationData Accuracy: 74.14536052858374,0.6461101531966325--------------
Iteration: 364
---------TrainingData Accuracy: 74.15526589823692,0.6454907141076879--------------
---------ValidationData Accuracy: 74.14536052858374,0.646110647406984--------------
Iteration: 365
---------TrainingData Accuracy: 74.15526589823692,0.6454902045036486----

In [305]:

df = pd.DataFrame(log_erms_train)
ax = df.plot(figsize=(10,15))
ax.ticklabel_format(useOffset=False)

plt.savefig('./un_sgd_10_basic_outputs/log_erms_train.png',bbox_inches='tight')

df = pd.DataFrame(log_erms_val)
ax = df.plot(figsize=(10,15))
ax.ticklabel_format(useOffset=False)

plt.savefig('./un_sgd_10_basic_outputs/log_erms_val.png',bbox_inches='tight')
plt.close("all")

In [306]:
print ('----------Gradient Descent Solution--------------------')
print ("M = 10 \nLambda  = 0.001\neta=0.01")
print ("E_rms Training   = " + str(np.around(min(log_erms_train),5)))
print ("E_rms Validation = " + str(np.around(min(log_erms_val),5)))
print ("E_rms Testing    = " + str(np.around(min(log_erms_test),5)))

----------Gradient Descent Solution--------------------
M = 10 
Lambda  = 0.001
eta=0.01
E_rms Training   = 0.64541
E_rms Validation = 0.64603
E_rms Testing    = 0.64327


----------Gradient Descent Solution--------------------
M = 15 
Lambda  = 0.0001
eta=0.01
E_rms Training   = 0.64909
E_rms Validation = 0.6495
E_rms Testing    = 0.64725
