In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import math
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', 500)

In [2]:
master_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv")
pos_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv")
neg_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv")

In [3]:
def create_setting_one(master_data,pos_data):
    raw_data_temp = pd.concat([pos_data.set_index('img_id_A'),master_data.set_index('img_id')],axis=1,join='inner').reset_index()
    raw_data_feature_concat = pd.concat([raw_data_temp.set_index('img_id_B'),master_data.set_index('img_id')],axis=1,join='inner').reset_index()
    raw_data_feature_concat.drop(['Unnamed: 0'],axis=1,inplace=True)
    col_rename = ['img_id_B','img_id_A','target']
    for columns in range(1,len(list(raw_data_feature_concat.columns)[3:])+1):
        if(columns < 10):
            col_rename.append("fa"+str(columns))
        else:
            col_rename.append("fb"+str(columns - 9))
    raw_data_feature_concat.columns = col_rename
    col_rename.append(col_rename.pop(2))
    temp = col_rename[0]
    col_rename[0] = col_rename[1]
    col_rename[1] = temp
    raw_data_feature_concat = raw_data_feature_concat[col_rename]
    return raw_data_feature_concat

def create_setting_two(raw_data_feature_concat):
    raw_data_feature_subs = raw_data_feature_concat.copy()
    for columns in range(1,int((len(list(raw_data_feature_subs.columns))-3)/2+1)):
        raw_data_feature_subs['fm'+str(columns)] = abs(raw_data_feature_subs['fa'+str(columns)] - raw_data_feature_subs['fb'+str(columns)])
        raw_data_feature_subs.drop('fa'+str(columns),axis=1,inplace=True)
        raw_data_feature_subs.drop('fb'+str(columns),axis=1,inplace=True)
    col_swap = list(raw_data_feature_subs.columns)
    col_swap.append(col_swap.pop(2))
    raw_data_feature_subs=raw_data_feature_subs[col_swap]
    return raw_data_feature_subs

def representativeClustering(data,sizeOfTheCluster,seed):
    kmeans = KMeans(n_clusters=sizeOfTheCluster, random_state=seed)
    kmeans_data = kmeans.fit_predict(data.iloc[:,data.columns != 'target'])
    data = data.join(pd.DataFrame(kmeans_data,columns=["kmean_cluster_number"]))
    '''
    2D stratified sampling on the target value and the cluster number so that the algorithm which we will 
    implement will have fair chances of learning all types of data.
    '''
    train,test_val = train_test_split(data,test_size = 0.2,stratify=data[["target","kmean_cluster_number"]],random_state=seed)
    val,test = train_test_split(test_val,test_size = 0.5,stratify=test_val[["target","kmean_cluster_number"]],random_state=seed)
    '''
    Cluster number is not required now
    '''
    train = train.drop(["kmean_cluster_number"],axis=1)
    test = test.drop(["kmean_cluster_number"],axis=1)
    val = val.drop(["kmean_cluster_number"],axis=1)

    mu = kmeans.cluster_centers_
    return train,test,val,mu

# Linear Regression Functions Development
def covar(trainData,num_basis):
    ''' 
    Getting the covar over the training data based on number of basics we have implemented
    Changed the spread for Gaussian radial basis function
    '''
    #print("Using Uniform Gaussian radial basis function")
    train_transpose = np.transpose(trainData)
    iden = np.identity(np.shape(train_transpose)[0])
    holdResult = []
    for i in range(0,np.shape(train_transpose)[0]):
        holdRow = []
        for j in range(0,len(trainData)):
            holdRow.append(train_transpose.iloc[i,j])
        # EDIT HERE FOR PRECISION AND NON UNIFORM RADIAL BASICS
       	iden[i] = np.dot(iden[i],np.dot(np.dot(200,i),np.var(holdRow)))
    return iden

def genPhi(train,covarMat,num_basis,mu):
    '''
    Getting the Phi based on the covariance and number of basis
    '''
    phiMat = np.zeros((len(train),int(num_basis))) 
    covarMatInv = np.linalg.inv(covarMat)
    for i in range(0,num_basis):
        for j in range(0,len(train)):
            subsResult = (np.subtract(train.iloc[j,],mu[i,]))
            L = np.dot(np.transpose(subsResult),covarMatInv)
            R = np.dot(L,subsResult)
            phiMat[j][i] = math.exp(-np.dot(0.5,R))
    return phiMat

def updateWeights(weights,phiMat,train_lab,alpha,lam): 
    midT = np.dot(np.transpose(weights),phiMat)
    deltaL = -(np.subtract(train_lab,midT))
    deltaD = np.dot(float(deltaL),phiMat)
    deltaE = np.transpose(np.matrix(deltaD)) + np.dot(lam,weights)

    delta = np.dot(-alpha,deltaE)
    new_weight = weights + delta
    return new_weight

def GetValTest(VAL_PHI,W):
    Y = np.dot(np.transpose(prev_weight),np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y

def GetErms(valData,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(valData)):
        sum = sum + math.pow((ValDataAct[i] - valData[i]),2)
        if(int(np.around(valData[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(valData)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(valData))))

def plotData(log_erms_train,log_erms_val,log_erms_test):
    writePlot('log_erms_train',log_erms_train)
    writePlot('log_erms_val',log_erms_val)
    writePlot('log_erms_test',log_erms_test)
    return True

def writePlot(filename,log):
    df = pd.DataFrame(log)
    ax = df.plot(figsize=(10,15))
    ax.ticklabel_format(useOffset=False)

    plt.savefig(('./'+filename+'.png'),bbox_inches='tight')
    plt.close("all")


In [4]:
raw_data_feature_concat_pos = create_setting_one(master_data,pos_data)
raw_data_feature_concat_neg = create_setting_one(master_data,neg_data)
raw_data_feature_subs_pos = create_setting_two(raw_data_feature_concat_pos)
raw_data_feature_subs_neg = create_setting_two(raw_data_feature_concat_neg)

In [5]:
len(raw_data_feature_subs_neg) % len(raw_data_feature_subs_pos)

362

In [6]:
# Unseen Writer partitions

raw_data_feature_concat_pos[['A','A_imgNo']] = raw_data_feature_concat_pos['img_id_A'].str.extract('(\d\d\d\d)([a-z])', expand=False)
raw_data_feature_concat_pos[['B','B_imgNo']] = raw_data_feature_concat_pos['img_id_B'].str.extract('(\d\d\d\d)([a-z])', expand=False)
#raw_data_feature_concat['img_id_A'].str.extract('(?P<writerA>\d\d\d\d)(?P<imageNo>[abcd])', expand=False)
raw_data_feature_concat_neg[['A','A_imgNo']] = raw_data_feature_concat_neg['img_id_A'].str.extract('(\d\d\d\d)([a-z])', expand=False)
raw_data_feature_concat_neg[['B','B_imgNo']] = raw_data_feature_concat_neg['img_id_B'].str.extract('(\d\d\d\d)([a-z])', expand=False)

In [7]:
raw_data = pd.concat([raw_data_feature_concat_pos,raw_data_feature_concat_neg])
#pd.DataFrame.to_csv(raw_data,'../feature_concat_data.csv')

In [8]:
data = raw_data.iloc[:,2:21]
M = 4
train,test,val,mu = representativeClustering(data=data,sizeOfTheCluster=M,seed=421)
train_lab = train.iloc[:,train.columns == 'target']
val_lab = val.iloc[:,val.columns == 'target']
test_lab = test.iloc[:,test.columns == 'target']
train = train.iloc[:,train.columns != 'target']
val = val.iloc[:,val.columns != 'target']
test = val.iloc[:,test.columns != 'target']
#print(data.head())

In [None]:
covarMat = covar(train,M)

In [None]:
#print(" Getting the covar over the training data based on number of basics we have implemented")
#covarMat = covar(train,M)
phiMat = genPhi(train,covarMat,M,mu)
valMat = genPhi(val,covarMat,M,mu)
testMat = genPhi(test,covarMat,M,mu)

In [54]:
train_lab = np.asarray(train_lab)
log_erms_val = []
log_erms_train = []
log_erms_test = []
np.random.seed(589)
prev_weight = np.matrix(np.random.rand(M,1))
alpha = 0.01
lam = 0.03

for i in range(0,len(train)):
    print("Iteration: "+str(i))
    prev_weight = updateWeights(prev_weight,phiMat[i],train_lab[i],alpha,lam)
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(phiMat,prev_weight) 
    Erms_TR       = GetErms(np.transpose(TR_TEST_OUT),np.asarray(train_lab))
    log_erms_train.append(float(Erms_TR.split(',')[1]))
    print ('---------TrainingData Accuracy: ' + Erms_TR + '--------------')

    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(valMat,prev_weight) 
    Erms_Val      = GetErms(np.transpose(VAL_TEST_OUT),np.asarray(val_lab))
    log_erms_val.append(float(Erms_Val.split(',')[1]))
    print ('---------ValidationData Accuracy: ' + Erms_Val + '--------------')
    #---------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(testMat,prev_weight) 
    Erms_Test = GetErms(np.transpose(TEST_OUT),np.asarray(test_lab))
    log_erms_test.append(float(Erms_Test.split(',')[1]))

Iteration: 0
---------TrainingData Accuracy: 0.0,3.4949406583715357--------------
---------ValidationData Accuracy: 0.0,3.4927867878967396--------------
Iteration: 1
---------TrainingData Accuracy: 0.0,3.189814283124374--------------
---------ValidationData Accuracy: 0.0,3.1878110402627873--------------
Iteration: 2
---------TrainingData Accuracy: 0.0,2.9058739226554673--------------
---------ValidationData Accuracy: 0.0,2.9040117046179805--------------
Iteration: 3
---------TrainingData Accuracy: 0.0,2.646797591862357--------------
---------ValidationData Accuracy: 0.0,2.645064050880678--------------
Iteration: 4
---------TrainingData Accuracy: 0.0,2.4107543774934097--------------
---------ValidationData Accuracy: 0.0,2.409138088829542--------------
Iteration: 5
---------TrainingData Accuracy: 0.0,2.195542112696259--------------
---------ValidationData Accuracy: 0.0,2.1940327182162056--------------
Iteration: 6
---------TrainingData Accuracy: 0.0,2.0047515982458965--------------
-----

Iteration: 54
---------TrainingData Accuracy: 100.0,0.020842377995517084--------------
---------ValidationData Accuracy: 100.0,0.020493407047863434--------------
Iteration: 55
---------TrainingData Accuracy: 100.0,0.018523465357383834--------------
---------ValidationData Accuracy: 100.0,0.018190040844524885--------------
Iteration: 56
---------TrainingData Accuracy: 100.0,0.016396446346544385--------------
---------ValidationData Accuracy: 100.0,0.016082144779928025--------------
Iteration: 57
---------TrainingData Accuracy: 100.0,0.014582285465416256--------------
---------ValidationData Accuracy: 100.0,0.01429012452654311--------------
Iteration: 58
---------TrainingData Accuracy: 100.0,0.012996855076333712--------------
---------ValidationData Accuracy: 100.0,0.012731085391969372--------------
Iteration: 59
---------TrainingData Accuracy: 100.0,0.011635602881495039--------------
---------ValidationData Accuracy: 100.0,0.01140092808568485--------------
Iteration: 60
---------Trainin

Iteration: 107
---------TrainingData Accuracy: 100.0,0.0070906956834987265--------------
---------ValidationData Accuracy: 100.0,0.007336508589776163--------------
Iteration: 108
---------TrainingData Accuracy: 100.0,0.00720816308804481--------------
---------ValidationData Accuracy: 100.0,0.007482694621942055--------------
Iteration: 109
---------TrainingData Accuracy: 100.0,0.007117586846133343--------------
---------ValidationData Accuracy: 100.0,0.007371644021433435--------------
Iteration: 110
---------TrainingData Accuracy: 100.0,0.007269396183187418--------------
---------ValidationData Accuracy: 100.0,0.007555230888557294--------------
Iteration: 111
---------TrainingData Accuracy: 100.0,0.007071250847384422--------------
---------ValidationData Accuracy: 100.0,0.0073115236117811695--------------
Iteration: 112
---------TrainingData Accuracy: 100.0,0.007256110537002226--------------
---------ValidationData Accuracy: 100.0,0.007540024776041093--------------
Iteration: 113
------

---------ValidationData Accuracy: 100.0,0.008378161770497728--------------
Iteration: 161
---------TrainingData Accuracy: 100.0,0.008082737856481989--------------
---------ValidationData Accuracy: 100.0,0.008449265476949245--------------
Iteration: 162
---------TrainingData Accuracy: 100.0,0.007672814083834863--------------
---------ValidationData Accuracy: 100.0,0.00801186987250472--------------
Iteration: 163
---------TrainingData Accuracy: 100.0,0.007842481934363071--------------
---------ValidationData Accuracy: 100.0,0.008194481250524082--------------
Iteration: 164
---------TrainingData Accuracy: 100.0,0.0080915616962878--------------
---------ValidationData Accuracy: 100.0,0.008458873562803228--------------
Iteration: 165
---------TrainingData Accuracy: 100.0,0.008124279072877475--------------
---------ValidationData Accuracy: 100.0,0.008493439535066882--------------
Iteration: 166
---------TrainingData Accuracy: 100.0,0.008220955768271709--------------
---------ValidationData A

---------TrainingData Accuracy: 100.0,0.007874672122764288--------------
---------ValidationData Accuracy: 100.0,0.00823425154565344--------------
Iteration: 215
---------TrainingData Accuracy: 100.0,0.007938178584693612--------------
---------ValidationData Accuracy: 100.0,0.00830186109119536--------------
Iteration: 216
---------TrainingData Accuracy: 100.0,0.007487606245094768--------------
---------ValidationData Accuracy: 100.0,0.007815937924517968--------------
Iteration: 217
---------TrainingData Accuracy: 100.0,0.007633676162917792--------------
---------ValidationData Accuracy: 100.0,0.007975731715553012--------------
Iteration: 218
---------TrainingData Accuracy: 100.0,0.007716442467034663--------------
---------ValidationData Accuracy: 100.0,0.008065280053667273--------------
Iteration: 219
---------TrainingData Accuracy: 100.0,0.007847949657174506--------------
---------ValidationData Accuracy: 100.0,0.008206296013747836--------------
Iteration: 220
---------TrainingData Ac

Iteration: 265
---------TrainingData Accuracy: 100.0,0.0069700365584963045--------------
---------ValidationData Accuracy: 100.0,0.0072072343966068314--------------
Iteration: 266
---------TrainingData Accuracy: 100.0,0.0069345534544352485--------------
---------ValidationData Accuracy: 100.0,0.007146981865137669--------------
Iteration: 267
---------TrainingData Accuracy: 100.0,0.006965907461146818--------------
---------ValidationData Accuracy: 100.0,0.00720157800810588--------------
Iteration: 268
---------TrainingData Accuracy: 100.0,0.0070054960305734--------------
---------ValidationData Accuracy: 100.0,0.007258130367509581--------------
Iteration: 269
---------TrainingData Accuracy: 100.0,0.0071365564348413795--------------
---------ValidationData Accuracy: 100.0,0.00742312047193084--------------
Iteration: 270
---------TrainingData Accuracy: 100.0,0.007006960528993645--------------
---------ValidationData Accuracy: 100.0,0.007260550682356727--------------
Iteration: 271
-------

---------ValidationData Accuracy: 100.0,0.008554432958570436--------------
Iteration: 316
---------TrainingData Accuracy: 100.0,0.008258789685962211--------------
---------ValidationData Accuracy: 100.0,0.008648815644062977--------------
Iteration: 317
---------TrainingData Accuracy: 100.0,0.007689552948656165--------------
---------ValidationData Accuracy: 100.0,0.0080470137755907--------------
Iteration: 318
---------TrainingData Accuracy: 100.0,0.007891322576147433--------------
---------ValidationData Accuracy: 100.0,0.008262535500788938--------------
Iteration: 319
---------TrainingData Accuracy: 100.0,0.007945973732428535--------------
---------ValidationData Accuracy: 100.0,0.008320525416468508--------------
Iteration: 320
---------TrainingData Accuracy: 100.0,0.008194038461235518--------------
---------ValidationData Accuracy: 100.0,0.008581509517468722--------------
Iteration: 321
---------TrainingData Accuracy: 100.0,0.008186583851120047--------------
---------ValidationData 

---------ValidationData Accuracy: 100.0,0.007099477188491574--------------
Iteration: 367
---------TrainingData Accuracy: 100.0,0.006922739079142074--------------
---------ValidationData Accuracy: 100.0,0.007164124533242608--------------
Iteration: 368
---------TrainingData Accuracy: 100.0,0.006933534517323787--------------
---------ValidationData Accuracy: 100.0,0.007180667658507611--------------
Iteration: 369
---------TrainingData Accuracy: 100.0,0.007013459934280122--------------
---------ValidationData Accuracy: 100.0,0.007288548980511126--------------
Iteration: 370
---------TrainingData Accuracy: 100.0,0.007132026642723719--------------
---------ValidationData Accuracy: 100.0,0.00743303740543029--------------
Iteration: 371
---------TrainingData Accuracy: 100.0,0.0072411395067892605--------------
---------ValidationData Accuracy: 100.0,0.007559484751964275--------------
Iteration: 372
---------TrainingData Accuracy: 100.0,0.007292341373427175--------------
---------ValidationDat

---------TrainingData Accuracy: 100.0,0.008574988663928305--------------
---------ValidationData Accuracy: 100.0,0.008985771938277638--------------
Iteration: 418
---------TrainingData Accuracy: 100.0,0.008056309299259363--------------
---------ValidationData Accuracy: 100.0,0.00844609244738613--------------
Iteration: 419
---------TrainingData Accuracy: 100.0,0.008225606652053023--------------
---------ValidationData Accuracy: 100.0,0.008623349806019184--------------
Iteration: 420
---------TrainingData Accuracy: 100.0,0.008370419175967024--------------
---------ValidationData Accuracy: 100.0,0.008774157099138466--------------
Iteration: 421
---------TrainingData Accuracy: 100.0,0.007565567332205827--------------
---------ValidationData Accuracy: 100.0,0.00792420328170513--------------
Iteration: 422
---------TrainingData Accuracy: 100.0,0.007305105654808124--------------
---------ValidationData Accuracy: 100.0,0.007638060601564419--------------
Iteration: 423
---------TrainingData Ac

---------ValidationData Accuracy: 100.0,0.008071197349121002--------------
Iteration: 519
---------TrainingData Accuracy: 100.0,0.007420194690759046--------------
---------ValidationData Accuracy: 100.0,0.007776313133119058--------------
Iteration: 520
---------TrainingData Accuracy: 100.0,0.00703705705737956--------------
---------ValidationData Accuracy: 100.0,0.007341863418662295--------------
Iteration: 521
---------TrainingData Accuracy: 100.0,0.007095097139606682--------------
---------ValidationData Accuracy: 100.0,0.007410740813346602--------------
Iteration: 522
---------TrainingData Accuracy: 100.0,0.0072122950287348865--------------
---------ValidationData Accuracy: 100.0,0.0075455803915578335--------------
Iteration: 523
---------TrainingData Accuracy: 100.0,0.007216021452604112--------------
---------ValidationData Accuracy: 100.0,0.007549916324963094--------------
Iteration: 524
---------TrainingData Accuracy: 100.0,0.0073207655806823485--------------
---------ValidationD

---------TrainingData Accuracy: 100.0,0.007311249734763848--------------
---------ValidationData Accuracy: 100.0,0.007661714079533345--------------
Iteration: 570
---------TrainingData Accuracy: 100.0,0.0074802887637793435--------------
---------ValidationData Accuracy: 100.0,0.007847074155087052--------------
Iteration: 571
---------TrainingData Accuracy: 100.0,0.007660736738126555--------------
---------ValidationData Accuracy: 100.0,0.008041443958905568--------------
Iteration: 572
---------TrainingData Accuracy: 100.0,0.007798351982898016--------------
---------ValidationData Accuracy: 100.0,0.008187973913118974--------------
Iteration: 573
---------TrainingData Accuracy: 100.0,0.008075606282747984--------------
---------ValidationData Accuracy: 100.0,0.008479850287713187--------------
Iteration: 574
---------TrainingData Accuracy: 100.0,0.007451843523754756--------------
---------ValidationData Accuracy: 100.0,0.00781656286174244--------------
Iteration: 575
---------TrainingData 

In [55]:
plotData(log_erms_train,log_erms_val,log_erms_test)

True