In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', 500)

In [3]:
# Feature tables; create_setting_one looks images up in these by their `img_id` column.
hum_obs_master_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv")
gsc_master_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/GSC-Features.csv")
# "same" pair lists, used below as the positive (`*_pos_*`) pair tables.
hum_obs_pos_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv")
gsc_pos_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/same_pairs.csv")
# "diffn" pair lists, used below as the negative (`*_neg_*`) pair tables.
# NOTE(review): presumably pairs written by different writers - confirm against the dataset description.
hum_obs_neg_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv")
gsc_neg_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/diffn_pairs.csv")

In [4]:
def create_setting_one(master_data,pos_data):
    '''
    Build the "feature concatenation" table for a list of image pairs.

    Every row of pos_data names two images (img_id_A, img_id_B) and a target.
    The features of image A are attached as fa1..faN and the features of
    image B as fb1..fbN, where N is the number of feature columns found in
    master_data (so the function is not tied to a 9-feature table).

    Parameters
    ----------
    master_data : DataFrame with an 'img_id' column and one column per
        feature. An 'Unnamed: 0' column is ignored when present.
    pos_data : DataFrame with 'img_id_A', 'img_id_B' and 'target' columns.

    Returns
    -------
    DataFrame with columns img_id_A, img_id_B, fa1..faN, fb1..fbN, target and
    a fresh 0..n-1 index. Pairs that reference an img_id that is missing from
    master_data are dropped (inner join).
    '''
    features = master_data.drop(columns=['Unnamed: 0'], errors='ignore')
    feature_cols = [col for col in features.columns if col != 'img_id']
    fa_names = ['fa' + str(pos) for pos in range(1, len(feature_cols) + 1)]
    fb_names = ['fb' + str(pos) for pos in range(1, len(feature_cols) + 1)]

    # One renamed copy of the feature table per side of the pair, keyed by
    # the pair column it will be matched against.
    rename_a = dict(zip(feature_cols, fa_names))
    rename_a['img_id'] = 'img_id_A'
    rename_b = dict(zip(feature_cols, fb_names))
    rename_b['img_id'] = 'img_id_B'

    # Key-based merges (rather than index-aligned concat) also work when the
    # same image id appears in many pairs, and keep the pair order of pos_data.
    pairs = pos_data[['img_id_A', 'img_id_B', 'target']]
    merged = pairs.merge(features.rename(columns=rename_a), on='img_id_A', how='inner')
    merged = merged.merge(features.rename(columns=rename_b), on='img_id_B', how='inner')

    ordered = ['img_id_A', 'img_id_B'] + fa_names + fb_names + ['target']
    return merged[ordered]

def create_setting_two(raw_data_feature_concat):
    '''
    Derive the "feature subtraction" table from a feature-concatenation table.

    Each faK / fbK column pair is replaced by one column fmK = |faK - fbK|.
    The number of pairs is taken from the column count (all columns except
    three are assumed to be fa/fb columns). The input frame is not modified.
    After the fa/fb columns are removed, the column left at position 2
    (target, for frames produced by create_setting_one) is moved to the end.
    '''
    result = raw_data_feature_concat.copy()
    stop = int((len(result.columns) - 3) / 2 + 1)
    consumed = []
    for k in range(1, stop):
        left_name = 'fa' + str(k)
        right_name = 'fb' + str(k)
        result['fm' + str(k)] = (result[left_name] - result[right_name]).abs()
        consumed.extend([left_name, right_name])
    result = result.drop(consumed, axis=1)

    ordering = result.columns.tolist()
    ordering.append(ordering.pop(2))
    return result[ordering]

def representativeClustering(data,sizeOfTheCluster,seed):
    '''
    Cluster the feature columns with k-means and split the rows 80/10/10.

    Parameters
    ----------
    data : DataFrame of feature columns plus a 'target' column.
    sizeOfTheCluster : number of k-means clusters.
    seed : random_state used for k-means and for both splits.

    Returns
    -------
    (train, test, val, mu) - note the order. The three frames keep the
    'target' column; mu is the fitted KMeans.cluster_centers_ array.
    '''
    kmeans = KMeans(n_clusters=sizeOfTheCluster, random_state=seed)
    cluster_ids = kmeans.fit_predict(data.iloc[:,data.columns != 'target'])
    # Attach the labels by position. Joining a fresh DataFrame would align on
    # index labels and silently produce NaN / mismatched cluster numbers
    # whenever `data` does not carry a default 0..n-1 index.
    data = data.assign(kmean_cluster_number=cluster_ids)

    # 2D stratified sampling on the target value and the cluster number so
    # that every split sees every (target, cluster) combination.
    strat_cols = ["target","kmean_cluster_number"]
    train,test_val = train_test_split(data,test_size = 0.2,stratify=data[strat_cols],random_state=seed)
    val,test = train_test_split(test_val,test_size = 0.5,stratify=test_val[strat_cols],random_state=seed)

    # The cluster number was only needed for stratification.
    train = train.drop(["kmean_cluster_number"],axis=1)
    test = test.drop(["kmean_cluster_number"],axis=1)
    val = val.drop(["kmean_cluster_number"],axis=1)

    mu = kmeans.cluster_centers_
    return train,test,val,mu

# Linear Regression Functions Development
def covar(trainData,num_basis):
    '''
    Build the diagonal spread matrix used by the Gaussian radial basis functions.

    Entry (i, i) is 200 * i * var(column i of trainData), where var is the
    population variance (np.var default); every off-diagonal entry is 0.

    Parameters
    ----------
    trainData : DataFrame of numeric feature columns.
    num_basis : accepted for call compatibility; not used.

    Returns
    -------
    (n_features, n_features) float ndarray.
    '''
    n_features = np.shape(trainData)[1]
    spread = np.identity(n_features)
    for i in range(n_features):
        # One variance per feature column, computed on the whole column at
        # once instead of collecting it element by element.
        column_var = np.var(np.array(trainData.iloc[:, i]))
        # EDIT HERE FOR PRECISION AND NON UNIFORM RADIAL BASIS
        # NOTE(review): the factor 200 * i grows with the feature position and
        # is 0 for the first feature, which zeroes that diagonal entry.
        # Kept as-is; confirm this non-uniform spread is intended.
        spread[i] = spread[i] * ((200 * i) * column_var)
    return spread

def genPhi(train,covarMat,num_basis,mu):
    '''
    Evaluate the Gaussian radial basis functions on every row of train.

    phi[j, i] = exp(-0.5 * (x_j - mu_i)^T * pinv(covarMat) * (x_j - mu_i))

    Parameters
    ----------
    train : DataFrame (or 2-D array) of numeric features, one sample per row.
    covarMat : square spread matrix; its pseudo-inverse is used.
    num_basis : number of basis functions; only the first num_basis rows of
        mu are used.
    mu : 2-D array of basis centres, one centre per row.

    Returns
    -------
    (len(train), num_basis) float ndarray.
    '''
    num_basis = int(num_basis)
    samples = np.asarray(train, dtype=float)
    centres = np.asarray(mu, dtype=float)
    covarMatInv = np.linalg.pinv(covarMat)

    phiMat = np.zeros((len(samples), num_basis))
    for i in range(num_basis):
        # All samples are handled at once for this centre: the row-wise
        # quadratic form d^T * Sigma^-1 * d is the row sum of (d @ Sigma^-1) * d.
        offsets = samples - centres[i]
        quad_form = np.sum(np.dot(offsets, covarMatInv) * offsets, axis=1)
        phiMat[:, i] = np.exp(-0.5 * quad_form)
    return phiMat

def updateWeights(weights,phiMat,train_lab,alpha,lam):
    '''
    Take one gradient step on a single sample and return the new weights.

    The step is -alpha * ( -(train_lab - w^T phi) * phi + lam * w ).

    weights   : (M, 1) np.matrix of current weights.
    phiMat    : 1-D array with the M basis-function values of one sample.
    train_lab : label of that sample (a single value).
    alpha     : learning rate.
    lam       : regularisation coefficient.
    '''
    prediction = np.dot(weights.T, phiMat)
    # The residual is a single value; collapse it to a plain float.
    signed_error = float(-(np.subtract(train_lab, prediction)))
    data_gradient = np.matrix(np.dot(signed_error, phiMat)).T
    regulariser = np.dot(lam, weights)
    step = np.dot(-alpha, data_gradient + regulariser)
    return weights + step

def GetValTest(VAL_PHI,W):
    '''
    Compute the model output for every row of a design matrix.

    VAL_PHI : (n_samples, M) matrix of basis-function values.
    W       : (M, 1) weight matrix.

    Returns W^T * VAL_PHI^T, i.e. a (1, n_samples) row of predictions.
    '''
    # Use the weights that were passed in, not whatever the notebook-level
    # `prev_weight` happens to hold at call time.
    Y = np.dot(np.transpose(W),np.transpose(VAL_PHI))
    return Y

def GetErms(valData,ValDataAct):
    '''
    Score predictions against the actual labels.

    valData    : predictions, indexable by position.
    ValDataAct : actual labels, indexable by position.

    Returns the string "<accuracy>,<erms>": accuracy is the percentage of
    predictions that equal the label after rounding to the nearest integer,
    erms is the root-mean-square error. Callers split the string on ','.
    '''
    n_samples = len(valData)
    squared_error = 0.0
    hits = 0
    for idx in range(n_samples):
        actual = ValDataAct[idx]
        predicted = valData[idx]
        squared_error += math.pow((actual - predicted), 2)
        if int(np.around(predicted, 0)) == actual:
            hits += 1
    accuracy = float(hits * 100) / float(n_samples)
    erms = math.sqrt(squared_error / n_samples)
    return (str(accuracy) + ',' + str(erms))

def plotData(log_erms_train,log_erms_val,log_erms_test):
    '''
    Write one plot per E_RMS history (train, validation, test) via writePlot.

    The plots are named log_erms_train, log_erms_val and log_erms_test.
    Always returns True.
    '''
    histories = (
        ('log_erms_train', log_erms_train),
        ('log_erms_val', log_erms_val),
        ('log_erms_test', log_erms_test),
    )
    for plot_name, history in histories:
        writePlot(plot_name, history)
    return True

def writePlot(filename,log):
    '''
    Plot `log` as a line chart and save it as ./<filename>.png.

    `log` is wrapped in a DataFrame and drawn with DataFrame.plot; axis
    offset notation is switched off. Every open pyplot figure is closed
    afterwards.
    '''
    axes = pd.DataFrame(log).plot(figsize=(10,15))
    axes.ticklabel_format(useOffset=False)

    target_path = './'+filename+'.png'
    plt.savefig(target_path,bbox_inches='tight')
    plt.close("all")


In [None]:
# HumanObserved dataset, both feature settings for the positive and negative pair lists:
# setting one = concatenated features of both images (fa*/fb* columns),
# setting two = absolute per-feature differences (fm* columns).
hum_obs_feature_concat_pos = create_setting_one(hum_obs_master_data,hum_obs_pos_data)
hum_obs_feature_concat_neg = create_setting_one(hum_obs_master_data,hum_obs_neg_data)
hum_obs_feature_subs_pos = create_setting_two(hum_obs_feature_concat_pos)
hum_obs_feature_subs_neg = create_setting_two(hum_obs_feature_concat_neg)

In [28]:
# GSC dataset, both feature settings for the positive and negative pair lists.
# The inputs are the gsc_* frames loaded at the top of the notebook; each
# subtraction table is derived from the matching GSC concatenation table.
gsc_feature_concat_pos = create_setting_one(gsc_master_data,gsc_pos_data)
gsc_feature_concat_neg = create_setting_one(gsc_master_data,gsc_neg_data)
gsc_feature_subs_pos = create_setting_two(gsc_feature_concat_pos)
gsc_feature_subs_neg = create_setting_two(gsc_feature_concat_neg)

In [29]:
# The cells from here on work on raw_data_feature_concat_pos / _neg, which no
# cell of this notebook defines, so a fresh "Run All" would stop here.
# NOTE(review): the HumanObserved concatenation frames are assumed because the
# later slice raw_data.iloc[:,2:21] matches their 9 + 9 feature columns plus
# target - confirm this is the intended dataset.
raw_data_feature_concat_pos = hum_obs_feature_concat_pos.copy()
raw_data_feature_concat_neg = hum_obs_feature_concat_neg.copy()

# Oversample the positive pairs by repeated doubling until there are at least
# as many of them as negative pairs.
if len(raw_data_feature_concat_pos) == 0 and len(raw_data_feature_concat_neg) > 0:
    # Doubling an empty frame never grows it; fail instead of looping forever.
    raise ValueError("cannot oversample: there are no positive pairs")
while(len(raw_data_feature_concat_pos) < len(raw_data_feature_concat_neg)):
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    raw_data_feature_concat_pos = pd.concat([raw_data_feature_concat_pos,raw_data_feature_concat_pos],ignore_index=True)

In [6]:
# Using two times the data to fit all the clusters
#raw_data_feature_concat_pos = pd.concat([raw_data_feature_concat_pos,raw_data_feature_concat_pos,raw_data_feature_concat_pos,raw_data_feature_concat_pos,raw_data_feature_concat_pos],ignore_index=True)
#raw_data_feature_concat_neg = raw_data_feature_concat_neg.sample(n=len(raw_data_feature_concat_pos))

In [30]:
# Unseen Writer partitions

# Split each image id into its 4-digit prefix and the single lowercase letter
# that follows it (presumably writer id and image number - verify).
# The patterns are raw strings so that `\d` reaches the regex engine as written
# instead of being parsed as an (invalid) Python string escape.
raw_data_feature_concat_pos[['A','A_imgNo']] = raw_data_feature_concat_pos['img_id_A'].str.extract(r'(\d\d\d\d)([a-z])', expand=False)
raw_data_feature_concat_pos[['B','B_imgNo']] = raw_data_feature_concat_pos['img_id_B'].str.extract(r'(\d\d\d\d)([a-z])', expand=False)
#raw_data_feature_concat['img_id_A'].str.extract('(?P<writerA>\d\d\d\d)(?P<imageNo>[abcd])', expand=False)
raw_data_feature_concat_neg[['A','A_imgNo']] = raw_data_feature_concat_neg['img_id_A'].str.extract(r'(\d\d\d\d)([a-z])', expand=False)
raw_data_feature_concat_neg[['B','B_imgNo']] = raw_data_feature_concat_neg['img_id_B'].str.extract(r'(\d\d\d\d)([a-z])', expand=False)

In [32]:
# Stack the positive pairs on top of the negative pairs under a fresh 0..n-1
# index, then keep a CSV copy of the combined table.
pair_frames = [raw_data_feature_concat_pos,raw_data_feature_concat_neg]
raw_data = pd.concat(pair_frames,ignore_index=True)
raw_data.to_csv('../feature_concat_data.csv')

In [89]:
# Displays the modelling table.
# NOTE(review): `data` is only assigned in a later cell (data = raw_data.iloc[:,2:21]),
# so on a fresh kernel this cell raises NameError when run in notebook order.
data

Unnamed: 0,fa1,fa2,fa3,fa4,fa5,fa6,fa7,fa8,fa9,fb1,fb2,fb3,fb4,fb5,fb6,fb7,fb8,fb9,target
0,2,1,1,0,2,2,0,2,2,3,2,1,0,2,2,3,0,2,1
1,2,1,1,0,2,2,0,1,2,2,1,0,3,2,2,1,2,2,1
2,2,1,1,0,2,2,0,1,2,1,1,1,1,2,3,0,0,2,1
3,2,1,1,3,2,2,0,2,2,1,1,1,0,2,2,0,2,2,1
4,2,1,1,3,2,2,0,2,2,2,1,1,0,2,2,0,0,2,1
5,1,1,1,0,2,2,0,2,2,2,1,1,0,2,2,0,0,2,1
6,2,1,1,3,2,2,0,1,2,2,1,1,0,2,2,0,3,2,1
7,2,1,1,3,2,2,0,1,2,1,1,1,0,2,2,0,1,2,1
8,2,1,1,0,2,2,0,3,2,1,1,1,0,2,2,0,1,2,1
9,1,1,1,3,2,2,0,2,2,2,1,0,0,2,2,1,2,2,1


In [33]:
# Skip the two image-id columns at positions 0-1 and keep the next 19 columns.
# NOTE(review): these are expected to be fa1..fa9, fb1..fb9 and target - verify
# against raw_data.columns if the feature count changes.
data = raw_data.iloc[:,2:21]
# Number of k-means clusters, reused as the number of radial basis functions.
M = 9
# representativeClustering returns the splits in the order train, test, val.
train,test,val,mu = representativeClustering(data=data,sizeOfTheCluster=M,seed=421)
# Labels are kept as one-column frames; features are everything except 'target'.
train_lab = train.iloc[:,train.columns == 'target']
val_lab = val.iloc[:,val.columns == 'target']
test_lab = test.iloc[:,test.columns == 'target']
train = train.iloc[:,train.columns != 'target']
val = val.iloc[:,val.columns != 'target']
# Take the test features from `test` itself; slicing `val` here would make the
# test split a copy of the validation features.
test = test.iloc[:,test.columns != 'target']
#print(data.head())

In [34]:
# Diagonal spread matrix for the radial basis functions, built from the
# per-feature variances of the training features.
covarMat = covar(train,M)

In [35]:
#print(" Getting the covar over the training data based on number of basics we have implemented")
#covarMat = covar(train,M)
# Design matrices: one row per sample, one column per basis centre in `mu`.
# All three splits share the spread matrix computed from the training features.
phiMat = genPhi(train,covarMat,M,mu)
valMat = genPhi(val,covarMat,M,mu)
testMat = genPhi(test,covarMat,M,mu)

In [None]:
# Stochastic gradient descent: one weight update per training sample, in row order.
train_lab = np.asarray(train_lab)
# E_RMS history per split, one entry per iteration (plotted later by plotData).
log_erms_val = []
log_erms_train = []
log_erms_test = []
# Fixed seed so the random initial weights are reproducible.
np.random.seed(589)
prev_weight = np.matrix(np.random.rand(M,1))
alpha = 0.1    # learning rate
lam = 0.005    # regularisation coefficient

# NOTE(review): every iteration re-scores the full train, validation and test
# design matrices, so the cost of one pass grows with len(train) times the
# combined size of the three splits.
for i in range(0,len(train)):
    print("Iteration: "+str(i))
    # Update the weights using sample i only.
    prev_weight = updateWeights(prev_weight,phiMat[i],train_lab[i],alpha,lam)
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(phiMat,prev_weight) 
    Erms_TR       = GetErms(np.transpose(TR_TEST_OUT),np.asarray(train_lab))
    # GetErms returns "accuracy,erms"; only the erms part is logged.
    log_erms_train.append(float(Erms_TR.split(',')[1]))
    print ('---------TrainingData Accuracy: ' + Erms_TR + '--------------')

    #-----------------ValidationData Accuracy---------------------#
    # VAL_TEST_OUT from the last iteration is reused by the evaluation cells below.
    VAL_TEST_OUT  = GetValTest(valMat,prev_weight) 
    Erms_Val      = GetErms(np.transpose(VAL_TEST_OUT),np.asarray(val_lab))
    log_erms_val.append(float(Erms_Val.split(',')[1]))
    print ('---------ValidationData Accuracy: ' + Erms_Val + '--------------')
    #---------------TestingData Accuracy---------------------#
    # The test score is logged but, unlike the other two, not printed.
    TEST_OUT      = GetValTest(testMat,prev_weight) 
    Erms_Test = GetErms(np.transpose(TEST_OUT),np.asarray(test_lab))
    log_erms_test.append(float(Erms_Test.split(',')[1]))

Iteration: 0
---------TrainingData Accuracy: 58.01951581160383,0.9861234466531353--------------
---------ValidationData Accuracy: 58.01839488839861,0.9861281744805247--------------
Iteration: 1
---------TrainingData Accuracy: 58.01951581160383,0.6789712093198832--------------
---------ValidationData Accuracy: 58.01839488839861,0.6789672977582796--------------
Iteration: 2
---------TrainingData Accuracy: 58.01951581160383,0.6512039280239826--------------
---------ValidationData Accuracy: 58.01839488839861,0.6511990872194819--------------
Iteration: 3
---------TrainingData Accuracy: 58.01951581160383,0.6514652257724596--------------
---------ValidationData Accuracy: 58.01839488839861,0.6514603940202799--------------
Iteration: 4
---------TrainingData Accuracy: 41.98048418839617,0.6731187053154495--------------
---------ValidationData Accuracy: 41.98160511160139,0.673117848007804--------------
Iteration: 5
---------TrainingData Accuracy: 41.98048418839617,0.7495966028275903--------------


---------ValidationData Accuracy: 41.98160511160139,0.6743468980158327--------------
Iteration: 46
---------TrainingData Accuracy: 41.98048418839617,0.7484049093152829--------------
---------ValidationData Accuracy: 41.98160511160139,0.7484064998179257--------------
Iteration: 47
---------TrainingData Accuracy: 58.01951581160383,0.5790330291724257--------------
---------ValidationData Accuracy: 58.01839488839861,0.5790255826587728--------------
Iteration: 48
---------TrainingData Accuracy: 58.01951581160383,0.6398896033621961--------------
---------ValidationData Accuracy: 58.01839488839861,0.6398843788071765--------------
Iteration: 49
---------TrainingData Accuracy: 41.98048418839617,0.6593403576736334--------------
---------ValidationData Accuracy: 41.98160511160139,0.6593386650981499--------------
Iteration: 50
---------TrainingData Accuracy: 58.01951581160383,0.5880441724025195--------------
---------ValidationData Accuracy: 58.01839488839861,0.588037062647896--------------
Iterat

In [None]:
# Save the three E_RMS histories as PNG files in the current directory.
plotData(log_erms_train,log_erms_val,log_erms_test)

In [24]:
# Confusion matrix on the validation split: true labels against the rounded
# predictions from the last training iteration (VAL_TEST_OUT is reshaped from
# a single row into a column first).
confusion_matrix(val_lab.iloc[:,0],np.array(np.round(VAL_TEST_OUT.reshape(np.shape(VAL_TEST_OUT)[1],1))))

array([[   58, 29245],
       [  171, 40328]])

In [27]:
# Same comparison as a labelled table: true validation labels against the
# rounded predictions, with row/column totals. Both series get a fresh
# positional index so that crosstab pairs them row by row.
true_values = val_lab.iloc[:,0].to_numpy()
rounded_predictions = np.asarray(np.around(VAL_TEST_OUT, 0)).ravel()
y_true = pd.Series(true_values)
y_pred = pd.Series(rounded_predictions)

pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,1.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29303,29303
1,40499,40499
All,69802,69802


Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29303,29303
1,40499,40499
All,69802,69802
