In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
from matplotlib import pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 500)

In [6]:
'''
Logistic Regression README:
Modes:
1. Human Observed Dataset
2. GSC
Feature Type:
a. Feature Concat
b. Feature Subs
'''
mode = 1
subMode = 'a'

In [65]:
# GSC or HOD??
if(mode == 1):
    hum_obs_master_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv")
    hum_obs_pos_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv")
    hum_obs_neg_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv")
elif(mode ==2):
    gsc_master_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/GSC-Features.csv")
    gsc_pos_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/same_pairs.csv")
    gsc_neg_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/diffn_pairs.csv")

In [8]:
def create_setting_one(master_data,pos_data):
    raw_data_temp = pd.concat([pos_data.set_index('img_id_A'),master_data.set_index('img_id')],axis=1,join='inner').reset_index()
    raw_data_feature_concat = pd.concat([raw_data_temp.set_index('img_id_B'),master_data.set_index('img_id')],axis=1,join='inner').reset_index()
    if(np.shape(raw_data_feature_concat)[1] < 25):
        raw_data_feature_concat.drop(['Unnamed: 0'],axis=1,inplace=True)
        num_features = 9 + 1
    else:
        num_features = 512+1
    col_rename = ['img_id_B','img_id_A','target']
    for columns in range(1,len(list(raw_data_feature_concat.columns)[3:])+1):
        if(columns < num_features):
            col_rename.append("fa"+str(columns))
        else:
            col_rename.append("fb"+str(columns - num_features+1))
    raw_data_feature_concat.columns = col_rename
    col_rename.append(col_rename.pop(2))
    temp = col_rename[0]
    col_rename[0] = col_rename[1]
    col_rename[1] = temp
    raw_data_feature_concat = raw_data_feature_concat[col_rename]
    return raw_data_feature_concat

def create_setting_two(raw_data_feature_concat):
    raw_data_feature_subs = pd.concat([raw_data_feature_concat.iloc[:,0:2],raw_data_feature_concat.iloc[:,-1]],axis=1,join='inner').reset_index()
    for columns in range(1,int((len(list(raw_data_feature_concat.columns))-3)/2+1)):
        raw_data_feature_subs['fm'+str(columns)] = abs(raw_data_feature_concat['fa'+str(columns)] - raw_data_feature_concat['fb'+str(columns)])
    col_swap = list(raw_data_feature_subs.columns)[1:]
    col_swap.append(col_swap.pop(2))
    raw_data_feature_subs=raw_data_feature_subs[col_swap]
    return raw_data_feature_subs

def stratifiedSampling(data,seed):
    train,test_val = train_test_split(data,test_size = 0.2,stratify=data[["target"]],random_state=seed)
    val,test = train_test_split(test_val,test_size = 0.5,stratify=test_val[["target"]],random_state=seed)
    return train,test,val

def grad_descent(train,target,theta,learningRate):
    prediction = sigmoid_pred(theta,train)
    error = j_theta(theta,prediction,target)
    right = np.subtract(error,target)
    new_theta = np.dot(np.transpose(train),right)
    new_theta = new_theta/len(train)
    new_theta = np.dot(new_theta,learningRate)
    return new_theta,error

def j_theta(theta,prediction,target):
    # All are NP arrays
    targetTranspose = np.transpose(target)
    left = - np.dot(targetTranspose,np.log(prediction))
    temp = (1-target)
    right = np.dot(np.transpose(temp),np.log(1-prediction))
    cost = np.dot((1/len(train)),np.subtract(left,right))
    return cost

def sigmoid_pred(theta,train):
    thetaTranspose = np.transpose(theta)
    designMat = -np.dot(train,theta)
    temp = np.exp(designMat)
    predict = np.divide(1,(1+temp))
    return predict

In [66]:
raw_pos_data = create_setting_one(hum_obs_master_data,hum_obs_pos_data)
raw_neg_data = create_setting_one(hum_obs_master_data,hum_obs_neg_data)

In [9]:
if(mode ==1):
    raw_pos_data = create_setting_one(hum_obs_master_data,hum_obs_pos_data)
    raw_neg_data = create_setting_one(hum_obs_master_data,hum_obs_neg_data.sample(len(raw_pos_data),random_state = 444))
    if(subMode == 'b'):
        raw_pos_data = create_setting_two(raw_pos_data)
        raw_neg_data = create_setting_two(raw_neg_data)
        del hum_obs_master_data,hum_obs_pos_data,hum_obs_neg_data
elif(mode == 2):
    # High Memory -> NEED TO FIX
    raw_pos_data = create_setting_one(gsc_master_data,gsc_pos_data)
    raw_neg_data = create_setting_one(gsc_master_data,gsc_neg_data.sample(len(gsc_pos_data)))
    if(subMode == 'b'):
        raw_pos_data = create_setting_two(raw_pos_data)
        raw_neg_data = create_setting_two(raw_neg_data)
        del gsc_master_data,gsc_pos_data,gsc_neg_data

In [41]:
'''
Various Setting Generations
Oversampling = o
Undersampling = u
Perfect = p
'''
sampling = 'p'

In [67]:
'''
Partition Scheme
unseenWriter = true
default = false
'''
partScheme = False
if(partScheme):
    # Unseen Writer partitions
    raw_data_feature_concat_pos[['A','A_imgNo']] = raw_data_feature_concat_pos['img_id_A'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    raw_data_feature_concat_pos[['B','B_imgNo']] = raw_data_feature_concat_pos['img_id_B'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    #raw_data_feature_concat['img_id_A'].str.extract('(?P<writerA>\d\d\d\d)(?P<imageNo>[abcd])', expand=False)
    raw_data_feature_concat_neg[['A','A_imgNo']] = raw_data_feature_concat_neg['img_id_A'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    raw_data_feature_concat_neg[['B','B_imgNo']] = raw_data_feature_concat_neg['img_id_B'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    data = pd.concat([raw_pos_data,raw_neg_data],ignore_index=True)
else:
    data = pd.concat([raw_pos_data,raw_neg_data],ignore_index=True)
data = data.iloc[:,2:np.shape(data)[1]]

In [68]:
if(mode == 2):
    if(subMode == 'a'):
        M = 9
else:
    M = 9
data = data.sample(frac=1)
train,test,val = stratifiedSampling(data=data,seed=421)
if(sampling == 'o'):
    pos = train[train['target'] == 1]
    neg = train[train['target'] == 0]
    train_pos = pd.concat([pos,pos,pos,pos,pos],ignore_index=True)
    train_neg = neg.sample(n=len(pos))
    train = pd.concat([train_pos,train_neg])
    del pos,neg,train_pos,train_neg
train = train.sample(frac=1,random_state=444).reset_index().iloc[:,1:]
train_lab = train.iloc[:,train.columns == 'target']
val_lab = val.iloc[:,val.columns == 'target']
test_lab = test.iloc[:,test.columns == 'target']
train = train.iloc[:,train.columns != 'target']
val = val.iloc[:,val.columns != 'target']
test = val.iloc[:,test.columns != 'target']
#print(data.head())

In [39]:
second = np.multiply((1-target),np.log(1-sigmoid_pred(train)))

In [None]:
def alt_j_theta(theta,prediction,target):
    first = np.multiply(-target,np.log(sigmoid_pred(theta,train)))

In [55]:
np.shape(sigmoid_pred(theta,val))

(158, 1)

In [15]:
def sgd_sigmoid_pred(theta,train):
    thetaTranspose = np.transpose(theta)
    designMat = -np.dot(np.shape(test.values.reshape(1,len(train))),theta)
    temp = np.exp(designMat)
    predict = np.divide(1,(1+temp))
    return predict

In [35]:
sigmoid_pred(theta,train.iloc[1])

matrix([[0.44194041]])

In [19]:
def sgd_grad_descent(train,target,theta,learningRate):
    prediction = sigmoid_pred(theta,train)
    error = j_theta(theta,prediction,target)
    right = np.subtract(error,np.matrix(target))
    new_theta = np.dot(train.values.reshape((len(train),1)),right)
    new_theta = new_theta/len(train)
    new_theta = np.dot(new_theta,learningRate)
    return new_theta,error

In [487]:
prediction = sigmoid_pred(theta,train.iloc[1])

In [20]:
theta = np.matrix(np.random.rand(len(train.columns),1))

In [73]:
theta = np.matrix(np.random.rand(len(train.columns),1))
pError = 10
error = 0
while(True):
    pError = error
    theta,error = grad_descent(data.iloc[:,0:18],data.iloc[:,18:],theta,0.01)
    print(error)

[[19.40601837]]
[[10.56009285]]
[[5.76241939]]
[[3.24061904]]
[[2.04059425]]
[[1.54251797]]
[[1.35563517]]
[[1.28891774]]
[[1.26556776]]
[[1.25745464]]
[[1.25464286]]
[[1.25366925]]
[[1.25333223]]
[[1.25321558]]
[[1.2531752]]
[[1.25316123]]
[[1.25315639]]
[[1.25315472]]
[[1.25315414]]
[[1.25315394]]
[[1.25315387]]
[[1.25315385]]
[[1.25315384]]
[[1.25315384]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]
[[1.25315383]]


KeyboardInterrupt: 

In [496]:
theta = np.matrix(np.random.rand(len(train.columns),1))

In [45]:
theta

matrix([[0.0032781 ],
        [0.00293198],
        [0.00172877],
        [0.00302836],
        [0.00337208],
        [0.00385566],
        [0.0015884 ],
        [0.00364616],
        [0.00352538],
        [0.00351438],
        [0.00292716],
        [0.00170043],
        [0.00313336],
        [0.00268541],
        [0.00377859],
        [0.002218  ],
        [0.00400762],
        [0.00330314]])

In [64]:
theta

matrix([[0.00635392],
        [0.00564754],
        [0.00335665],
        [0.00552277],
        [0.00635862],
        [0.00726501],
        [0.00263925],
        [0.00697357],
        [0.00676408],
        [0.00632185],
        [0.00552594],
        [0.00331441],
        [0.0057552 ],
        [0.00544043],
        [0.00725573],
        [0.0037565 ],
        [0.00735264],
        [0.00637087]])

NameError: name 'i' is not defined

In [72]:
while(True):
    for i in range(0,len(train)):
        theta,error = sgd_grad_descent(train.iloc[i],train_lab.iloc[i],theta,0.1)
    print(error)

[[2.94883659e-06]]


KeyboardInterrupt: 

In [63]:
theta

matrix([[0.00635392],
        [0.00564754],
        [0.00335665],
        [0.00552277],
        [0.00635862],
        [0.00726501],
        [0.00263925],
        [0.00697357],
        [0.00676408],
        [0.00632185],
        [0.00552594],
        [0.00331441],
        [0.0057552 ],
        [0.00544043],
        [0.00725573],
        [0.0037565 ],
        [0.00735264],
        [0.00637087]])

In [52]:
sigmoid_pred()

Unnamed: 0,fa1,fa2,fa3,fa4,fa5,fa6,fa7,fa8,fa9,fb1,fb2,fb3,fb4,fb5,fb6,fb7,fb8,fb9
1023,2,1,0,2,2,2,0,2,2,0,1,1,0,0,2,1,1,1
269,3,1,1,0,2,1,1,1,2,2,2,1,3,2,2,1,2,2
544,1,1,1,3,2,3,1,2,1,0,2,1,0,2,1,1,0,2
1110,1,1,1,1,2,3,0,4,2,2,1,0,2,2,1,1,1,2
74,3,1,1,0,2,2,1,0,2,2,4,1,0,2,2,0,2,2
595,3,4,1,3,0,2,0,3,2,3,0,1,3,2,2,3,1,2
1214,3,1,0,2,2,3,0,2,2,3,1,1,3,0,1,3,4,2
537,0,1,1,1,2,1,1,2,2,1,2,0,2,2,2,0,2,2
286,3,0,1,3,2,2,0,3,2,3,4,1,0,2,2,1,0,2
1071,2,1,1,3,2,1,1,3,2,2,0,1,0,2,1,1,3,2


In [76]:
y_true = pd.Series(np.array(data.iloc[:,18:]))
y_pred = pd.Series(np.array((np.around(sigmoid_pred(theta,), 0))).ravel())

pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Exception: Data must be 1-dimensional