In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
from matplotlib import pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 500)

In [2]:
'''
Logistic Regression README:
Modes:
1. Human Observed Dataset
2. GSC
Feature Type:
a. Feature Concat
b. Feature Subs
'''
mode = 2
subMode = 'b'

In [3]:
# GSC or HOD??
if(mode == 1):
    hum_obs_master_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv")
    hum_obs_pos_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv")
    hum_obs_neg_data = pd.read_csv("../HumanObserved-Dataset/HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv")
elif(mode ==2):
    gsc_master_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/GSC-Features.csv")
    gsc_pos_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/same_pairs.csv")
    gsc_neg_data = pd.read_csv("../GSC-Dataset/GSC-Dataset/GSC-Features-Data/diffn_pairs.csv")

In [4]:
def create_setting_one(master_data,pos_data):
    raw_data_temp = pd.concat([pos_data.set_index('img_id_A'),master_data.set_index('img_id')],axis=1,join='inner').reset_index()
    raw_data_feature_concat = pd.concat([raw_data_temp.set_index('img_id_B'),master_data.set_index('img_id')],axis=1,join='inner').reset_index()
    if(np.shape(raw_data_feature_concat)[1] < 25):
        raw_data_feature_concat.drop(['Unnamed: 0'],axis=1,inplace=True)
        num_features = 9 + 1
    else:
        num_features = 512+1
    col_rename = ['img_id_B','img_id_A','target']
    for columns in range(1,len(list(raw_data_feature_concat.columns)[3:])+1):
        if(columns < num_features):
            col_rename.append("fa"+str(columns))
        else:
            col_rename.append("fb"+str(columns - num_features+1))
    raw_data_feature_concat.columns = col_rename
    col_rename.append(col_rename.pop(2))
    temp = col_rename[0]
    col_rename[0] = col_rename[1]
    col_rename[1] = temp
    raw_data_feature_concat = raw_data_feature_concat[col_rename]
    return raw_data_feature_concat

def create_setting_two(raw_data_feature_concat):
    raw_data_feature_subs = pd.concat([raw_data_feature_concat.iloc[:,0:2],raw_data_feature_concat.iloc[:,-1]],axis=1,join='inner').reset_index()
    for columns in range(1,int((len(list(raw_data_feature_concat.columns))-3)/2+1)):
        raw_data_feature_subs['fm'+str(columns)] = abs(raw_data_feature_concat['fa'+str(columns)] - raw_data_feature_concat['fb'+str(columns)])
    col_swap = list(raw_data_feature_subs.columns)[1:]
    col_swap.append(col_swap.pop(2))
    raw_data_feature_subs=raw_data_feature_subs[col_swap]
    return raw_data_feature_subs

def stratifiedSampling(data,seed):
    train,test_val = train_test_split(data,test_size = 0.2,stratify=data[["target"]],random_state=seed)
    val,test = train_test_split(test_val,test_size = 0.5,stratify=test_val[["target"]],random_state=seed)
    return train,test,val

def grad_descent(train,target,theta):
    prediction = sigmoid_pred(theta,train)
    error = j_theta(theta,prediction,target)
    right = np.subtract(prediction,target)
    new_theta = np.dot(np.transpose(train),right)
    return new_theta,error

def j_theta(theta,data,target):
    # All are NP arrays
    prediction = sigmoid_pred(theta,train)
    targetTranspose = np.transpose(target)
    left = np.dot(targetTranspose,np.log(prediction))
    temp = (1-target)
    right = np.dot(np.transpose(temp),np.log(1-prediction))
    cost = -np.dot((1/len(train)),np.add(left,right))
    return cost

def sigmoid(x):
    temp = np.exp(-x)
    return 1/(1+temp)

def sigmoid_pred(theta,train):
    thetaTranspose = np.transpose(theta)
    designMat = np.dot(train,theta)
    predict = sigmoid(designMat)
    return predict

In [5]:
if(mode ==1):
    raw_pos_data = create_setting_one(hum_obs_master_data,hum_obs_pos_data)
    raw_neg_data = create_setting_one(hum_obs_master_data,hum_obs_neg_data.sample(len(raw_pos_data),random_state = 444))
    if(subMode == 'b'):
        raw_pos_data = create_setting_two(raw_pos_data)
        raw_neg_data = create_setting_two(raw_neg_data)
        del hum_obs_master_data,hum_obs_pos_data,hum_obs_neg_data
elif(mode == 2):
    # High Memory -> NEED TO FIX
    raw_pos_data = create_setting_one(gsc_master_data,gsc_pos_data)
    raw_neg_data = create_setting_one(gsc_master_data,gsc_neg_data.sample(len(gsc_pos_data)))
    if(subMode == 'b'):
        raw_pos_data = create_setting_two(raw_pos_data)
        raw_neg_data = create_setting_two(raw_neg_data)
        del gsc_master_data,gsc_pos_data,gsc_neg_data

In [6]:
'''
Various Setting Generations
Oversampling = o
Undersampling = u
Perfect = p
'''
sampling = 'p'

In [7]:
'''
Partition Scheme
unseenWriter = true
default = false
'''
partScheme = False
if(partScheme):
    # Unseen Writer partitions
    raw_data_feature_concat_pos[['A','A_imgNo']] = raw_data_feature_concat_pos['img_id_A'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    raw_data_feature_concat_pos[['B','B_imgNo']] = raw_data_feature_concat_pos['img_id_B'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    #raw_data_feature_concat['img_id_A'].str.extract('(?P<writerA>\d\d\d\d)(?P<imageNo>[abcd])', expand=False)
    raw_data_feature_concat_neg[['A','A_imgNo']] = raw_data_feature_concat_neg['img_id_A'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    raw_data_feature_concat_neg[['B','B_imgNo']] = raw_data_feature_concat_neg['img_id_B'].str.extract('(\d\d\d\d)([a-z])', expand=False)
    data = pd.concat([raw_pos_data,raw_neg_data],ignore_index=True)
else:
    data = pd.concat([raw_pos_data,raw_neg_data],ignore_index=True)
data = data.iloc[:,2:np.shape(data)[1]]

In [8]:
data = data.sample(frac=1,random_state=444)
train,test,val = stratifiedSampling(data=data,seed=421)
if(sampling == 'o'):
    # Does not work since above we already sample half
    pos = train[train['target'] == 1]
    neg = train[train['target'] == 0]
    train_pos = pd.concat([pos,pos,pos,pos,pos],ignore_index=True)
    train_neg = neg.sample(n=len(pos))
    train = pd.concat([train_pos,train_neg])
    del pos,neg,train_pos,train_neg
train = train.sample(frac=1,random_state=444).reset_index().iloc[:,1:]
train_lab = train.iloc[:,train.columns == 'target']
val_lab = val.iloc[:,val.columns == 'target']
test_lab = test.iloc[:,test.columns == 'target']
train = train.iloc[:,train.columns != 'target']
val = val.iloc[:,val.columns != 'target']
test = val.iloc[:,test.columns != 'target']
#print(data.head())

In [15]:
def sgd_sigmoid_pred(theta,train):
    thetaTranspose = np.transpose(theta)
    designMat = -np.dot(np.shape(test.values.reshape(1,len(train))),theta)
    temp = np.exp(designMat)
    predict = np.divide(1,(1+temp))
    return predict

In [52]:
def alt_sigmod(x):
    return 1/(1+np.exp(-x))

def cost(theta,X,y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y,np.log(alt_sigmod(X*theta)))
    second = np.multiply((1-y),np.log(1-alt_sigmod(X*theta)))
    return np.sum(first-second) / (len(X))



def gradient(theta,X,y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    
    param = int(theta.ravel().shape[1])
    grad = np.zeros(param)
    
    error = alt_sigmod(X* theta) - y
    
    cost(theta,X,y)
    
    grad = np.dot(np.transpose(X),error)
    
    return grad,cost(theta,X,y)



In [62]:
theta = np.zeros((len(train.columns),1))
alpha = 0.0000016
pError = 10
error = 0
i = 0
while(abs(pError-error) > 0 or error < pError):
    pError = error
    ntheta,error = grad_descent(train,train_lab,theta)
    theta = theta - alpha*np.matrix(ntheta + (0.5* theta))
    if(i % 100 == 0):
        print(error)
        #np.savetxt(sample, theta)
    i +=1

[[0.69314718]]
[[0.55867391]]
[[0.54827992]]


KeyboardInterrupt: 

In [496]:
theta = np.matrix(np.random.rand(len(train.columns),1))

In [28]:
np.shape(np.transpose(theta))

(1, 18)

In [13]:
theta

matrix([[0.00635392],
        [0.00564754],
        [0.00335665],
        [0.00552277],
        [0.00635862],
        [0.00726501],
        [0.00263925],
        [0.00697357],
        [0.00676408],
        [0.00632185],
        [0.00552594],
        [0.00331441],
        [0.0057552 ],
        [0.00544043],
        [0.00725573],
        [0.0037565 ],
        [0.00735264],
        [0.00637087]])

In [302]:
while(True):
    for i in range(0,len(train)):
        theta,error = sgd_grad_descent(train.iloc[i],train_lab.iloc[i],theta,0.1)
    print(error)

KeyboardInterrupt: 

In [300]:
confusion_matrix(val_lab,np.around(sigmoid_pred(theta,val),0))

array([[5546, 1607],
       [2261, 4892]])

In [14]:
confusion_matrix(val_lab,np.around(sigmoid_pred(theta,val),0))

array([[44135, 13090],
       [18200, 39024]])

In [78]:
sigmoid_pred(theta,train)

matrix([[0.54508898],
        [0.54658737],
        [0.5472247 ],
        ...,
        [0.56201036],
        [0.54430116],
        [0.5408493 ]])

In [None]:
00

In [24]:
y_true = pd.Series(np.asarray(train_lab))
y_pred = pd.Series(np.array((np.around(sigmoid_pred(theta,train_lab),0))))

pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Exception: Data must be 1-dimensional

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [299]:
y_true = pd.Series(np.asarray(data.iloc[:,18:]).ravel())
y_pred = pd.Series(np.array((np.around(sigmoid_pred(theta,data.iloc[:,0:18]), 0))).ravel())

pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

ValueError: shapes (143062,18) and (512,1) not aligned: 18 (dim 1) != 512 (dim 0)

In [86]:
sigmoid_pred(theta,data.iloc[:,0:18])

matrix([[0.6525805 ],
        [0.62830898],
        [0.59301233],
        ...,
        [0.65292601],
        [0.62222231],
        [0.6360943 ]])