In [1]:
import pandas as pd
import numpy as np
from GPyOpt.methods.bayesian_optimization import BayesianOptimization

In [2]:
def duplicate_by_symmetry(x):
    """Augment pair data by adding a left/right-swapped copy of every row.

    Each row is read as ``[left | right | label]``: ``left`` is the first
    ``(width - 1) // 2`` columns, ``label`` is the last column and ``right``
    is everything in between.

    Parameters
    ----------
    x : array-like, shape (n_rows, width)
        Rows laid out as described above.

    Returns
    -------
    numpy.ndarray, shape (2 * n_rows, width)
        Row ``2 * i`` is input row ``i`` unchanged and row ``2 * i + 1`` is
        ``[right | left | label]``, i.e. each original row is immediately
        followed by its mirrored twin. An input without rows is returned as
        an empty copy, so a ``(0, width)`` input keeps its width.

    Raises
    ------
    ValueError
        If ``x`` has rows but is not 2-D, or its rows have no columns.
    """
    pairs = np.asarray(x)
    if len(pairs) == 0:
        # Nothing to mirror; copy so the caller never aliases the input.
        return pairs.copy()
    if pairs.ndim != 2 or pairs.shape[1] == 0:
        raise ValueError(
            "expected a 2-D array whose rows are laid out as [left | right | label]"
        )

    # The split point is the same for every row, so compute it once.
    num_of_features = (pairs.shape[1] - 1) // 2
    swapped = np.concatenate(
        (
            pairs[:, num_of_features:-1],  # right half moves to the front
            pairs[:, :num_of_features],    # left half moves behind it
            pairs[:, -1:],                 # label stays in the last column
        ),
        axis=1,
    )

    # Interleave originals (even rows) with their mirrored copies (odd rows).
    result = np.empty((2 * len(pairs), pairs.shape[1]), dtype=pairs.dtype)
    result[0::2] = pairs
    result[1::2] = swapped
    return result

# Only the training rows are mirror-augmented; validation rows are used as stored.
train = duplicate_by_symmetry(np.load('dev_train.npy'))
val = np.load('dev_val.npy')
print(train.shape, val.shape, sep='\n')

(4400, 1025)
(1000, 1025)


In [3]:
# Carve every training row into its left half, right half and trailing label column.
num_features = (train.shape[1] - 1) // 2
X_train, y_train = train[:, :-1], train[:, -1]
train_left, train_right = X_train[:, :num_features], X_train[:, -num_features:]
print(train_left.shape, train_right.shape, y_train.shape, sep='\n')

(4400, 512)
(4400, 512)
(4400,)


In [4]:
# Carve every validation row into its left half, right half and trailing label column.
num_features = (val.shape[1] - 1) // 2
X_val, y_val = val[:, :-1], val[:, -1]
val_left, val_right = X_val[:, :num_features], X_val[:, -num_features:]
print(val_left.shape, val_right.shape, y_val.shape, sep='\n')

(1000, 512)
(1000, 512)
(1000,)


In [5]:
# Euclidean distance between the two halves of every training pair,
# plus the observed range of those distances.
dif = np.subtract(train_left, train_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
train_dist_array = np.sqrt(sq_dif_sum)
train_min, train_max = train_dist_array.min(), train_dist_array.max()
print(train_min, train_max, sep='\n')
train_dist_array

18.494476926073368
153.05186867013396


array([ 90.08119256,  90.08119256, 107.26956283, ..., 105.05472173,
       116.64199412, 116.64199412])

In [6]:
# Euclidean distance between the two halves of every validation pair.
dif = np.subtract(val_left, val_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
val_dist_array = np.sqrt(sq_dif_sum)
val_dist_array

array([ 71.71583614,  65.58225262,  67.57523042,  69.66779834,
        86.9650747 ,  75.87647397,  29.36378789,  64.46181792,
        32.29717397,  86.20594089,  46.87712347,  87.88107695,
       109.86244122,  64.41301111,  89.05109275,  85.5560427 ,
        62.76604106,  56.90238737, 100.92877219, 102.33083528,
       104.38804895, 114.05812372,  63.4234495 ,  53.4183103 ,
        63.22438221, 106.94084457,  96.72377975,  48.56441394,
        75.71141661,  83.06833836, 101.26798149,  90.08772747,
        82.49640306,  64.81580906,  83.55616409,  98.2661358 ,
        97.18907234, 100.75273068,  91.60002189,  41.60928843,
        67.03573735, 110.93590343,  73.7137322 ,  33.16041282,
        27.78569381,  83.40270956, 105.0675216 ,  56.29800553,
        92.29608552, 108.79267389,  77.26308273,  63.31505929,
        54.00858181,  53.26817354, 119.9714672 , 116.08821386,
        54.42073036,  69.8973353 ,  90.9792047 ,  93.38266889,
        96.88304478,  65.81182369,  66.35463749, 103.97

In [7]:
def obj_func(threshold: float) -> float:
    """Objective to minimise: negated training accuracy at ``threshold``.

    A pair is predicted as class 1 when its distance in ``train_dist_array``
    is strictly below the threshold; accuracy is measured against ``y_train``
    via ``evaluate`` so that optimisation and reporting share one rule.

    Parameters
    ----------
    threshold : float or array-like of float
        A single threshold, or several candidate thresholds in an array of
        any shape (e.g. the 2-D ``(n_points, 1)`` layout an optimiser may
        pass — NOTE(review): confirm against the GPyOpt calling convention).

    Returns
    -------
    float or numpy.ndarray
        For a scalar threshold, the negated accuracy as a scalar. For an
        array, a column of shape ``(n_points, 1)`` holding one negated
        accuracy per threshold, instead of a single value averaged across
        all thresholds by accidental broadcasting.
    """
    thresholds = np.asarray(threshold, dtype=float)
    if thresholds.ndim == 0:
        return -evaluate(train_dist_array, y_train, float(thresholds))
    # Score each candidate separately so a batch never collapses into one number.
    scores = [-evaluate(train_dist_array, y_train, t) for t in thresholds.ravel()]
    return np.array(scores).reshape(-1, 1)
def evaluate(dist_array: np.ndarray, true: np.ndarray, threshold: float) -> float:
    """Accuracy of the rule "predict 1 when distance < threshold".

    Parameters
    ----------
    dist_array : numpy.ndarray
        One distance per sample.
    true : numpy.ndarray
        Ground-truth labels (0 or 1), same shape as ``dist_array``.
    threshold : float
        Distances strictly below this value are predicted as 1, all others as 0.

    Returns
    -------
    float
        Fraction of samples whose prediction equals the label
        (``nan`` when there are no samples).

    Raises
    ------
    ValueError
        If ``dist_array`` and ``true`` differ in shape. Without this check
        a ``(n,)`` vs ``(n, 1)`` pair would broadcast to ``(n, n)`` and
        yield a plausible-looking but wrong accuracy.
    """
    dist_array = np.asarray(dist_array)
    true = np.asarray(true)
    if dist_array.shape != true.shape:
        raise ValueError(
            f"dist_array has shape {dist_array.shape} but true has shape {true.shape}"
        )
    decision = (dist_array < threshold).astype(int)
    return np.mean(decision == true)

In [8]:
# Seed NumPy's global RNG so Restart & Run All reproduces the same optimum.
# NOTE(review): assumes GPyOpt's random initial design and model restarts draw
# from np.random's global state — confirm against the installed GPyOpt version.
np.random.seed(0)

# Search for the threshold between the smallest and largest training distance.
domain = [{'name': 'threshold', 'type': 'continuous', 'domain': (train_min, train_max)}]
max_iter = 30
BO = BayesianOptimization(f=obj_func, domain=domain)
BO.run_optimization(max_iter=max_iter)

In [9]:
# Best point found by the optimiser; the domain has a single variable,
# so this holds one entry: the threshold.
opt_values = BO.x_opt
print(opt_values)

[89.29002616]


In [10]:
# Objective value (negated training accuracy) at the single optimised threshold.
obj_func(opt_values[0])

-0.7659090909090909

In [11]:
# Training accuracy at the optimised threshold.
evaluate(dist_array=train_dist_array, true=y_train, threshold=opt_values[0])

0.7659090909090909

In [12]:
# Validation accuracy at the threshold tuned on the training pairs.
evaluate(dist_array=val_dist_array, true=y_val, threshold=opt_values[0])

0.766