In [1]:
import pandas as pd
import numpy as np
from GPyOpt.methods.bayesian_optimization import BayesianOptimization

In [2]:
def duplicate_by_symmetry(x):
    """Double a pair dataset by adding a half-swapped copy of every row.

    Each row is read as ``[left half | right half | label]``: the last
    element is the label and the remaining columns are split at
    ``(n_columns - 1) // 2``.  For every input row two rows are produced,
    in this order: the row unchanged, then the row with its two halves
    exchanged (label kept in the last position).

    Parameters
    ----------
    x : array-like of shape (n_rows, n_columns)
        Rows to duplicate.

    Returns
    -------
    numpy.ndarray of shape (2 * n_rows, n_columns)
        Original and swapped rows interleaved (row ``2*i`` is the original
        row ``i``, row ``2*i + 1`` its swapped counterpart).  Empty input is
        returned as an empty copy with its shape preserved.

    Raises
    ------
    ValueError
        If a non-empty input is not two-dimensional.
    """
    pairs = np.asarray(x)
    if pairs.size == 0:
        # Nothing to duplicate; keep the shape so a (0, d) array stays 2-D.
        return pairs.copy()
    if pairs.ndim != 2:
        raise ValueError(
            "duplicate_by_symmetry expects a 2-D array, got shape %r" % (pairs.shape,)
        )

    n_rows, n_cols = pairs.shape
    half = (n_cols - 1) // 2
    # Swapped layout: right half first, then left half, label stays last.
    swapped = np.concatenate(
        (pairs[:, half:-1], pairs[:, :half], pairs[:, -1:]), axis=1
    )

    # Interleave so each original row is immediately followed by its mirror.
    result = np.empty((2 * n_rows, n_cols), dtype=pairs.dtype)
    result[0::2] = pairs
    result[1::2] = swapped
    return result

# Training rows are doubled by swapping the two halves of every row;
# the validation rows are loaded as stored.
train = duplicate_by_symmetry(np.load('dev_train.npy'))
val = np.load('dev_val.npy')
print(train.shape, val.shape, sep='\n')

(4400, 1025)
(1000, 1025)


In [3]:
# Separate the training matrix into the label column (last) and the two
# feature halves that precede it.
X_train, y_train = train[:, :-1], train[:, -1]
num_features = (train.shape[1] - 1) // 2
train_left, train_right = X_train[:, :num_features], X_train[:, -num_features:]
print(train_left.shape, train_right.shape, y_train.shape, sep='\n')

(4400, 512)
(4400, 512)
(4400,)


In [4]:
# Same split for the validation matrix: labels in the last column, the
# remaining columns cut into a left and a right half.
X_val, y_val = val[:, :-1], val[:, -1]
num_features = (val.shape[1] - 1) // 2
val_left, val_right = X_val[:, :num_features], X_val[:, -num_features:]
print(val_left.shape, val_right.shape, y_val.shape, sep='\n')

(1000, 512)
(1000, 512)
(1000,)


In [5]:
# Euclidean distance between the left and right halves of every training row.
dif = np.subtract(train_left, train_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
train_dist_array = np.sqrt(sq_dif_sum)

# Smallest and largest training distance.
train_min, train_max = train_dist_array.min(), train_dist_array.max()
print(train_min, train_max, sep='\n')
train_dist_array

9.844759513904776
41.47293947452709


array([26.94699888, 26.94699888, 34.04742772, ..., 34.24019784,
       35.60991618, 35.60991618])

In [6]:
# Euclidean distance between the left and right halves of every validation row.
dif = np.subtract(val_left, val_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
val_dist_array = np.sqrt(sq_dif_sum)
val_dist_array

array([16.59304683, 18.63296629, 12.92118671, 23.49550194, 24.99152602,
       17.67830326, 10.69989754, 26.97984263, 18.94396146, 27.09610682,
       20.17717325, 18.24813389, 34.99575114, 28.27913985, 32.6643496 ,
       21.14015435, 15.89333792, 17.05454616, 25.11725152, 30.99399412,
       30.67121416, 31.19935409, 27.50366082, 23.94760672, 30.48458878,
       35.15201141, 30.45735425, 14.83325888, 26.48795866, 30.98287949,
       28.02017917, 33.29478859, 28.42437836, 30.54343948, 23.96089054,
       27.72669319, 32.62226501, 32.29063016, 34.87799236, 12.50716133,
       22.04377696, 28.90664683, 30.57082891, 21.39628423, 26.53065689,
       26.83907865, 27.56692145, 20.70103078, 22.51232676, 21.86217418,
       23.19760951, 21.89067066, 16.60830397, 20.81101097, 31.43480092,
       26.00890147, 16.97479355, 22.61501554, 25.18269065, 30.09712782,
       28.29504199, 17.87439051, 28.23123766, 29.55968265, 30.58823944,
       26.44498529, 25.92696637, 22.12707239, 25.40618462, 22.02

In [7]:
def obj_func(threshold: float) -> float:
    """Return the negated training accuracy of a distance-threshold rule.

    A row is assigned class 1 when its entry in ``train_dist_array`` is
    strictly below ``threshold`` and class 0 otherwise; the accuracy is the
    fraction of those assignments equal to ``y_train``.  The sign is flipped
    so that minimising this value maximises accuracy.

    Reads the notebook globals ``train_dist_array`` and ``y_train``.
    """
    predicted = np.less(train_dist_array, threshold).astype(int)
    hit_rate = np.mean(np.equal(predicted, y_train))
    return -hit_rate
def evaluate(dist_array: np.ndarray, true: np.ndarray, threshold: float) -> float:
    """Accuracy of the rule "class 1 if distance < threshold, else class 0".

    Parameters
    ----------
    dist_array : array-like
        One distance per sample.
    true : array-like
        Reference labels, compared element-wise with the 0/1 decisions.
    threshold : float
        Distances strictly below this value are assigned class 1.

    Returns
    -------
    float
        Fraction of decisions that equal the corresponding entry of ``true``.
    """
    # Coerce so plain Python sequences work too (a list cannot be compared
    # with a scalar via ``<``).
    distances = np.asarray(dist_array)
    labels = np.asarray(true)
    decision = (distances < threshold).astype(int)
    return np.mean(decision == labels)

In [8]:
# The optimiser is stochastic (its starting points are sampled at random), so
# fix NumPy's global seed to make the tuned threshold reproducible across
# Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Search the threshold over the range of distances observed in training.
domain = [{'name': 'threshold', 'type': 'continuous', 'domain': (train_min, train_max)}]
max_iter = 30
# obj_func returns the negated accuracy, so minimising it maximises accuracy.
BO = BayesianOptimization(f = obj_func, domain = domain)
BO.run_optimization(max_iter=max_iter)

In [9]:
# Best input found by the optimiser; the search domain has a single variable
# (the threshold).
opt_values = BO.x_opt
print(opt_values)

[28.41869693]


In [10]:
# Objective value at the optimum, i.e. the negated training accuracy.
obj_func(*opt_values)

-0.7122727272727273

In [11]:
# Training accuracy at the tuned threshold.
evaluate(train_dist_array, y_train, threshold=opt_values[0])

0.7122727272727273

In [12]:
# Validation accuracy using the threshold that was tuned on the training distances.
evaluate(val_dist_array, y_val, threshold=opt_values[0])

0.7