In [3]:
import pandas as pd
import numpy as np
from GPyOpt.methods.bayesian_optimization import BayesianOptimization

In [4]:
def duplicate_by_symmetry(x):
    """Augment a pair dataset by adding the left/right-swapped copy of every row.

    Each row is read as ``[left | right | label]`` where ``left`` is the first
    ``(width - 1) // 2`` entries, ``label`` is the last entry and ``right`` is
    everything in between.

    Args:
        x: 2-D array-like of shape ``(n_rows, width)`` with ``width >= 1``.

    Returns:
        Array of shape ``(2 * n_rows, width)`` with the dtype of ``x``. Row
        ``2 * i`` is the original row ``i`` and row ``2 * i + 1`` is the same
        row with the ``left`` and ``right`` parts exchanged (label unchanged).
        An empty 1-D input (e.g. ``[]``) yields an empty 1-D array.

    Raises:
        ValueError: if ``x`` is not 2-D or has zero columns.
    """
    pairs = np.asarray(x)
    if pairs.ndim == 1 and pairs.size == 0:
        # Nothing to duplicate; keep the historical result for an empty list.
        return np.array([])
    if pairs.ndim != 2 or pairs.shape[1] < 1:
        raise ValueError(
            "expected a 2-D array with at least one column, got shape %r"
            % (pairs.shape,)
        )

    n_rows, width = pairs.shape
    half = (width - 1) // 2

    # Swapped variant of every row: right part first, then left, label last.
    mirrored = np.concatenate(
        (pairs[:, half:-1], pairs[:, :half], pairs[:, -1:]), axis=1
    )

    # Interleave originals (even rows) with their mirrored copies (odd rows) so
    # the ordering matches a row-by-row append of (original, swapped).
    result = np.empty((2 * n_rows, width), dtype=pairs.dtype)
    result[0::2] = pairs
    result[1::2] = mirrored
    return result

# Training rows are augmented with their left/right-swapped copies;
# validation rows are used exactly as stored on disk.
train = duplicate_by_symmetry(np.load('dev_train.npy'))
val = np.load('dev_val.npy')
print(train.shape, val.shape, sep='\n')

(4400, 1025)
(1000, 1025)


In [5]:
# Last column is the label; the remaining columns are the two halves of a pair.
num_features = (train.shape[1] - 1) // 2
X_train, y_train = train[:, :-1], train[:, -1]
train_left, train_right = X_train[:, :num_features], X_train[:, -num_features:]
print(train_left.shape, train_right.shape, y_train.shape, sep='\n')

(4400, 512)
(4400, 512)
(4400,)


In [6]:
# Same column layout as the training split: [left | right | label].
num_features = (val.shape[1] - 1) // 2
X_val, y_val = val[:, :-1], val[:, -1]
val_left, val_right = X_val[:, :num_features], X_val[:, -num_features:]
print(val_left.shape, val_right.shape, y_val.shape, sep='\n')

(1000, 512)
(1000, 512)
(1000,)


In [7]:
# Euclidean distance between the left and right halves of every training row.
dif = np.subtract(train_left, train_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
train_dist_array = np.sqrt(sq_dif_sum)

# The observed distance range later bounds the threshold search.
train_min, train_max = train_dist_array.min(), train_dist_array.max()
print(train_min, train_max, sep='\n')
train_dist_array

0.12712637
1.6949717


array([0.2980312 , 0.2980312 , 0.29431146, ..., 1.2177899 , 0.7564921 ,
       0.7564921 ], dtype=float32)

In [8]:
# Euclidean distance between the left and right halves of every validation row.
dif = np.subtract(val_left, val_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
val_dist_array = np.sqrt(sq_dif_sum)
val_dist_array

array([0.27127743, 0.29785645, 0.275458  , 0.29756242, 0.33083576,
       0.19599384, 0.2530967 , 0.28122118, 0.3179703 , 0.45108834,
       0.30435604, 0.2197799 , 0.42024294, 0.19663209, 0.3385584 ,
       0.3025133 , 0.2495001 , 0.2935123 , 0.32590675, 0.33475462,
       0.3526543 , 0.40915653, 0.46102235, 0.29288444, 0.335482  ,
       0.37091333, 0.29409835, 0.21464308, 0.22870524, 0.32065374,
       0.46428397, 0.29303727, 0.39707378, 0.26189998, 0.26538008,
       0.27771318, 0.39594978, 0.5327553 , 0.31007385, 0.2360981 ,
       0.20868775, 0.29890907, 0.28879943, 0.26200548, 0.27689055,
       0.2697491 , 0.43481156, 0.31717637, 0.2419816 , 0.30277506,
       0.28859848, 0.38275102, 0.32588595, 0.3338573 , 0.49094695,
       0.3207308 , 0.22917455, 0.32240698, 0.409604  , 0.48246783,
       0.37750968, 0.23916239, 0.27085808, 0.2745618 , 0.29034647,
       0.26306134, 0.30478254, 0.2939572 , 0.2962383 , 0.24116342,
       0.22539946, 0.19774063, 0.3610624 , 0.2974004 , 0.23917

In [9]:
def obj_func(threshold: float) -> float:
    """Objective to minimise: negated accuracy on the training distances.

    A pair is predicted as label 1 when its distance in ``train_dist_array``
    is strictly below ``threshold``; the result is compared with ``y_train``.
    Delegates to ``evaluate`` so the decision rule is defined in one place.

    Args:
        threshold: distance cut-off. Anything that broadcasts against
            ``train_dist_array`` works.
            NOTE(review): the optimizer presumably passes a small 2-D array
            rather than a Python float — confirm against GPyOpt.

    Returns:
        ``-accuracy`` so that a minimiser maximises accuracy.
    """
    return -evaluate(train_dist_array, y_train, threshold)
def evaluate(dist_array: np.ndarray, true: np.ndarray, threshold: float) -> float:
    """Accuracy of the rule "predict 1 when distance < threshold".

    Args:
        dist_array: distances, one per sample (array-like).
        true: ground-truth labels with the same shape as ``dist_array``.
        threshold: distance cut-off; samples strictly below it are predicted 1,
            all others 0.

    Returns:
        Fraction of samples whose prediction equals the corresponding label.

    Raises:
        ValueError: if ``dist_array`` and ``true`` have different shapes.
    """
    distances = np.asarray(dist_array)
    labels = np.asarray(true)
    # Guard against silent broadcasting: comparing (n,) with (n, 1) would
    # produce an n-by-n matrix and a meaningless "accuracy".
    if distances.shape != labels.shape:
        raise ValueError(
            "dist_array and true must have the same shape, got %r and %r"
            % (distances.shape, labels.shape)
        )
    predicted = (distances < threshold).astype(int)
    return np.mean(predicted == labels)

In [10]:
# Search the threshold over the range of distances observed on the training set.
domain = [{'name': 'threshold', 'type': 'continuous', 'domain': (train_min, train_max)}]
max_iter = 30
# Seed NumPy's global RNG so a fresh Restart & Run All reproduces the same
# optimum. NOTE(review): assumes GPyOpt draws its random initial design (and
# any model restarts) from np.random — confirm.
np.random.seed(0)
BO = BayesianOptimization(f = obj_func, domain = domain)
BO.run_optimization(max_iter=max_iter)

In [11]:
# Read the optimizer's result after the run above; with the single
# 'threshold' variable in `domain` this is presumably a length-1 array
# holding the best threshold found — verify against GPyOpt.
opt_values = BO.x_opt
print(opt_values)

[0.4783314]


In [12]:
# Objective value (negated training accuracy) at the selected threshold.
obj_func(opt_values[0])

-0.9513636363636364

In [13]:
# Training accuracy at the selected threshold.
evaluate(dist_array=train_dist_array, true=y_train, threshold=opt_values[0])

0.9513636363636364

In [14]:
# Held-out validation accuracy at the threshold tuned on the training split.
evaluate(dist_array=val_dist_array, true=y_val, threshold=opt_values[0])

0.897