In [1]:
import pandas as pd
import numpy as np
from GPyOpt.methods.bayesian_optimization import BayesianOptimization

In [2]:
def duplicate_by_symmetry(x):
    """Augment pair samples by adding a left/right-swapped copy of every row.

    Each row is read as ``[left features | right features | label]``. For
    every input row the output holds the row itself, immediately followed by
    a copy in which the two feature halves are exchanged and the label is
    kept, so the result has twice as many rows as the input.

    Parameters
    ----------
    x : array-like, shape (n_rows, width)
        Rows to augment. The last column is the label; the remaining
        ``width - 1`` columns are split after the first ``(width - 1) // 2``.

    Returns
    -------
    numpy.ndarray, shape (2 * n_rows, width)
        Original rows at even indices, swapped rows at odd indices. An input
        without any rows is returned as an empty copy with its shape kept.

    Raises
    ------
    ValueError
        If ``x`` is non-empty and not two-dimensional, or has no columns.
    """
    rows = np.asarray(x)
    if rows.size == 0 and (rows.ndim != 2 or rows.shape[0] == 0):
        # Nothing to duplicate; keep the incoming shape so callers can still
        # inspect the column count of an empty 2-D input.
        return rows.copy()
    if rows.ndim != 2:
        raise ValueError("x must be 2-D: one row per sample, label in the last column")
    n_rows, width = rows.shape
    if width < 1:
        raise ValueError("each row needs at least a label column")

    half = (width - 1) // 2
    # Swap the two feature halves column-wise for all rows at once.
    mirrored = np.concatenate(
        (rows[:, half:-1], rows[:, :half], rows[:, -1:]), axis=1
    )

    # Interleave so every original row is directly followed by its mirror.
    result = np.empty((2 * n_rows, width), dtype=rows.dtype)
    result[0::2] = rows
    result[1::2] = mirrored
    return result

# Only the training pairs are augmented with their mirrored copies; the
# validation pairs are used exactly as stored on disk.
val = np.load('dev_val.npy')
train = duplicate_by_symmetry(np.load('dev_train.npy'))
print(train.shape, val.shape, sep='\n')

(4400, 513)
(1000, 513)


In [3]:
# Split the training matrix into features and labels (label is the last
# column), then cut the feature columns into the left and right halves.
X_train, y_train = train[:, :-1], train[:, -1]
num_features = (train.shape[1] - 1) // 2
train_left, train_right = X_train[:, :num_features], X_train[:, -num_features:]
print(train_left.shape, train_right.shape, y_train.shape, sep='\n')

(4400, 256)
(4400, 256)
(4400,)


In [4]:
# Same feature/label split as for the training data, applied to the
# validation matrix (label is the last column).
X_val, y_val = val[:, :-1], val[:, -1]
num_features = (val.shape[1] - 1) // 2
val_left, val_right = X_val[:, :num_features], X_val[:, -num_features:]
print(val_left.shape, val_right.shape, y_val.shape, sep='\n')

(1000, 256)
(1000, 256)
(1000,)


In [5]:
# Euclidean (L2) distance between the left and right halves of every training pair.
dif = train_left - train_right
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
train_dist_array = np.sqrt(sq_dif_sum)

# Smallest and largest training distance.
train_min, train_max = train_dist_array.min(), train_dist_array.max()
print(train_min, train_max, sep='\n')
train_dist_array

8.20207725949634
51.7544559261606


array([32.60706759, 32.60706759, 39.76827864, ..., 37.94584422,
       39.06708172, 39.06708172])

In [6]:
# Euclidean (L2) distance between the left and right halves of every validation pair.
dif = val_left - val_right
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
val_dist_array = np.sqrt(sq_dif_sum)
val_dist_array

array([18.567093  , 18.41183108, 19.35985157, 29.89635575, 27.41211936,
       27.16622446,  7.88627248, 33.99786398, 14.61180015, 36.88962038,
       23.53315793, 21.31049547, 40.23653819, 26.28351316, 40.83493646,
       23.58921477, 23.58403901, 19.62321935, 29.79691377, 32.82977711,
       35.33701976, 42.45228784, 34.60685492, 21.64650195, 34.64559145,
       38.50872431, 35.29472005, 17.64935001, 28.83074656, 29.60470849,
       31.68982184, 37.00643305, 31.02569662, 31.69404581, 34.81951405,
       38.28097602, 38.84024626, 38.27613726, 43.00647768, 12.06719096,
       24.0601185 , 41.63813318, 29.74805168, 22.07507007, 26.25036859,
       24.78674939, 38.77638161, 24.06967424, 27.63265876, 26.0619489 ,
       31.12807447, 30.26621586, 22.29847232, 28.22402542, 33.156346  ,
       33.33786202, 16.98111692, 21.60266374, 32.5547819 , 36.04539497,
       34.71505205, 17.24367869, 29.2374757 , 42.84233555, 40.96801975,
       30.29106497, 37.04865415, 22.26419676, 35.99605293, 32.23

In [7]:
def obj_func(threshold: "float | np.ndarray") -> "float | np.ndarray":
    """Negated training accuracy of the distance-threshold classifier.

    A training pair is predicted as ``1`` when its distance in
    ``train_dist_array`` is strictly below the threshold and as ``0``
    otherwise; predictions are compared with ``y_train``. The accuracy is
    negated so that a minimiser finds the accuracy-maximising threshold.

    Parameters
    ----------
    threshold : float or numpy.ndarray
        Either a single threshold, or an array of candidate thresholds.
        GPyOpt documents that the objective receives a 2-D array with one
        candidate per row, so array input is scored per candidate instead
        of being averaged over all candidates at once.

    Returns
    -------
    float or numpy.ndarray
        For a scalar threshold, the negated accuracy as a scalar. For array
        input, an ``(n_candidates, 1)`` column with one negated accuracy per
        candidate.
    """
    thresholds = np.asarray(threshold, dtype=float)

    if thresholds.ndim == 0:
        # Plain scalar call (e.g. re-scoring the optimum by hand).
        decision = (train_dist_array < thresholds).astype(int)
        return -np.mean(decision == y_train)

    # One row per candidate threshold, one column per training pair.
    candidates = thresholds.reshape(-1, 1)
    decisions = (train_dist_array[np.newaxis, :] < candidates).astype(int)
    accuracy = np.mean(decisions == y_train[np.newaxis, :], axis=1, keepdims=True)
    return -accuracy
def evaluate(dist_array: np.ndarray, true: np.ndarray, threshold: float) -> float:
    """Return the accuracy of thresholding distances against true labels.

    A sample is predicted as ``1`` when its distance is strictly below
    ``threshold`` and as ``0`` otherwise; the score is the fraction of
    predictions that equal the corresponding entry of ``true``.

    Parameters
    ----------
    dist_array : array-like
        Distance of every sample pair.
    true : array-like
        Ground-truth labels, same shape as ``dist_array``.
    threshold : float
        Decision boundary applied to the distances.

    Returns
    -------
    float
        Mean of the element-wise agreement between predictions and labels.

    Raises
    ------
    ValueError
        If ``dist_array`` and ``true`` do not have the same shape. Without
        this check, broadcast-compatible shapes such as ``(n,)`` and
        ``(n, 1)`` would be compared pairwise and yield a meaningless score.
    """
    distances = np.asarray(dist_array)
    labels = np.asarray(true)
    if distances.shape != labels.shape:
        raise ValueError(
            f"shape mismatch: dist_array {distances.shape} vs true {labels.shape}"
        )
    decision = (distances < threshold).astype(int)
    return np.mean(decision == labels)

In [8]:
# The optimisation is stochastic, so fix NumPy's global RNG to make a fresh
# "Restart & Run All" repeatable.
# NOTE(review): assumes GPyOpt draws its random initial design from NumPy's
# global RNG - confirm for the installed version. Outputs recorded before the
# seed was introduced may differ from a re-run.
SEED = 0
np.random.seed(SEED)

# Search the threshold over the range of distances observed in training.
domain = [{'name': 'threshold', 'type': 'continuous', 'domain': (train_min, train_max)}]
max_iter = 30

# obj_func returns the negated accuracy, so minimising it maximises accuracy.
BO = BayesianOptimization(f=obj_func, domain=domain)
BO.run_optimization(max_iter=max_iter)

In [9]:
# Read the optimiser's `x_opt` attribute - presumably the best threshold found
# (verify against the GPyOpt docs). The printed value is a one-element array.
opt_values = BO.x_opt
print(opt_values)

[32.28264086]


In [10]:
# Objective value (negated training accuracy) at the selected threshold.
obj_func(opt_values[0])

-0.7204545454545455

In [11]:
# Training accuracy at the selected threshold.
evaluate(dist_array=train_dist_array, true=y_train, threshold=opt_values[0])

0.7204545454545455

In [12]:
# Validation accuracy at the threshold that was tuned on the training pairs.
evaluate(dist_array=val_dist_array, true=y_val, threshold=opt_values[0])

0.722