In [1]:
import pandas as pd
import numpy as np
from GPyOpt.methods.bayesian_optimization import BayesianOptimization

In [2]:
# Read both comma-separated splits into float arrays, training file first.
train, val = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ('dev_train.csv', 'dev_val.csv')
)
# Report (rows, columns) of each split.
for split in (train, val):
    print(split.shape)

(2200, 513)
(1000, 513)


In [3]:
# The last column is taken as the label; the remaining columns are split into
# two equal halves (first half -> "left", second half -> "right").
# An odd number of feature columns would silently drop the middle column, so
# fail loudly instead of producing misaligned halves.
assert (train.shape[1] - 1) % 2 == 0, "expected an even number of feature columns"
num_features = (train.shape[1] - 1) // 2
y_train = train[:, -1]
X_train = train[:, :-1]
train_left = X_train[:, :num_features]
train_right = X_train[:, num_features:]
assert train_left.shape == train_right.shape
print(train_left.shape)
print(train_right.shape)
print(y_train.shape)

(2200, 256)
(2200, 256)
(2200,)


In [4]:
# Same layout as the training split: the last column is taken as the label and
# the remaining columns are split into two equal halves ("left" / "right").
# An odd number of feature columns would silently drop the middle column, so
# fail loudly instead of producing misaligned halves.
assert (val.shape[1] - 1) % 2 == 0, "expected an even number of feature columns"
num_features = (val.shape[1] - 1) // 2
y_val = val[:, -1]
X_val = val[:, :-1]
val_left = X_val[:, :num_features]
val_right = X_val[:, num_features:]
assert val_left.shape == val_right.shape
print(val_left.shape)
print(val_right.shape)
print(y_val.shape)

(1000, 256)
(1000, 256)
(1000,)


In [5]:
# Euclidean distance between the left and right feature vectors of every
# training row, built step by step: difference -> squares -> row sums -> root.
dif = np.subtract(train_left, train_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
train_dist_array = np.sqrt(sq_dif_sum)

# Smallest and largest observed training distance (printed one per line).
train_min, train_max = train_dist_array.min(), train_dist_array.max()
print(train_min, train_max, sep='\n')
train_dist_array

8.19149381090882
51.58636544945832


array([32.47018069, 39.78648363, 24.85542514, ..., 26.03851128,
       38.06233713, 39.1363846 ])

In [6]:
# Euclidean distance between the left and right feature vectors of every
# validation row: difference -> squares -> row sums -> root.
dif = np.subtract(val_left, val_right)
sq_dif = np.square(dif)
sq_dif_sum = sq_dif.sum(axis=1)
val_dist_array = np.sqrt(sq_dif_sum)
val_dist_array

array([18.48512654, 18.16245466, 19.43100783, 29.81722336, 27.34061477,
       27.17830282,  7.94485275, 32.91763112, 14.66311638, 36.81665081,
       23.67859318, 21.08047659, 40.2635266 , 26.57685752, 40.88894785,
       23.60097078, 23.82116595, 19.39419183, 29.74507852, 32.64209453,
       35.18590473, 42.49349801, 34.44995789, 22.01295298, 34.8011679 ,
       38.21048507, 35.30228814, 17.56882924, 28.53273674, 29.07237674,
       31.77827063, 36.76962292, 30.94927595, 31.24955142, 34.9322073 ,
       38.27672821, 38.87050358, 38.27276305, 42.94674814, 12.08396449,
       24.49047019, 41.88462415, 30.09102777, 22.25148176, 26.71531326,
       24.95989915, 38.74372519, 24.0607745 , 27.43431915, 25.76508595,
       31.12243745, 30.20157523, 21.9821999 , 28.11604879, 33.22277286,
       33.66645922, 17.13441683, 21.55971005, 32.56657795, 35.92168081,
       34.72377175, 17.18254648, 29.22724875, 42.93538365, 40.79722922,
       30.04302027, 36.86535315, 21.99383519, 36.00360432, 32.17

In [7]:
def obj_func(threshold: "float | np.ndarray") -> "float | np.ndarray":
    """Negative training accuracy of the rule ``distance < threshold -> 1``.

    Reads the module-level ``train_dist_array`` and ``y_train``. The sign is
    flipped so that a minimizer effectively maximizes accuracy.

    Parameters
    ----------
    threshold
        Either a scalar threshold, or an array of thresholds with one
        candidate per row (the ``(n, 1)`` layout an optimizer passes when it
        evaluates several points at once).

    Returns
    -------
    For a scalar threshold: a scalar negative accuracy.
    For array input: an ``(n, 1)`` array with one negative accuracy per
    candidate, instead of a single value averaged over all candidates.
    """
    thresholds = np.asarray(threshold, dtype=float)

    if thresholds.ndim == 0:
        # Scalar path: identical to a plain comparison against one number.
        decision = (train_dist_array < thresholds).astype(int)
        return -np.mean(decision == y_train)

    # Batched path: one row per candidate so broadcasting against the 1-D
    # distance array yields an (n_candidates, n_samples) decision matrix.
    thresholds = thresholds.reshape(-1, 1)
    decision = (train_dist_array < thresholds).astype(int)
    accuracy = np.mean(decision == y_train, axis=1, keepdims=True)
    return -accuracy
def evaluate(dist_array: np.ndarray, true: np.ndarray, threshold: float) -> float:
    """Accuracy of the rule ``distance < threshold -> 1`` against given labels.

    Parameters
    ----------
    dist_array
        Distances, one per sample.
    true
        Labels to compare against; must have the same shape as ``dist_array``.
    threshold
        Samples with a distance strictly below this value are predicted as 1,
        all others as 0.

    Returns
    -------
    Fraction of samples whose prediction equals the label.

    Raises
    ------
    ValueError
        If ``dist_array`` and ``true`` differ in shape. Without this check a
        ``(N,)`` vs ``(N, 1)`` pair would broadcast to ``(N, N)`` and return a
        meaningless number instead of failing.
    """
    dist_array = np.asarray(dist_array)
    true = np.asarray(true)
    if dist_array.shape != true.shape:
        raise ValueError(
            "dist_array and true must have the same shape, got "
            "{} and {}".format(dist_array.shape, true.shape)
        )
    decision = (dist_array < threshold).astype(int)
    return np.mean(decision == true)

In [8]:
# Seed NumPy's global RNG before building the optimizer so the run is
# repeatable under Restart & Run All.
# NOTE(review): this assumes GPyOpt draws its random initial design from
# NumPy's global RNG — confirm for the installed version.
np.random.seed(0)

# Search a single continuous threshold over the range of observed training distances.
domain = [{'name': 'threshold', 'type': 'continuous', 'domain': (train_min, train_max)}]
max_iter = 30
# obj_func returns negative accuracy, so this relies on the optimizer minimizing.
# NOTE(review): minimization is assumed to be the GPyOpt default — confirm.
BO = BayesianOptimization(f = obj_func, domain = domain)
BO.run_optimization(max_iter=max_iter)

In [9]:
# BO.x_opt is the optimum location reported by the optimizer. The search
# domain has a single variable, so this is a one-element array holding the
# selected threshold.
opt_values = BO.x_opt
print(opt_values)

[33.23936447]


In [10]:
# Objective value (negative training accuracy) at the selected threshold.
obj_func(opt_values[0])

-0.7195454545454546

In [11]:
# Accuracy on the validation split using the threshold selected on the training split.
evaluate(dist_array=val_dist_array, true=y_val, threshold=opt_values[0])

0.718