In [4]:
# import local modules.
from model.k_nearest_neigbors import KNearestNeighbors
from model.lmnn import LMNN

from utils import data_loader, pre_training_analysis_tools


data = data_loader.load_real_dataset(sqaured=False, remove_multicollinearity=False)
output_xy = pre_training_analysis_tools.variance_threshold(data['train_x'],data['train_y'])
split_xy = pre_training_analysis_tools.split_dataxy(output_xy)
x = split_xy['train_x']
y = split_xy['train_y']

from skfeature.function.statistical_based import CFS
score = CFS.cfs(x.to_numpy() , y.to_numpy())    
x = x.iloc[:, [32, 131,  33,  39, 174,  26, 132, 144,   9,  10,  15,  20]]

data_xy = pre_training_analysis_tools.join_dataxy(x, y)
data = pre_training_analysis_tools.generate_train_test_xy(data_xy)


data_x = data['train_x']
data_y =     data['train_y']
test_x =     data['test_x']
test_y =   data['test_y']


Original feature shape: (216, 265)
Transformed feature shape: (216, 209)


In [12]:
## need to figue out how to use it first 
import numpy as np
from argparse import Namespace
from sklearn.neighbors import KNeighborsClassifier
from GPyOpt.methods import BayesianOptimization

from pylmnn import LargeMarginNearestNeighbor


def find_hyperparams(X_train, y_train, X_valid, y_valid, params=None, max_bopt_iter=1000):
    """Find the best hyperparameters for LMNN using Bayesian Optimisation.

    Parameters
    ----------

    X_train : array_like
           An array of training samples with shape (n_samples, n_features).

    y_train : array_like
           An array of training labels with shape (n_samples,).

    X_valid : array_like
           An array of validation samples with shape (m_samples, n_features).

    y_valid : array_like
           An array of validation labels with shape (m_samples,).

    params : dict
             A dictionary of parameters to be passed to the LargeMarginNearestNeighbor classifier instance.

    max_bopt_iter : int
            Maximum number of parameter configurations to evaluate (Default value = 12).

    Returns
    -------
    tuple:
        (int, int, int, int) The best hyperparameters found (n_neighbors, n_neighbors_predict, n_components, max_iter).

    """

    params = params or {}
    unique_labels, class_sizes = np.unique(y_train, return_counts=True)
    min_class_size = min(class_sizes)

    # Setting parameters for Bayesian Global Optimization
    args = Namespace()
    args.min_neighbors = 1
    args.max_neighbors = int(min(min_class_size - 1, 15))
    args.min_iter = 10
    args.max_iter = 200
    args.min_components = min(X_train.shape[1], 2)
    args.max_components = X_train.shape[1]

    bopt_iter = 0

    def optimize_clf(hyperparams):
        """The actual objective function with packing and unpacking of hyperparameters.

        Parameters
        ----------
        hyperparams : array_like
                 Vector of hyperparameters to evaluate.

        Returns
        -------
        float
            The validation error obtained.

        """

        hyperparams = hyperparams[0]
        n_neighbors = int(round(hyperparams[0]))
        n_neighbors_predict = int(round(hyperparams[1]))
        n_components = int(np.ceil(hyperparams[2]))
        max_iter = int(np.ceil(hyperparams[3]))
        weight_push_loss = float(np.ceil(hyperparams[4]))
        max_impostors = int(np.ceil(hyperparams[5]))

        nonlocal bopt_iter
        bopt_iter += 1
        print('Iteration {} of Bayesian Optimisation'.format(bopt_iter))
        print('Trying n_neighbors(lmnn)={}\tn_neighbors(knn)={}\tn_components={}\tmax_iter={} ...\n'
              .format(n_neighbors, n_neighbors_predict, n_components, max_iter, weight_push_loss,max_impostors))
        lmnn = LargeMarginNearestNeighbor(n_neighbors, max_iter=max_iter, n_components=n_components, 
                                          weight_push_loss=weight_push_loss,max_impostors=max_impostors, **params)
        lmnn.fit(X_train, y_train)
        clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        clf.fit(lmnn.transform(X_train), y_train)

        print('Evaluating the found transformation on validation set of size {}...'.format(len(y_valid)))
        val_err = 1. - clf.score(lmnn.transform(X_valid), y_valid)

        print('Validation error={:2.4f}\n'.format(val_err))
        return val_err

    # Parameters are discrete but treating them as continuous yields better parameters
    domain = [{'name': 'n_neighbors', 'type': 'continuous', 'domain': (args.min_neighbors, args.max_neighbors)},
              {'name': 'n_neighbors_predict', 'type': 'continuous', 'domain': (args.min_neighbors, args.max_neighbors)},
              {'name': 'n_components', 'type': 'continuous', 'domain': (args.min_components, args.max_components)},
              {'name': 'max_iter', 'type': 'continuous', 'domain': (args.min_iter, args.max_iter)},
              {'name': 'weight_push_loss', 'type': 'continuous', 'domain': (0, 1)},
              {'name': 'max_impostors', 'type': 'continuous', 'domain': (1, 1000000)}]
    bo = BayesianOptimization(f=optimize_clf, domain=domain)
    bo.run_optimization(max_iter=max_bopt_iter)

    solution = bo.x_opt
    print(solution)
    best_n_neighbors = int(round(solution[0]))
    best_n_neighbors_predict = int(round(solution[1]))
    best_n_components = int(np.ceil(solution[2]))
    best_max_iter = int(np.ceil(solution[3]))
    best_weight_push_loss = float(np.ceil(solution[4]))
    best_max_impostors = int(np.ceil(solution[5]))

    print('Best parameters: n_neighbors(lmnn)={} n_neighbors(knn)={} n_components={} max_iter={}\n'.
          format(best_n_neighbors, best_n_neighbors_predict, best_n_components, best_max_iter, best_weight_push_loss, best_max_impostors))

    return best_n_neighbors, best_n_neighbors_predict, best_n_components, best_max_iter, best_weight_push_loss, best_max_impostors

#execute


find_hyperparams(X_train=data_x, y_train=data_y, X_valid=test_x, y_valid=test_y, max_bopt_iter=12)




Iteration 1 of Bayesian Optimisation
Trying n_neighbors(lmnn)=8	n_neighbors(knn)=11	n_components=3	max_iter=85 ...

Evaluating the found transformation on validation set of size 23...
Validation error=0.5217

Iteration 2 of Bayesian Optimisation
Trying n_neighbors(lmnn)=3	n_neighbors(knn)=4	n_components=10	max_iter=56 ...

Evaluating the found transformation on validation set of size 23...
Validation error=0.5652

Iteration 3 of Bayesian Optimisation
Trying n_neighbors(lmnn)=1	n_neighbors(knn)=13	n_components=4	max_iter=71 ...

Evaluating the found transformation on validation set of size 23...
Validation error=0.4783

Iteration 4 of Bayesian Optimisation
Trying n_neighbors(lmnn)=9	n_neighbors(knn)=12	n_components=12	max_iter=53 ...

Evaluating the found transformation on validation set of size 23...
Validation error=0.6087

Iteration 5 of Bayesian Optimisation
Trying n_neighbors(lmnn)=12	n_neighbors(knn)=1	n_components=8	max_iter=159 ...

Evaluating the found transformation on validat

(1, 13, 4, 71, 1.0, 324503)

In [16]:
from pylmnn import LargeMarginNearestNeighbor as LMNN
from sklearn.neighbors import KNeighborsClassifier

# Set up the hyperparameters
# Instantiate the metric learner
#lmnn = LMNN(n_neighbors=13, max_iter=63, n_components=2 )
# adding make no difference: weight_push_loss=1.0
# max_impostors=871159
#lmnn = LMNN(n_neighbors=6, max_iter=80, n_components=6 )
lmnn = LMNN(n_neighbors=10, max_iter=109, n_components=6 )


#10, 5, 6, 109, 1.0, 646122

# Train the metric learner
lmnn.fit(data_x, data_y)

# Fit the nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(lmnn.transform(data_x), data_y)

# Compute the k-nearest neighbor test accuracy after applying the learned transformation
lmnn_acc = knn.score(lmnn.transform(test_x), test_y)
print('LMNN accuracy on test set of {} points: {:.4f}'.format(test_x.shape[0], lmnn_acc))


# 0.8 cor & new data set & cfs = 0.5179

LMNN accuracy on test set of 23 points: 0.5217
