In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report

from pylmnn import LargeMarginNearestNeighbor as LMNN


# Load a data set
X, y = load_iris(return_X_y=True)

# Split in training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Set up the hyperparameters
k_train, k_test, n_components, max_iter = 3, 3, X.shape[1], 180

# Instantiate the metric learner
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)

# Train the metric learner
lmnn.fit(X_train, y_train)

# Fit the nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=k_test)
knn.fit(lmnn.transform(X_train), y_train)

# Compute the k-nearest neighbor test f1-score after applying the learned transformation
lmnn_acc = knn.score(lmnn.transform(X_test), y_test)
print('LMNN accuracy on test set of {} points: {:.4f}'.format(X_test.shape[0], lmnn_acc))

LMNN accuracy on test set of 45 points: 0.9778


In [11]:
lmnn_pred = knn.predict(lmnn.transform(X_test))
print(classification_report(y_test, lmnn_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.94      1.00      0.97        15
           2       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [12]:
# test normal KNN
clf = KNeighborsClassifier(n_neighbors=k_test)
clf.fit(X_train, y_train)

eucilidean_acc = clf.score(X_test, y_test)
print('accuracy on test set of {} points: {:.4f}'.format(X_test.shape[0], eucilidean_acc))

accuracy on test set of 45 points: 0.9556


In [13]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.88      1.00      0.94        15
           2       1.00      0.87      0.93        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



In [2]:
# test CIFAR10 data set with grid search
from load_data import load_CIFAR10
import numpy as np
import matplotlib.pyplot as plt

cifar10_dir = 'cifar-10-batches-py'

# clear variable 
try:
   del X_train, y_train
   del X_test, y_test
   print('clean the variable which has been imported...Done!')
except:
   pass

# read data and split into train and test
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

In [3]:
print ("Train Data and test data:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print ("Labels: ", np.unique(y_train))

Train Data and test data: (50000, 32, 32, 3) (50000,) (10000, 32, 32, 3) (10000,)
Labels:  [0 1 2 3 4 5 6 7 8 9]


In [4]:
# the dataset is too large, we need do a subsample.
num_training = 5000
num_test = 1000

idx_train = np.random.randint(0, y_train.shape[0], num_training)
idx_test = np.random.randint(0, y_test.shape[0], num_test)
X_train = X_train[idx_train]
y_train = y_train[idx_train]

X_test = X_test[idx_test]
y_test = y_test[idx_test]

print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(5000, 32, 32, 3) (5000,) (1000, 32, 32, 3) (1000,)


In [5]:
# reshape the image data
X_train1 = np.reshape(X_train, (X_train.shape[0], -1))
X_test1 = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train1.shape, X_test1.shape)

(5000, 3072) (1000, 3072)


In [6]:
# test normal KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import KFold

# 训练数据： （X_train1, y_train）, 测试数据：(X_test1, y_test)
params_k = [1,3,5,7,9,12]  # 可以选择的K值
# the p = 3 of the Minkowski Distance is very very slow for KNN,so we choose p = 2
params = {'n_neighbors' : params_k}

# 构建模型
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
knn = KNeighborsClassifier() # ! Do not specify any parameters for the constructor
model = GridSearchCV(knn, params,n_jobs= -1, cv = kf, verbose = 2) # 3-fold-cross-validation
model.fit(X_train1, y_train)

# print best parameters
print(model.best_params_)

# result
print("Accuracy in the test data set is: %.2f"%model.score(X_test1, y_test))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   45.9s remaining:   40.2s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.4min finished


{'n_neighbors': 1}
Accuracy in the test data set is: 0.25


In [10]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.24      0.24       109
           1       0.52      0.12      0.20        98
           2       0.16      0.32      0.21       104
           3       0.18      0.13      0.15        97
           4       0.20      0.35      0.26       113
           5       0.26      0.19      0.22        99
           6       0.23      0.22      0.22        96
           7       0.27      0.15      0.19        87
           8       0.35      0.58      0.44       102
           9       0.50      0.11      0.17        95

    accuracy                           0.25      1000
   macro avg       0.29      0.24      0.23      1000
weighted avg       0.29      0.25      0.23      1000



In [11]:
# normalize the data before PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train1)
X_train_normal = scaler.transform(X_train1)
X_test_normal = scaler.transform(X_test1)

In [35]:
# we done PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 32, svd_solver='full')
X_train3 = pca.fit_transform(X_train_normal)
# we do not need to re-fit the test data again in the same PCA model.
X_test3 = pca.transform(X_test_normal)

In [36]:
# retrain the data by KNN
# test normal KNN
clf = KNeighborsClassifier(n_neighbors=10, p = 2)
clf.fit(X_train3, y_train)

eucilidean_acc = clf.score(X_test3, y_test)
print('accuracy on test set of {} points: {:.4f}'.format(X_test3.shape[0], eucilidean_acc))

accuracy on test set of 1000 points: 0.3150


In [40]:
# train with mahalanobis
# Instantiate the metric learner
from pylmnn import LargeMarginNearestNeighbor as LMNN
lmnn = LMNN(n_neighbors=10, max_iter=200, n_components=X_train1.shape[1])

# Train the metric learner
lmnn.fit(X_train1, y_train)

# Fit the nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=10, p = 2)
knn.fit(lmnn.transform(X_train1), y_train)

# Compute the k-nearest neighbor test f1-score after applying the learned transformation
lmnn_acc = knn.score(lmnn.transform(X_test1), y_test)
print('LMNN accuracy on test set of {} points: {:.4f}'.format(X_test1.shape[0], lmnn_acc))

LMNN accuracy on test set of 1000 points: 0.3010
