In [7]:
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
import cv2
from sklearn.tree import DecisionTreeClassifier

In [34]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# (x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
x_train = x_train.astype('uint8')   
x_test = x_test.astype('uint8')

x_train.shape


x_train, _, y_train, _ = train_test_split(x_train, y_train, test_size=0.95)

x_test, _, y_test, _ = train_test_split(x_test, y_test, test_size=0.95)


In [35]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(3000, 28, 28)
(3000,)
(500, 28, 28)
(500,)


In [36]:
x_train = x_train.reshape(-1, 28*28)
x_test = x_test.reshape(-1, 28*28)
x_train = x_train.astype('float32')   
x_test = x_test.astype('float32')

In [37]:
# Train a Decision Tree on the data (Decision tree uses IG for the splits)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)
importances = tree.feature_importances_
indices = np.argsort(importances)[::-1]
indices[0]

350

In [38]:
N = 100  # Change this to select number of features based on IG
selected_features = indices[:N]
print(selected_features)
# Apply the selection to training and test data
x_train_selected = x_train[:, selected_features]
x_test_selected = x_test[:, selected_features]


[350 489 570 435 271 211 486 432 234 657 353 156 624 655 296 377 290 544
 329 243 157 210 490 466 265 297 204 190 294 347 154 267 433 384 274 352
 607 456 176 656 355 379 240 459 598 457 321 515 550 188 629 680 400 263
 599 428 654 155 178 467 526 403 373 595 344 659 291 412 214 151 269 406
 185 317 295 485 371 488 189 491 427 152 401 431 541 465 383 293 567 573
 518 124 326 180 658 247 158 660  98 436]


In [40]:
kVals = [1, 5, 10, 20, 30]
accuracies = []

for k in kVals:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train_selected, y_train)
    score = model.score(x_test_selected, y_test)
    print(f"k={k}, accuracy={score * 100:.2f}%")
    accuracies.append(score)

k=1, accuracy=91.00%
k=5, accuracy=91.00%
k=10, accuracy=90.80%
k=20, accuracy=88.40%
k=30, accuracy=88.00%


In [41]:
kVals = [5,10,20,30,50,100]
for k in kVals:
    count = 0
    for i in range(len(x_test_selected)):
        x = x_test_selected[i]

        distance_matrix = np.argsort(abs(x - x_train_selected), axis=0)
        k_neighbor_labels = np.array([])
        k_star = k
        for l in range(len(x)):
            if k_star <= 0: break

            distances_along_axis = np.array(distance_matrix[:,l][:k_star])

            k_neighbor_labels = np.concatenate([k_neighbor_labels,distances_along_axis])
            k_star -= 2
        
        labels = y_train[np.array(k_neighbor_labels).flatten().astype(int)]

        most_common = np.bincount(labels).argmax()
        if most_common == y_test[i]:
            count += 1
    score = count / len(x_test_selected)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))

k=5, accuracy=22.40%
k=10, accuracy=28.80%
k=20, accuracy=40.20%
k=30, accuracy=48.20%
k=50, accuracy=53.00%
k=100, accuracy=60.00%


In [42]:
kVals = [5,10,20,30,50,100, 150]
for k in kVals:
    count = 0
    for i in range(len(x_test_selected)):
        x = x_test_selected[i]
        k_neighbor_labels = np.array([])
        k_star = k
        for l in range(len(selected_features)):
            if k_star <= 0: break
            imp_feature_neighbours = np.array([l, l+1, l+27, l+28, l+29, l-1, l-27, l-28, l-29]) # this is for neighbour pixesl
            mask = imp_feature_neighbours < 100
            imp_feature_neighbours = imp_feature_neighbours[mask]
            distances = np.linalg.norm(x[imp_feature_neighbours] - x_train_selected[:,imp_feature_neighbours], axis=1)
            distances_along_axis = np.argsort(distances)[:k_star]
            k_neighbor_labels = np.concatenate([k_neighbor_labels,distances_along_axis])
            
            # k_star = k_star // 2
            k_star -= 2
        
        labels = y_train[np.array(k_neighbor_labels).flatten().astype(int)]

        most_common = np.bincount(labels).argmax()
        if most_common == y_test[i]:
            count += 1
    score = count / len(x_test_selected)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))

k=5, accuracy=66.00%
k=10, accuracy=73.60%
k=20, accuracy=80.20%
k=30, accuracy=83.20%
k=50, accuracy=84.60%
k=100, accuracy=84.80%
k=150, accuracy=85.00%


In [44]:
covariance_matrix = np.cov(x_train_selected, rowvar=False)
num_top_features = 20
top_features_indices = np.argsort(-np.abs(covariance_matrix), axis=1)[:, 1:num_top_features + 1]

# kVals = [1, 2,3,5,6,7,8,10, 15, 20, 30, 50, 100]
kVals = [5,10,20,30,50,100, 150]
for k in kVals:
    count = 0
    for i in range(len(x_test_selected)):
        x = x_test_selected[i]
        k_neighbor_labels = np.array([])
        k_star = k
        for l in range(len(selected_features)):
            if k_star <= 0: break
            # distances = [np.linalg.norm(x - data_point) for data_point in x_train]
            imp_feature_neighbours = top_features_indices[l] # THIS IS FOR COVARIANCE
            # imp_feature_neighbours = np.array([l, l+1, l+27, l+28, l+29, l-1, l-27, l-28, l-29]) # this is for neighbour pixesl
            # mask = imp_feature_neighbours <= 783
            # imp_feature_neighbours = imp_feature_neighbours[mask]
            distances = np.linalg.norm(x[imp_feature_neighbours] - x_train_selected[:,imp_feature_neighbours], axis=1)
            distances_along_axis = np.argsort(distances)[:k_star]
            k_neighbor_labels = np.concatenate([k_neighbor_labels,distances_along_axis])
            # k_star = k_star // 2
            k_star -= 2
        
        labels = y_train[np.array(k_neighbor_labels).flatten().astype(int)]

        most_common = np.bincount(labels).argmax()
        if most_common == y_test[i]:
            count += 1
    score = count / len(x_test_selected)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))

k=5, accuracy=76.20%
k=10, accuracy=85.40%
k=20, accuracy=87.20%
k=30, accuracy=88.20%
k=50, accuracy=88.80%
k=100, accuracy=87.20%
k=150, accuracy=85.40%
