In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow.keras as tk
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
plt.style.use('seaborn')

In [2]:
# Fetch Fashion-MNIST through Keras; load_data() hands back a (train, test) pair,
# each of which is an (images, labels) pair.
fashion_mnist = tk.datasets.fashion_mnist
train_split, test_split = fashion_mnist.load_data()
X_train_full, y_train_full = train_split
X_test, y_test = test_split

In [3]:
# Flatten every 28x28 image into one 784-value row. The row count is taken from
# the data instead of being hard-coded, matching what get_data_ready() does.
x_train = np.reshape(X_train_full, (len(X_train_full), 28 * 28))

In [4]:
# Divide the flattened pixels by 255. The value is rebuilt from the raw images
# rather than from x_train itself so that re-running this cell cannot divide
# the data by 255 a second time.
x_train = np.reshape(X_train_full, (len(X_train_full), 28 * 28)) / 255

In [5]:
# Sanity check: display the (rows, columns) of the flattened training matrix.
np.shape(x_train)

(60000, 784)

In [6]:
# Fit PCA on the scaled training pixels and keep the projected training matrix.
# The fitted `pca` object is reused later to project the test images.
pca = PCA(
    n_components=0.95,
)
x_reduced = pca.fit_transform(X=x_train)

In [None]:
# Sweep k = 1..16. Each model is fitted on the full (PCA-reduced) training set
# and then scored on that same data, so the curve shows TRAINING accuracy.
# It is useful for spotting over-fitting but is not an estimate of performance
# on unseen data, hence the corrected y-axis label.
k_range = list(range(1, 17))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_reduced, y_train_full)
    pred = knn.predict(x_reduced)
    scores.append(metrics.accuracy_score(y_train_full, pred))

plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Training Accuracy')

In [None]:
# Re-draw the accuracy-vs-k curve. The plotted scores were computed on the same
# data the models were fitted on, so the axis is labelled as training accuracy.
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Training Accuracy')

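k=1 would be over-fitting: when the model is scored on the data it was fitted on, every point is its own nearest neighbour, so the accuracy is trivially perfect. k=2 is not a good choice either, because an even number of neighbours can tie (if the two neighbours disagree, there is no majority to decide the class).
Thus, it seems that the best choice is k=3.
To be sure, we'll run another check, this time using KFold cross-validation and looking at the mean score and the standard deviation for each k.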

In [None]:
# Shared cross-validation splitter: 10 shuffled folds with a fixed seed so every
# model below is evaluated on identical splits.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Cross-validate KNN for k = 1..16 on the folds defined by `kf`, recording the
# mean and standard deviation of the per-fold accuracy for every k.
k_range = list(range(1, 17))
k_mean_scores = []
k_std_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # A dedicated name is used for the per-fold results so the `scores` list
    # built by the earlier accuracy sweep is not overwritten; the plotting
    # cells that read `scores` therefore stay re-runnable.
    fold_scores = cross_val_score(knn, x_reduced, y_train_full, cv=kf, scoring='accuracy')
    k_mean_scores.append(fold_scores.mean())
    k_std_scores.append(fold_scores.std())

# One row per k, indexed by k, with the mean and std of the fold accuracies.
d = {'k_mean': k_mean_scores, 'k_std': k_std_scores}
Ks = pd.DataFrame(data=d, index=k_range)

In [None]:
# Rank the k values by mean cross-validated accuracy, best first, and show the top ten.
mean_accuracy_ranked = Ks['k_mean'].sort_values(ascending=False)
mean_accuracy_ranked.head(10)

In [None]:
# List the k values from the most stable (smallest std across folds) to the least.
Ks['k_std'].sort_values()

We can see that these results alone don't tell us for sure which k is best, so we'll decide by running a grid search.

In [None]:
# Grid search over odd k values, the neighbour weighting scheme and the distance
# metric, scored by accuracy on the folds defined by `kf`.
knn = KNeighborsClassifier()
# Named separately from `k_range` so the 16-element range used by the earlier
# sweeps and plots is not replaced by this shorter list.
grid_k_range = list(range(3, 17, 2))
print(grid_k_range)
weight_options = ['uniform', 'distance']
metrics_options = ['euclidean', 'manhattan']
param_grid = dict(n_neighbors=grid_k_range, weights=weight_options, metric=metrics_options)
grid = GridSearchCV(knn, param_grid, cv=kf, scoring='accuracy')
grid.fit(x_reduced, y_train_full)
print(grid.best_score_)
print(grid.best_params_)

As we can see from the grid search, the best value to choose is k=5 (the final model below also uses the Manhattan metric and distance weighting).

In [None]:
# KNN configured with the hyper-parameters chosen for the final model.
our_best_knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='manhattan',
)

In [None]:
# Baseline: random forest, scored with the same folds as the KNN experiments.
# The forest is seeded (same seed as `kf`) so the reported accuracy is
# reproducible from one run of the notebook to the next.
random_for = RandomForestClassifier(random_state=42)
randomfor_score = cross_val_score(random_for, x_reduced, y_train_full, cv=kf, scoring='accuracy').mean()
print(randomfor_score)

In [None]:
# Baseline: Gaussian naive Bayes, scored with the same folds as the other models.
# GaussianNB is imported in the notebook's top import cell.
naive_bay = GaussianNB()
bayes_score = cross_val_score(naive_bay, x_reduced, y_train_full, cv=kf, scoring='accuracy').mean()
print(bayes_score)

In [None]:
# Baseline: multinomial logistic regression, scored with the same folds as the
# other models. LogisticRegression is already imported in the top import cell,
# so the duplicate in-cell import has been dropped.
logis_reg = LogisticRegression(max_iter=2000, multi_class='multinomial')
reg_score = cross_val_score(logis_reg, x_reduced, y_train_full, cv=kf, scoring='accuracy').mean()
print(reg_score)

In [None]:
# Boosted logistic regression, scored with the same folds as the other models.
# AdaBoostClassifier is imported in the notebook's top import cell.
# No explicit fit() call is made here: the previous one referenced variables
# that are never defined in this notebook, and cross_val_score fits its own
# copy of the estimator on every fold anyway.
adac = AdaBoostClassifier(base_estimator=logis_reg, n_estimators=200, learning_rate=1.0, algorithm='SAMME.R', random_state=42)
adac_score = cross_val_score(adac, x_reduced, y_train_full, cv=kf, scoring='accuracy').mean()
print(adac_score)

We can see that the KNN model gives us the best results. Thus, we'll choose it to be our final model.

In [None]:
def get_data_ready(data):
    """Prepare raw images with the same steps applied to the training set.

    Each sample in ``data`` is flattened to a row of 28*28 values, the values
    are divided by 255, and the result is projected with the module-level
    ``pca`` object (which must already be fitted).

    Parameters
    ----------
    data : array-like
        Collection of images; ``len(data)`` is used as the number of rows.

    Returns
    -------
    The output of ``pca.transform`` for the flattened, rescaled images.
    """
    n_samples = len(data)
    flat_pixels = np.reshape(data, (n_samples, 28*28))
    scaled_pixels = flat_pixels / 255
    return pca.transform(scaled_pixels)

In [None]:
# Run the held-out images through the same flatten / rescale / PCA pipeline.
test = get_data_ready(data=X_test)

In [None]:
# Fit the selected KNN on the whole training set and score it on the test set.
our_best_knn.fit(x_reduced, y_train_full)
final_predict = our_best_knn.predict(test)
# accuracy_score is reached through the `metrics` alias imported at the top of
# the notebook; the bare name was never imported and raised NameError.
print('Final accuracy score of model is: ', metrics.accuracy_score(y_test, final_predict))