In [85]:
from __future__ import division
from mnist import MNIST
from sklearn import tree
import numpy as np
from sklearn.metrics import classification_report,log_loss,accuracy_score,roc_auc_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from time import time
import pandas as pd
from datetime import datetime
from openpyxl import load_workbook
import re
from sklearn.preprocessing import label_binarize



In [76]:
# Read the MNIST training and test splits via the loader rooted at './' and
# turn each returned images/labels sequence into a NumPy array.
mndata = MNIST('./')

images_train, labels_train = (np.array(part) for part in mndata.load_training())
images_test, labels_test = (np.array(part) for part in mndata.load_testing())



In [77]:
def classifaction_report_to_dataframe(report):
    """Parse the text of a scikit-learn classification report into a DataFrame.

    The report is read line by line and split on whitespace, so the parser
    does not depend on column widths or on how many summary rows the report
    ends with. Three kinds of line are recognised:

    * rows ending in four numbers (per-class rows and summary rows such as
      ``micro avg``, ``macro avg``, ``weighted avg`` or ``avg / total``):
      everything before the four numbers is the label;
    * the ``accuracy`` row of newer reports, which carries only two numbers:
      the value printed under the f1-score column and the support.
      ``precision`` and ``recall`` are NaN for that row;
    * anything else (header, blank lines) is skipped.

    Parameters
    ----------
    report : str
        Text returned by ``sklearn.metrics.classification_report``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with columns ``class``, ``precision``,
        ``recall``, ``f1_score`` and ``support`` (all metrics as floats).
        An empty frame with those columns is returned when nothing parses.
    """
    columns = ['class', 'precision', 'recall', 'f1_score', 'support']

    def _as_floats(tokens):
        # Convert every token to float, or signal "not a metric row" with None.
        try:
            return [float(token) for token in tokens]
        except ValueError:
            return None

    report_data = []
    for line in report.splitlines():
        tokens = line.split()

        # Regular row: label (one or more words) followed by four numbers.
        values = _as_floats(tokens[-4:]) if len(tokens) >= 5 else None
        if values is not None:
            row = {'class': ' '.join(tokens[:-4])}
            row.update(zip(columns[1:], values))
            report_data.append(row)
            continue

        # Accuracy row: only the f1-score column and the support are filled in.
        if len(tokens) == 3 and tokens[0] == 'accuracy':
            values = _as_floats(tokens[1:])
            if values is not None:
                report_data.append({
                    'class': 'accuracy',
                    'precision': float('nan'),
                    'recall': float('nan'),
                    'f1_score': values[0],
                    'support': values[1],
                })

    # Explicit columns keep the layout stable even when no row was parsed.
    return pd.DataFrame(report_data, columns=columns)


In [78]:
def get_landmark_points(n, data, random_state=None):
    """Pick ``n`` distinct rows of ``data`` at random to serve as landmarks.

    Parameters
    ----------
    n : int
        Number of rows to draw (without replacement).
    data : numpy.ndarray
        2-D array; rows are the candidates.
    random_state : None, int or object with a ``choice`` method, optional
        ``None`` (default) draws from NumPy's global random state, exactly as
        before. An int seeds a private ``numpy.random.RandomState`` so the
        selection is reproducible. An existing generator is used as is.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, data.shape[1])`` holding the selected rows.

    Raises
    ------
    ValueError
        Raised by the sampler when ``n`` exceeds the number of rows.
    """
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    row_indices = rng.choice(data.shape[0], n, replace=False)
    return data[row_indices, :]
    

In [79]:
def gaussian_kernel_distance(x, y, gamma=0.1):
    """Gaussian (RBF) similarity ``exp(-gamma * ||x - y||**2)`` of two vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal shape.
    gamma : float, optional
        Kernel coefficient; defaults to 0.1, the value previously hard-coded.

    Returns
    -------
    float
        1.0 for identical inputs, decaying towards 0 as they move apart.
    """
    # Subtract in floating point: with unsigned integer inputs (e.g. uint8
    # pixels) ``x - y`` would wrap around instead of going negative.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-gamma * np.linalg.norm(diff) ** 2)

In [80]:
def transform_all_input_data(input_data, landmark_points, gamma=0.1):
    """Map every sample to its Gaussian-kernel similarities to the landmarks.

    Entry ``[i, j]`` of the result is
    ``exp(-gamma * ||input_data[i] - landmark_points[j]||**2)``, i.e. the same
    quantity ``gaussian_kernel_distance`` yields for one pair with its 0.1
    coefficient, but computed for all pairs at once with array operations
    instead of one Python call per pair.

    Parameters
    ----------
    input_data : numpy.ndarray
        2-D array, one sample per row.
    landmark_points : numpy.ndarray
        2-D array, one landmark per row, same number of columns as
        ``input_data``.
    gamma : float, optional
        Kernel coefficient; defaults to 0.1.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(input_data.shape[0], landmark_points.shape[0])``.
    """
    # Work in floating point so unsigned integer inputs cannot wrap around.
    samples = np.asarray(input_data, dtype=float)
    landmarks = np.asarray(landmark_points, dtype=float)

    # ||x - l||^2 = ||x||^2 + ||l||^2 - 2 * x.l, evaluated for every pair.
    sample_sq = np.einsum('ij,ij->i', samples, samples)
    landmark_sq = np.einsum('ij,ij->i', landmarks, landmarks)
    sq_dist = (sample_sq[:, np.newaxis] + landmark_sq[np.newaxis, :]
               - 2.0 * samples.dot(landmarks.T))

    # Rounding in the expansion can leave tiny negative values; a squared
    # distance is never negative, so clamp them to zero.
    np.maximum(sq_dist, 0.0, out=sq_dist)
    return np.exp(-gamma * sq_dist)


In [81]:
# Experiment grid: the landmark counts and neighbour counts to evaluate.
no_of_landmark = [
    1, 2, 5, 10, 20, 30,
    50, 100, 200, 300, 500,
]
no_neighbours = [1, 2, 5, 10]

# Smaller grid for a quick trial run (uncomment to override the lists above):
# no_neighbours = [1]
# no_of_landmark = [1,2,5]


In [82]:
# Timestamped workbook name (second resolution) for this run's results.
excel_file_name = 'KNN_Landmark_' + datetime.now().strftime('%Y%m%d%H%M%S') + '.xlsx'

# Create the workbook on disk with a single placeholder sheet.
# The context manager saves and closes the file on exit; the explicit
# ExcelWriter.save() call used previously no longer exists in pandas 2.x.
df = pd.DataFrame(columns=['Test'])
with pd.ExcelWriter(excel_file_name, engine='openpyxl') as excel_writer:
    df.to_excel(excel_writer, sheet_name='demo_sheet')

  self.book.remove_sheet(self.book.worksheets[0])


In [83]:
# Empty summary table; the experiment loop appends one row per
# (landmark count, neighbour count) combination.
df = pd.DataFrame(
    columns=[
        'Number of landmark points',
        'Number of neighbours',
        'Accuracy',
        'Normalized Accuracy',
        'Time_taken_train',
        'Time_taken_test',
        'Negative Log loss',
        'Macro_auc',
        'Micro_auc',
    ]
)

In [86]:
def _write_results_workbook(path, run_sheets, summary=None):
    """Rewrite the results workbook at ``path`` from frames held in memory.

    The file is regenerated in a single write-mode ExcelWriter session
    instead of re-opening it with ``load_workbook`` and assigning
    ``ExcelWriter.book``; that assignment and ``ExcelWriter.save()`` are not
    available in pandas 2.x.

    Parameters
    ----------
    path : str
        Workbook file to (over)write.
    run_sheets : list of (str, DataFrame, DataFrame)
        ``(sheet_name, classification_report_frame, confusion_matrix_frame)``
        per finished run; the confusion matrix is placed five rows below the
        end of the report on the same sheet.
    summary : DataFrame, optional
        When given, written to the ``knn_landmark`` sheet after the run sheets.
    """
    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        # Keep the placeholder sheet the workbook was originally created with.
        pd.DataFrame(columns=['Test']).to_excel(writer, sheet_name='demo_sheet')
        for sheet_name, report_df, confusion_df in run_sheets:
            report_df.to_excel(writer, sheet_name=sheet_name)
            confusion_df.to_excel(writer, sheet_name=sheet_name,
                                  startrow=len(report_df) + 5)
        if summary is not None:
            summary.to_excel(writer, sheet_name='knn_landmark')


# Evaluate a k-NN classifier on landmark-kernel features for every
# (landmark count, neighbour count) pair in the grid.
run_sheets = []
for l in no_of_landmark:
    # The landmarks and the transformed features depend only on `l`, so they
    # are computed once per landmark count and shared by all neighbour counts
    # (previously they were redrawn and recomputed for every `n`).
    landmark_points = get_landmark_points(l, images_train)
    new_input_data = transform_all_input_data(images_train, landmark_points)
    new_test_data = transform_all_input_data(images_test, landmark_points)

    for n in no_neighbours:
        neigh = KNeighborsClassifier(n_neighbors=n)

        start = time()
        neigh.fit(new_input_data, labels_train)
        train_time = time() - start

        start = time()
        predictions = neigh.predict(new_test_data)
        test_time = time() - start

        # scikit-learn metrics take (y_true, y_pred) in that order.
        accuracy = accuracy_score(labels_test, predictions, normalize=False)
        normalized_accuracy = accuracy_score(labels_test, predictions, normalize=True)

        class_probabilities = neigh.predict_proba(new_test_data)
        log_loss_val = log_loss(labels_test, class_probabilities)
        classfication_repo = classification_report(labels_test, predictions)

        # ROC AUC needs scores, not hard labels: use the class probabilities,
        # whose columns follow neigh.classes_, against one-hot test labels
        # binarized with the same class order.
        labels_test_one_hot = label_binarize(labels_test, classes=neigh.classes_)
        micro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="micro")
        macro_auc = roc_auc_score(labels_test_one_hot, class_probabilities, average="macro")

        # Values are positional and follow the column order of the summary frame `df`.
        df.loc[len(df)] = [l, n, accuracy, normalized_accuracy, train_time, test_time,
                           log_loss_val, macro_auc, micro_auc]

        cm_df = pd.DataFrame(data=confusion_matrix(labels_test, predictions))
        df_classification_report = classifaction_report_to_dataframe(classfication_repo)

        print(l, n, accuracy)

        # Persist after every run so finished results survive an interruption.
        run_sheets.append(('knn_landmark_cr' + '_' + str(l) + '_' + str(n),
                           df_classification_report, cm_df))
        _write_results_workbook(excel_file_name, run_sheets)

# Final write adds the summary table as the last sheet.
_write_results_workbook(excel_file_name, run_sheets, summary=df)


  'precision', 'predicted', average, warn_for)
  self.book.remove_sheet(self.book.worksheets[0])


1 1 1010


  'precision', 'predicted', average, warn_for)
  self.book.remove_sheet(self.book.worksheets[0])


2 1 1010


  'precision', 'predicted', average, warn_for)
  self.book.remove_sheet(self.book.worksheets[0])


5 1 1010


  self.book.remove_sheet(self.book.worksheets[0])
