In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Standard Imports

In [None]:
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the three competition CSVs:
#   trn - training frame (includes a "label" column, used further down)
#   tst - test frame (pixel columns only are assumed by `feat_cols` below)
#   sub - sample submission, later used as the template for the output files
trn = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
tst = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")
sub = pd.read_csv("/kaggle/input/digit-recognizer/sample_submission.csv")

In [None]:
# Preview the first rows of the test frame.
tst.head()

In [None]:
# Preview the first rows of the training frame.
trn.head()

In [None]:
# Preview the first rows of the sample-submission frame.
sub.head()

In [None]:
# Report (rows, columns) for each of the three loaded frames.
for frame_name, frame in (("Train", trn), ("Test", tst), ("Submission", sub)):
    print(frame_name, frame.shape)

### Store the Feature Column Names (except Label) in Variable

In [None]:
# Generate the names "pixel0", "pixel1", ... - one per column of the test frame.
# NOTE(review): this assumes the test frame holds nothing but pixel columns
# named exactly this way - confirm against `tst.columns`.
n_pixel_columns = tst.shape[1]
feat_cols = [f"pixel{idx}" for idx in range(n_pixel_columns)]
len(feat_cols)

In [None]:
# Display the column index of the training frame.
trn.columns

### Build the same feature-column list another way (all training columns except `label`)

In [None]:
def subtract_lists(x,y):
    """Return the items of ``x`` that do not occur in ``y``.

    The relative order of ``x`` is kept and duplicates in ``x`` survive
    (every occurrence is tested independently against ``y``).
    """
    remaining = []
    for element in x:
        if element in y:
            continue
        remaining.append(element)
    return remaining
# Every training column except the target column "label".
train_columns = trn.columns.tolist()
feat = subtract_lists(train_columns, ["label"])
len(feat)

In [None]:
# Stack the training pixels (label column left out) on top of the test frame.
# The row index is not reset, so index values repeat across the two parts.
stacked_frames = [trn.loc[:, feat_cols], tst]
X = pd.concat(stacked_frames, axis=0)

In [None]:
# (rows, columns) of the stacked train + test pixel matrix.
X.shape

In [None]:
# Preview the first rows of the stacked pixel matrix.
X.head()

### Visualize the Data

In [None]:
df = trn

# Seeded generator so the same digits are drawn on every run. 42 is the same
# value the notebook later assigns to `univ_seed` (not yet defined here).
rndperm = np.random.RandomState(42).permutation(df.shape[0])

# Create the figure first. The colormap is passed to `matshow` directly instead
# of calling `plt.gray()`, which would open an extra empty figure when none
# exists yet and would also change the global default image colormap.
fig = plt.figure(figsize=(20, 9))

for i in range(15):
    row = rndperm[i]
    # 3x5 grid; each panel is titled with the sample's label.
    ax = fig.add_subplot(3, 5, i + 1, title="Digit: {}".format(str(df.loc[row, 'label'])))
    # Each sample is a flat pixel vector that is reshaped to a 28x28 image.
    ax.matshow(df.loc[row, feat_cols].values.reshape((28, 28)).astype(float), cmap="gray")
    ax.set_xticks([])                             # hide the x and y ticks
    ax.set_yticks([])

plt.savefig("MINIST_DIGITS.png", dpi=600)
plt.show() ;                                      # display the figure

### Number of Samples: 70000

In [None]:
# Pixel columns of `df` as a plain NumPy array.
data_subset = df.loc[:, feat_cols].values

### Customize Matplotlib

In [None]:
# Apply matplotlib's "dark_background" style sheet.
plt.style.use('dark_background')

In [None]:
# Single seed reused below for t-SNE, row sampling and the train/hold-out splits.
univ_seed=42

### Apply TSNE

In [None]:
from sklearn.manifold import TSNE

# Embed the stacked train + test pixel matrix `X` into two dimensions.
# NOTE(review): `n_iter` is kept exactly as written; newer scikit-learn releases
# rename this argument to `max_iter` - confirm against the installed version.
tsne_2d_params = dict(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=univ_seed)
tsne = TSNE(**tsne_2d_params)
tsne_results_2D = tsne.fit_transform(X)

In [None]:
# Number of rows in the training frame (not referenced again in the visible cells).
Shape_X = len(trn)

In [None]:
from sklearn.manifold import TSNE

# Embed the same stacked matrix `X` into three dimensions; `tsne` is rebound to
# the new estimator.
# NOTE(review): `n_iter` is kept exactly as written; newer scikit-learn releases
# rename this argument to `max_iter` - confirm against the installed version.
tsne_3d_params = dict(n_components=3, verbose=1, perplexity=40, n_iter=300, random_state=univ_seed)
tsne = TSNE(**tsne_3d_params)
tsne_results_3D = tsne.fit_transform(X)

# Save the Files

In [None]:
# Persist both embeddings as .npy files in the working directory.
embeddings_to_save = (
    ("tsne_results_2D.npy", tsne_results_2D),
    ("tsne_results_3D.npy", tsne_results_3D),
)
for out_name, embedding in embeddings_to_save:
    np.save(out_name, embedding)

In [None]:
# Training labels followed by one NaN per test row, in the same row order as
# the stacked matrix `X` (training rows first, test rows second).
n_unlabelled = len(tst)
y_labels = list(trn["label"]) + n_unlabelled * [np.nan]

In [None]:
# Labels and the two 2-D coordinates side by side as one numeric matrix.
label_and_2d = np.column_stack([y_labels, tsne_results_2D[:, 0], tsne_results_2D[:, 1]])
reduced_df = pd.DataFrame(label_and_2d, columns=['y', 'tsne-2d-one', 'tsne-2d-two'])

# Append the three 3-D coordinates as further columns.
for axis_idx, col_name in enumerate(['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']):
    reduced_df[col_name] = tsne_results_3D[:, axis_idx]

reduced_df.head()

In [None]:
# Last rows of the embedding frame; `y` is NaN for the rows that came from the test set.
reduced_df.tail()

### Plot the 2-Dimensional & 3-Dimensional Plots

In [None]:
fig, ax = plt.subplots(figsize=(16, 10))

# Keep labelled rows only, then draw a seeded sample of 10000 of them.
# The sort happens before sampling, so the sampled frame is not itself in
# label order despite its name.
labelled_rows = reduced_df.dropna()
reduced_df_sorted = labelled_rows.sort_values(by='y', ascending=True).sample(n=10000, random_state=univ_seed)

# One colour per digit class, drawn on the axes created above.
sns.scatterplot(
    data=reduced_df_sorted,
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette("tab10", 10),  # alternatives tried: hls, rocket, icefire, Spectral
    legend="full",
    alpha=1,
    ax=ax,
)

ax.legend(title="Target Digits (y)")
ax.set_title("t-SNE Plot for MNIST Handwritten Digit Classification", fontsize=20)
fig.savefig("t-SNE Plot for MNIST Handwritten Digit Classification_custom1.png", dpi=300)

In [None]:
import plotly.express as px

# Seeded sample of 500 labelled rows, ordered by digit label for plotting.
df_sampled = reduced_df.dropna().sample(n=500, random_state=univ_seed)
df_sampled_sorted = df_sampled.sort_values(by='y', ascending=True)

# Interactive 3-D scatter of the three embedding axes, coloured by label
# on a continuous scale.
scatter_3d_options = dict(
    x='tsne-3d-one',
    y='tsne-3d-two',
    z='tsne-3d-three',
    color='y',
    template="plotly_dark",
    color_continuous_scale=px.colors.sequential.Plasma,  # alternative: .Viridis
)
fig = px.scatter_3d(df_sampled_sorted, **scatter_3d_options)

fig.write_html("MNIST_Handwritten_Digits_Dataset_tSNE_3D_Viz.html")
fig.show()

In [None]:
# Rows containing any NaN are the unlabelled (test-set) rows; the rest are training rows.
row_has_nan = reduced_df.isna().any(axis=1)
coord_cols_3d = ["tsne-3d-one", "tsne-3d-two", "tsne-3d-three"]

# Labelled part: 3-D coordinates and their digit labels.
reduced_df_train = reduced_df[~row_has_nan]
X_train3D = reduced_df_train[coord_cols_3d].values
y_train3D = reduced_df_train["y"].values
print("X_train3D Shape : ", X_train3D.shape , "y_train3D Shape : ", y_train3D.shape)

# Unlabelled part: 3-D coordinates; the "y" values here are NaN placeholders.
reduced_df_test = reduced_df[row_has_nan]
X_test3D = reduced_df_test[coord_cols_3d].values
y_test3D = reduced_df_test["y"].values
print("X_test3D Shape : ", X_test3D.shape , "y_test3D Shape : ", y_test3D.shape)

In [None]:
# Hold out 20% of the labelled 3-D embeddings for evaluating k-NN.
from sklearn.model_selection import train_test_split

holdout_parts_3d = train_test_split(X_train3D, y_train3D, test_size=0.2, random_state=univ_seed)
X_train, X_test, y_train, y_test = holdout_parts_3d
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Candidate neighbour counts are 1 .. Ks-1.
Ks = 40+1
n_candidates = Ks - 1
mean_acc = np.zeros(n_candidates)         # hold-out accuracy per K
mean_acc_train = np.zeros(n_candidates)   # training accuracy per K
std_acc = np.zeros(n_candidates)          # standard error of hold-out correctness
std_acc_train = np.zeros(n_candidates)    # standard error of training correctness

for slot, k in enumerate(range(1, Ks)):
    # Fit on the training part and predict both parts.
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    y_pred = neigh.predict(X_train)

    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    mean_acc_train[slot] = metrics.accuracy_score(y_train, y_pred)

    # Std of the per-sample "correct?" flags divided by sqrt(sample count).
    std_acc[slot] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
    std_acc_train[slot] = np.std(y_pred == y_train) / np.sqrt(y_pred.shape[0])

print("MEAN ACCURACY")
for k, (holdout_value, train_value) in enumerate(zip(mean_acc, mean_acc_train), start=1):
    test_acc = '{0:.3f}'.format(round(holdout_value, 3))
    train_acc = '{0:.3f}'.format(round(train_value, 3))

    print("K=",f"{k:02d}","  Avg. Test Accuracy=",test_acc,"  Avg. Train Accuracy=",train_acc)

In [None]:

# Position (0-based) and value of K with the highest hold-out accuracy.
best_idx = mean_acc.argmax()
k_best = best_idx + 1

print( "The best test accuracy was", mean_acc.max(), "with k=", k_best)
print( "The corresponding training accuracy obtained was :",mean_acc_train[best_idx])

k_values = range(1, Ks)

plt.figure(figsize=(15,7.5))
# comment out the figure size if you want a smaller figure

# Every artist carries its own label so the legend cannot drift out of sync
# with the drawing order.
plt.plot(k_values, mean_acc_train, 'r', linewidth=5, label='Train_Accuracy ')
plt.plot(k_values, mean_acc, 'g', linewidth=5, label='Test_Accuracy ')

# The shaded bands span one standard error either side of the curve, so the
# legend says "1xstd" (the earlier "3xstd" wording did not match the data drawn).
plt.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc,
                 alpha=0.10, label='+/- 1xstd_test')
plt.fill_between(k_values, mean_acc_train - 1 * std_acc_train, mean_acc_train + 1 * std_acc_train,
                 alpha=0.10, label='+/- 1xstd_train')

# Mark the best hold-out accuracy and the training accuracy at that same K.
plt.scatter(k_best, mean_acc[best_idx], label='BEST_TEST_ACC')
plt.scatter(k_best, mean_acc_train[best_idx], label='CORRESPONDING_TRAIN_ACC')

plt.legend()

# Ticks only at the K values that were actually evaluated (K starts at 1).
plt.xticks(ticks=list(k_values), labels=list(k_values))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.title("Number of Neigbors Chosen vs Mean Training and Testing Accuracy Score",fontsize=20)
plt.tight_layout()


plt.savefig("Number of Neigbors Chosen vs Mean Training and Testing Accuracy Score.png",dpi=600)
plt.show()

# Original author's reading of this figure: at small K the model overfits.

In [None]:
# Dictionaries collecting every score of the tuned classifier, keyed by metric name.
Test_Scores={}
Train_Scores={}


# Scoring functions used below.
from sklearn.metrics import  accuracy_score, f1_score, confusion_matrix,precision_score, recall_score

from sklearn.metrics import jaccard_score as jaccard_similarity_score

# Refit at the K with the highest hold-out accuracy: the sweep loop overwrote
# its model on every iteration, so only the last K survived.
best_k=mean_acc.argmax()+1
neigh = KNeighborsClassifier(n_neighbors = best_k).fit(X_train,y_train)
yhat=neigh.predict(X_test)
y_pred=neigh.predict(X_train)

# Training scores (class-weighted averages where the metric takes an `average`).
Train_Scores['KNN-jaccard']=jaccard_similarity_score(y_train, y_pred,average='weighted')
Train_Scores['KNN-f1-score']=f1_score(y_train, y_pred, average='weighted')
Train_Scores['KNN-accuracy-score']=accuracy_score(y_train, y_pred)
Train_Scores['KNN-precision-score']=precision_score(y_train, y_pred,average='weighted')
Train_Scores['KNN-recall-score']=recall_score(y_train, y_pred,average='weighted')
print("Train Scores")
print(Train_Scores)

# Hold-out scores.
Test_Scores['KNN-jaccard']=jaccard_similarity_score(y_test, yhat,average='weighted')
Test_Scores['KNN-f1-score']=f1_score(y_test, yhat, average='weighted')
Test_Scores['KNN-accuracy-score']=accuracy_score(y_test, yhat)
Test_Scores['KNN-precision-score']=precision_score(y_test, yhat, average='weighted')
Test_Scores['KNN-recall-score']=recall_score(y_test, yhat, average='weighted')
print("Test Scores")
print(Test_Scores)

# Computed once; rows are the true labels and columns the predicted labels.
cf_matrix=confusion_matrix(y_test, yhat)
cm=cf_matrix  # alias kept because the name `cm` was defined by the earlier version

side_of_cm=cf_matrix.shape[0]

# Each cell is annotated with its count and its share of all hold-out samples.
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v2}\n{v3}" for v2, v3 in
          zip(group_counts,group_percentages)]

labels = np.asarray(labels).reshape(side_of_cm,side_of_cm)

fig=plt.figure(figsize=(20,8))

sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='inferno')

# The heatmap draws matrix rows down the y-axis and columns along the x-axis,
# so the x-axis is the prediction and the y-axis the ground truth.
plt.xlabel("Predicted Values",fontsize=18)
plt.ylabel("True Values",fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title("Confusion Matrix for k-NN classifier for applied t-SNE\nMNIST Handwritten Digit Dataset",fontsize=20)

plt.savefig("Confusion Matrix for k-NN classifier for applied t-SNE MNIST Handwritten Digit Dataset_1.png",dpi=600)


In [None]:
# Refit at the best K on every labelled 3-D embedding, then predict the
# labelled rows (`y_pred`) and the unlabelled rows (`yhat`).
best_k = mean_acc.argmax() + 1
neigh3D = KNeighborsClassifier(n_neighbors=best_k)
neigh3D.fit(X_train3D, y_train3D)
y_pred = neigh3D.predict(X_train3D)
yhat = neigh3D.predict(X_test3D)

In [None]:
# Look at the submission template again before filling in predictions.
sub.head()

In [None]:
# (rows, columns) of the submission template.
sub.shape

In [None]:
# Shape of the prediction array, for comparison with the template's row count above.
yhat.shape

In [None]:
# Distinct predicted label values.
np.unique(yhat)

In [None]:
from copy import deepcopy

# Independent copy of the template whose "Label" column holds the 3-D model's
# predictions as unsigned 8-bit integers; `sub` itself is left untouched.
labels_3d = yhat.astype('uint8')
sub3D = sub.assign(Label=labels_3d)

In [None]:
# Preview the filled-in submission built from the 3-D embedding.
sub3D.head()

In [None]:
# Write the 3-D-embedding submission without the row index column.
sub3D.to_csv('sub3D.csv',index=False)

In [None]:
# Labelled rows: 2-D coordinates and their digit labels.
reduced_df_train = reduced_df.dropna()
X_train2D=reduced_df_train[["tsne-2d-one", "tsne-2d-two"]].values
y_train2D=reduced_df_train["y"].values
print("X_train2D Shape : ", X_train2D.shape , "y_train2D Shape : ", y_train2D.shape)

# Unlabelled rows (everything `dropna` removed): 2-D coordinates; the "y"
# values here are NaN placeholders.
reduced_df_test = reduced_df.loc[~reduced_df.index.isin(reduced_df.dropna().index)]
X_test2D=reduced_df_test [["tsne-2d-one", "tsne-2d-two"]].values
y_test2D=reduced_df_test ["y"].values
# The message now names the 2-D arrays it reports (it used to say "3D").
print("X_test2D Shape : ", X_test2D.shape , "y_test2D Shape : ", y_test2D.shape)

In [None]:
# Hold out 20% of the labelled 2-D embeddings for evaluating k-NN.
# This rebinds X_train / X_test / y_train / y_test from the 3-D run.
from sklearn.model_selection import train_test_split

holdout_parts_2d = train_test_split(X_train2D, y_train2D, test_size=0.2, random_state=univ_seed)
X_train, X_test, y_train, y_test = holdout_parts_2d
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Candidate neighbour counts are 1 .. Ks-1; the result arrays are rebuilt from
# scratch, replacing the ones from the 3-D run.
Ks = 40+1
n_candidates = Ks - 1
mean_acc = np.zeros(n_candidates)         # hold-out accuracy per K
mean_acc_train = np.zeros(n_candidates)   # training accuracy per K
std_acc = np.zeros(n_candidates)          # standard error of hold-out correctness
std_acc_train = np.zeros(n_candidates)    # standard error of training correctness

for slot, k in enumerate(range(1, Ks)):
    # Fit on the training part and predict both parts.
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    y_pred = neigh.predict(X_train)

    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    mean_acc_train[slot] = metrics.accuracy_score(y_train, y_pred)

    # Std of the per-sample "correct?" flags divided by sqrt(sample count).
    std_acc[slot] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
    std_acc_train[slot] = np.std(y_pred == y_train) / np.sqrt(y_pred.shape[0])

print("MEAN ACCURACY")
for k, (holdout_value, train_value) in enumerate(zip(mean_acc, mean_acc_train), start=1):
    test_acc = '{0:.3f}'.format(round(holdout_value, 3))
    train_acc = '{0:.3f}'.format(round(train_value, 3))

    print("K=",f"{k:02d}","  Avg. Test Accuracy=",test_acc,"  Avg. Train Accuracy=",train_acc)

In [None]:

# Position (0-based) and value of K with the highest hold-out accuracy.
best_idx = mean_acc.argmax()
k_best = best_idx + 1

print( "The best test accuracy was", mean_acc.max(), "with k=", k_best)
print( "The corresponding training accuracy obtained was :",mean_acc_train[best_idx])

k_values = range(1, Ks)

plt.figure(figsize=(15,7.5))
# comment out the figure size if you want a smaller figure

# Every artist carries its own label so the legend cannot drift out of sync
# with the drawing order.
plt.plot(k_values, mean_acc_train, 'r', linewidth=5, label='Train_Accuracy ')
plt.plot(k_values, mean_acc, 'g', linewidth=5, label='Test_Accuracy ')

# The shaded bands span one standard error either side of the curve, so the
# legend says "1xstd" (the earlier "3xstd" wording did not match the data drawn).
plt.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc,
                 alpha=0.10, label='+/- 1xstd_test')
plt.fill_between(k_values, mean_acc_train - 1 * std_acc_train, mean_acc_train + 1 * std_acc_train,
                 alpha=0.10, label='+/- 1xstd_train')

# Mark the best hold-out accuracy and the training accuracy at that same K.
plt.scatter(k_best, mean_acc[best_idx], label='BEST_TEST_ACC')
plt.scatter(k_best, mean_acc_train[best_idx], label='CORRESPONDING_TRAIN_ACC')

plt.legend()

# Ticks only at the K values that were actually evaluated (K starts at 1).
plt.xticks(ticks=list(k_values), labels=list(k_values))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.title("Number of Neigbors Chosen vs Mean Training and Testing Accuracy Score",fontsize=20)
plt.tight_layout()

# The "_2D" suffix keeps this figure from overwriting the file that the 3-D
# sweep saves under the unsuffixed name.
plt.savefig("Number of Neigbors Chosen vs Mean Training and Testing Accuracy Score_2D.png",dpi=600)
plt.show()

# Original author's reading of this figure: at small K the model overfits.

In [None]:
# Dictionaries collecting every score of the tuned classifier, keyed by metric name.
Test_Scores={}
Train_Scores={}


# Scoring functions used below.
from sklearn.metrics import  accuracy_score, f1_score, confusion_matrix,precision_score, recall_score

from sklearn.metrics import jaccard_score as jaccard_similarity_score

# Refit at the K with the highest hold-out accuracy: the sweep loop overwrote
# its model on every iteration, so only the last K survived.
best_k=mean_acc.argmax()+1
neigh = KNeighborsClassifier(n_neighbors = best_k).fit(X_train,y_train)
yhat=neigh.predict(X_test)
y_pred=neigh.predict(X_train)

# Training scores (class-weighted averages where the metric takes an `average`).
Train_Scores['KNN-jaccard']=jaccard_similarity_score(y_train, y_pred,average='weighted')
Train_Scores['KNN-f1-score']=f1_score(y_train, y_pred, average='weighted')
Train_Scores['KNN-accuracy-score']=accuracy_score(y_train, y_pred)
Train_Scores['KNN-precision-score']=precision_score(y_train, y_pred,average='weighted')
Train_Scores['KNN-recall-score']=recall_score(y_train, y_pred,average='weighted')
print("Train Scores")
print(Train_Scores)

# Hold-out scores.
Test_Scores['KNN-jaccard']=jaccard_similarity_score(y_test, yhat,average='weighted')
Test_Scores['KNN-f1-score']=f1_score(y_test, yhat, average='weighted')
Test_Scores['KNN-accuracy-score']=accuracy_score(y_test, yhat)
Test_Scores['KNN-precision-score']=precision_score(y_test, yhat, average='weighted')
Test_Scores['KNN-recall-score']=recall_score(y_test, yhat, average='weighted')
print("Test Scores")
print(Test_Scores)

# Computed once; rows are the true labels and columns the predicted labels.
cf_matrix=confusion_matrix(y_test, yhat)
cm=cf_matrix  # alias kept because the name `cm` was defined by the earlier version

side_of_cm=cf_matrix.shape[0]

# Each cell is annotated with its count and its share of all hold-out samples.
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v2}\n{v3}" for v2, v3 in
          zip(group_counts,group_percentages)]

labels = np.asarray(labels).reshape(side_of_cm,side_of_cm)

fig=plt.figure(figsize=(20,8))

sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='inferno')

# The heatmap draws matrix rows down the y-axis and columns along the x-axis,
# so the x-axis is the prediction and the y-axis the ground truth.
plt.xlabel("Predicted Values",fontsize=18)
plt.ylabel("True Values",fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title("Confusion Matrix for k-NN classifier for applied t-SNE\nMNIST Handwritten Digit Dataset",fontsize=20)

# The "_2D" suffix keeps this figure from overwriting the "_1" file that the
# 3-D evaluation saves.
plt.savefig("Confusion Matrix for k-NN classifier for applied t-SNE MNIST Handwritten Digit Dataset_2D.png",dpi=600)


In [None]:
# Refit at the best K on every labelled 2-D embedding, then predict the
# labelled rows (`y_pred`) and the unlabelled rows (`yhat`).
best_k = mean_acc.argmax() + 1
neigh2D = KNeighborsClassifier(n_neighbors=best_k)
neigh2D.fit(X_train2D, y_train2D)
y_pred = neigh2D.predict(X_train2D)
yhat = neigh2D.predict(X_test2D)

In [None]:
from copy import deepcopy

# Independent copy of the template whose "Label" column holds the 2-D model's
# predictions as unsigned 8-bit integers; `sub` itself is left untouched.
labels_2d = yhat.astype('uint8')
sub2D = sub.assign(Label=labels_2d)

In [None]:
# Write the 2-D-embedding submission without the row index column.
sub2D.to_csv('sub2D.csv',index=False)

In [None]:
import pickle

# Persist both fitted classifiers. The `with` blocks close each file as soon
# as the dump finishes; the handles were previously opened inline and never
# closed explicitly.
with open("neigh2D.p", "wb") as model_file:
    pickle.dump(neigh2D, model_file)

with open("neigh3D.p", "wb") as model_file:
    pickle.dump(neigh3D, model_file)

In [None]:
# Write the 3-D-embedding predictions again under the name `submission.csv`.
sub3D.to_csv('submission.csv',index=False)