In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Business Problem:
- To use a classification model to correctly classify the handwritten digits in the MNIST dataset.
- We will use **SVM for classification** and **PCA,t-SNE for visualization and interpretability**.

### Importing required packages:

In [None]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, metrics, svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt, matplotlib.image as mpimg
from sklearn import svm

from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


### Loading train data into Dataframe:

In [None]:
# Read the Kaggle digit-recognizer training CSV into a DataFrame.
mnist_train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [None]:
# Show the first five rows of mnist_train as a quick check of what was loaded.

mnist_train.head()

In [None]:
# (rows, columns) of the training frame.
mnist_train.shape

# PCA implementations from first principles:

In [None]:
# Keep the 'label' column (the target of every row) in the variable l.
l = mnist_train['label']
# Shown as the cell output: a 1-tuple holding the number of labels.
l.shape

In [None]:
# Add up the per-class counts; the total is the number of labelled rows.
l.value_counts().to_numpy().sum()

In [None]:
# Everything except the label column is pixel data; keep it in d.
d = mnist_train.drop(columns="label")

In [None]:
# Render one sample as an image and print its label.
idx = 100
# Each row holds 784 pixel values; fold them back into the 28x28 image grid.
grid_data = d.iloc[idx].to_numpy().reshape(28, 28)
plt.figure(figsize=(5, 5))
plt.imshow(grid_data, cmap="gray")
plt.show()
print(l[idx])

In [None]:
# 2-D visualisation
# Work on the first 15K samples only, to keep the run time manageable.

n_samples = 15000
labels = l.iloc[:n_samples]
data = d.iloc[:n_samples]
print("the shape of sample data = ", data.shape)



In [None]:
# Data pre-processing: standardize every pixel column with StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)
print(standardized_data.shape)

In [None]:
# Build A^T * A from the standardized samples; it is used below as the
# covariance matrix (note that it is not divided by the number of samples here).
sample_data = standardized_data
# The @ operator performs the matrix multiplication.
covar_matrix = sample_data.T @ sample_data
print ( "The shape of variance matrix = ", covar_matrix.shape)


In [None]:
# Find the two largest eigenvalues and their eigenvectors so the data can be
# projected onto a 2-D space.
from scipy.linalg import eigh
# eigh returns the eigenvalues in ascending order, so the two largest ones are
# the last two indices (782 and 783 for the 784x784 matrix used here).
# `subset_by_index` replaces the `eigvals` keyword, which was deprecated in
# SciPy 1.5 and is rejected by recent releases.
n_features = covar_matrix.shape[0]
values, vectors = eigh(covar_matrix, subset_by_index=[n_features - 2, n_features - 1])
print("Shape of eigen vectors = ",vectors.shape)
print(values)


In [None]:
# eigh returned the eigenvectors as columns, in ascending eigenvalue order:
#   vectors[:, 0] -> eigenvector of the 2nd largest eigenvalue
#   vectors[:, 1] -> eigenvector of the largest eigenvalue
# Transpose to one eigenvector per row, giving a (2, 784) matrix that is
# easier to multiply with the data.
vector = vectors.transpose()
print("Updated shape of eigen vectors = ",vector.shape)
# At this point vector[0] belongs to the 2nd largest eigenvalue and vector[1]
# to the largest one.

In [None]:
# Order the rows so that row 0 is the eigenvector of the largest eigenvalue
# and row 1 the eigenvector of the second largest.
# The rows are rebuilt from `vectors` instead of being swapped in place:
# `vector` is a transposed view of `vectors`, so an in-place swap also rewrites
# `vectors` and undoes itself whenever this cell is run a second time.
vector = vectors.T[::-1].copy()

In [None]:
# Project the samples onto the two principal directions: the matrix holding
# one eigenvector per row is multiplied with the transposed sample matrix,
# which yields one column of 2-D coordinates per sample.
import matplotlib.pyplot as plt
samples_t = sample_data.T
new_coordinates = vector @ samples_t
print (" resultant new data points' shape ", vector.shape, "X", samples_t.shape, " = ", new_coordinates.shape)


In [None]:
# Add the labels as a third row, then transpose so that every sample becomes
# one row of (1st component, 2nd component, label).
# Note: this rebinds new_coordinates, so the cell cannot be run twice in a row.
with_labels = np.vstack((new_coordinates, labels))
new_coordinates = with_labels.T
# Wrap the result in a DataFrame so the points can be plotted by label.
dataframe = pd.DataFrame(data=new_coordinates, columns=("1st_principal", "2nd_principal", "label"))
print(dataframe.head())

In [None]:
# Scatter plot of the 2-D projection of the 784-dimensional data, coloured by
# digit label.
import seaborn as sn
# `height` is the current name of FacetGrid's former `size` argument, which
# newer seaborn releases no longer accept.
sn.FacetGrid(dataframe, hue="label", height=7).map(plt.scatter, '1st_principal', '2nd_principal').add_legend()
plt.show()

# PCA using Scikit-Learn


In [None]:
# Create the scikit-learn PCA estimator with default settings; the number of
# components is assigned on the estimator afterwards.
from sklearn import decomposition
pca = decomposition.PCA()

In [None]:
# Configure the estimator to keep two components,
# then fit it on the standardized samples and project them in one step.
pca.n_components = 2
pca_data = pca.fit_transform(sample_data)

In [None]:
# pca_data holds the 2-D projection of the sample data
print("shape of pca_reduced.shape = ", pca_data.shape)


In [None]:
# Append the label of every sample as a third column of the 2-D projection.
# Note: this rebinds pca_data, so the cell cannot be run twice in a row.
labelled_rows = np.vstack((pca_data.T, labels))
pca_data = labelled_rows.T

In [None]:
# Put the labelled projection into a DataFrame and plot it, one colour per label.
pca_df = pd.DataFrame(data=pca_data, columns=("1st_principal", "2nd_principal", "labels"))
# `height` is the current name of FacetGrid's former `size` argument, which
# newer seaborn releases no longer accept. The legend is added so the colours
# can be matched to digit labels, as in the other scatter plots.
sn.FacetGrid(pca_df, hue="labels", height=6).map(plt.scatter, '1st_principal', '2nd_principal').add_legend()
plt.show()

# Observation and Inferences from PCA: 
- Note : This is 2d representation of 784 dimensions data.
- Imagine we want to build a classifier on the 784-dimensional data. What insights does this plot give us?
    - The dark blue points (label 0.0) are grouped very nicely on the extreme right side.
    - Likewise, the points with label 7 are grouped on the extreme left side.
    - Thus we can easily build some sort of clusters/classification planes to classify all similar points.
- Since 2 dimensions is only an approximation, we are losing a lot of information. Still, it gives some understanding that, probably, in the high-dimensional space (784) which we can't even imagine/visualize, all our points with label 0.0 are close to each other and the 9's are grouped together, i.e. the images are grouped together according to their class labels.

# PCA for dimensionality reduction (not for visualization)


In [None]:
# PCA for dimensionality reduction (not visualisation): keep all 784
# components and measure the share of the total variance each one carries.
pca.n_components = 784
pca_data = pca.fit_transform(sample_data)
component_variance = pca.explained_variance_
percentage_var_explained = component_variance / component_variance.sum()
# Running total of the shares, i.e. variance explained by the first k components.
cum_var_explained = percentage_var_explained.cumsum()

In [None]:
# Cumulative explained variance against the number of components kept.
plt.figure(1, figsize=(6, 4))
plt.clf()
plt.plot(cum_var_explained, linewidth=2)
plt.axis('tight')
# Axis labels first, grid afterwards; the two are independent of each other.
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_variance')
plt.grid()
plt.show()

# Observation and Inferences from Variance explained by PCA: 
- As we can see, all 784 dimensions together explain 100% of the variance in the data.
- If we take the first 100 dimensions, they explain ~75% of the variance.
- We used 2 dimensions for the scatter plot. It's clearly visible that with 2-d we are able to explain 20% of the variance. We are still losing 80%, but we have some way to visualize (**Half Bread is better than No Bread :p**)  
- But we can conclude that out of 784 features, only ~400 can explain almost all of the variance, and around 300 can explain approx 90% of it, which is great!!!

# t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
# t-SNE
from sklearn.manifold import TSNE
# t-SNE takes a long time on all 15K points, so only the first 1000 samples
# (and their labels) are used.
n_tsne = 1000
data_1000 = standardized_data[:n_tsne]
labels_1000 = labels.iloc[:n_tsne]


In [None]:
model = TSNE(n_components=2, random_state=0)
# The model above embeds into 2 components with a fixed random_state.
# Fit it on the 1000-sample subset and keep the resulting embedding.
tsne_data = model.fit_transform(data_1000)

In [None]:
# Attach the labels as a third column and wrap everything in a DataFrame for
# plotting.
# Note: this rebinds tsne_data, so the cell cannot be run twice in a row.
labelled_embedding = np.vstack((tsne_data.T, labels_1000))
tsne_data = labelled_embedding.T
tsne_df = pd.DataFrame(data=tsne_data, columns=("Dim_1", "Dim_2", "label"))

In [None]:
# Scatter plot of the t-SNE embedding, one colour per digit label.
# `height` is the current name of FacetGrid's former `size` argument, which
# newer seaborn releases no longer accept.
sn.FacetGrid(tsne_df, hue="label", height=6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
plt.show()

# Observation and Inferences from t-SNE:
- Better visualization than PCA
- More clarity in terms of interpretation, e.g. 4 and 9 are generally written similarly, and hence we can see that the points with label 4 (light blue) and 9 (purple) are very closely grouped.
- Now we have understood the data well, we can proceed with model building....

# Model Building with SVM

In [None]:
# Only the first 5K rows are used for modelling, to keep training time low.
full_images = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
subset = full_images.iloc[:5000]
# Column 0 is kept as a one-column frame of labels; the remaining columns are the pixels.
images = subset.iloc[:, 1:]
labels = subset.iloc[:, :1]
# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    images, labels, train_size=0.8, random_state=0
)

In [None]:
# Display one sample of the training split together with its label.
plt.figure(figsize=(5,5))
idx = 100
grid_data = X_train.iloc[idx].to_numpy().reshape(28,28)  # reshape from 1d to 2d pixel array
plt.imshow(grid_data,  cmap = "gray")
plt.show()
# train_test_split shuffled the rows, so position idx of X_train is not row idx
# of the original frame; take the label from the same position of y_train.
print(y_train.to_numpy().ravel()[idx])

### Observations:
- Note that these images aren't actually black and white (0,1). They are gray-scale (0-255).

In [None]:
# Histogram of the pixel values of training sample idx.

plt.hist(X_train.iloc[idx])

### Observations:
- Since this is a grayscale image, we are getting a range of values from 0-255.

# Model Training:
- We will use the `svm` module's classifier, SVC, to build a support vector classifier.

# Model -1 : Linear SVM Model

In [None]:
# Linear SVC: choose the regularisation strength C with a 3-fold
# cross-validated grid search scored on accuracy.

from sklearn.model_selection import GridSearchCV

params = {
    'C': [1e-4, 0.001, 0.01, 0.1, 1, 10]
}
svc = SVC(kernel='linear')
clf = GridSearchCV(svc, params, scoring="accuracy", cv=3)

# y_train is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn expects, as is done for the RBF models.
clf.fit(X_train, y_train.values.ravel())

In [None]:
# One line per grid-search candidate: its parameters, mean test score and rank.
res = clf.cv_results_

for candidate, mean_score, rank in zip(res["params"], res["mean_test_score"], res["rank_test_score"]):
    print(f"Parameters:{candidate} Mean_score: {mean_score} Rank: {rank}")

- As we have seen, changing the hyperparameter C has no effect on the score, and hence we will go with the default C=1.

In [None]:
# Linear-kernel SVC with the default C, trained on the training split.
clf = svm.SVC(kernel='linear')
# y_train is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn expects.
clf.fit(X_train, y_train.values.ravel())  # learning / fitting from train data
clf.score(X_test,y_test) # Evaluating score on test data

### Observations:
- With Linear kernel, our accuracy is 0.91

# Model -2 : RBF SVM

In [None]:
# RBF is SVC's default kernel; it is spelled out here to make the choice visible.

clf_1 = SVC(kernel='rbf')
train_targets = y_train.to_numpy().ravel()  # 1-D array of training labels
clf_1.fit(X_train, train_targets)  # learn from the training split
clf_1.score(X_test,y_test)  # score on the held-out split

### Observations:
- With RBF kernel, our accuracy increased to 0.942

# Model -3 : RBF SVM with binary cmap
- Converting grayscale to binary with a simple logic:
- To check if we have any improvement on our result.

In [None]:
# Take copies of both splits so they can be modified without touching
# X_train / X_test.
X_test_copy = X_test.copy()
X_train_copy = X_train.copy()

In [None]:
# Binarise the images: every non-zero pixel becomes 1, zeros stay 0.
X_test_copy[X_test_copy>0]=1
X_train_copy[X_train_copy>0]=1

# Display one binarised training sample together with its label.
plt.figure(figsize=(5,5))
idx = 100
grid_data = X_train_copy.iloc[idx].to_numpy().reshape(28,28)  # reshape from 1d to 2d pixel array
plt.imshow(grid_data,  cmap = "binary")
plt.show()
# X_train_copy keeps the shuffled row order produced by train_test_split, so
# the matching label is at the same position of y_train, not at l[idx].
print(y_train.to_numpy().ravel()[idx])

In [None]:
# Histogram of the pixel values of training sample idx in the modified copy.

plt.hist(X_train_copy.iloc[idx])

### Observations:
- Since this is a binary image, we are getting values 0 or 1 for our train dataset.

In [None]:
# Retrain an RBF-kernel SVC, this time on the copied splits.

clf_2 = SVC(kernel='rbf')
clf_2.fit(X_train_copy, y_train.to_numpy().ravel())
clf_2.score(X_test_copy,y_test)

### Observations:
- We are not getting any improvement in the scores, and hence we will go with our second model (clf_1).

# Predictions on test data:

In [None]:
# Read the Kaggle digit-recognizer test CSV into a DataFrame.
test_data=pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Predict a label for every row of the test frame with the RBF model clf_1.
final_predictions=clf_1.predict(test_data)

In [None]:
# Display the array of predicted labels.
final_predictions

In [None]:
# Build the submission table: a single 'Label' column, indexed by a 1-based
# 'ImageId', and write it to final_predictions.csv.
df = pd.DataFrame({'Label': final_predictions})
df.index = df.index + 1
df.index.name = 'ImageId'
df.to_csv('final_predictions.csv', header=True)

# Please Upvote if you like the Notebook Content !