# Principal Component Analysis on MNIST Data

Principal component analysis (PCA) is the process of computing the principal components and using them to perform a change of basis on the data, sometimes using only the first few principal components and ignoring the rest.

In [None]:
# Load the MNIST training CSV and separate the labels from the pixel data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

   
d0 = pd.read_csv('../input/digit-recognizer/train.csv')

# Preview the first five rows of the raw frame (5 is the default for head()).
print(d0.head())

# Split the raw frame in two:
#   l -> the 'label' column
#   d -> every remaining column (the pixel data)
l = d0['label']
d = d0.drop(columns="label")

In [None]:
# Sanity-check the split: pixel frame first, then the label series.
for frame in (d, l):
    print(frame.shape)

In [None]:
# Render one training sample as an image and print its label.
idx = 1
plt.figure(figsize=(7, 7))

# A row of `d` is a flat pixel vector; fold it back into a 28x28 grid for display.
pixel_row = d.iloc[idx].to_numpy()
grid_data = pixel_row.reshape(28, 28)
plt.imshow(grid_data, cmap="gray", interpolation="none")
plt.show()

print(l[idx])

#  2D Visualization using PCA 

In [None]:
#Work on the first 15k data-points only, to keep the run time short

sample_size = 15000
labels = l.head(sample_size)
data = d.head(sample_size)

print("The shape of sample data = ", data.shape)

In [None]:
#Data-preprocessing: standardize every column of the sample with StandardScaler

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)
print(standardized_data.shape)

In [None]:
#Find the co-variance matrix of the standardized data: (A^T * A) / (n - 1)

sample_data = standardized_data

# A^T * A on its own is the (unnormalized) scatter matrix. Dividing by
# (n - 1) turns it into the sample covariance matrix of the centred data.
# The scaling does not change the eigen-vectors, only the eigen-values.
n_samples = sample_data.shape[0]
covar_matrix = np.matmul(sample_data.T, sample_data) / (n_samples - 1)

print("The shape of covariance matrix = ", covar_matrix.shape)

In [None]:
#Finding the top two eigen-values and corresponding eigen-vectors
#for projecting onto a 2-Dim space

from scipy.linalg import eigh

# eigh returns eigen-values in ascending order, so the two largest ones sit at
# the last two indices of the (d x d) matrix. `subset_by_index` is the
# replacement for the deprecated `eigvals` keyword, which newer SciPy
# releases no longer accept.
n_features = covar_matrix.shape[0]
values, vectors = eigh(covar_matrix, subset_by_index=[n_features - 2, n_features - 1])
print("Shape of eigen vectors = ", vectors.shape)

#Converting the eigen vectors into (2,d) shape for ease of further computation.
#The rows are reversed so that row 0 belongs to the LARGEST eigen-value
#(1st principal component) and row 1 to the second largest; `values` is
#reordered the same way so both stay aligned.
vectors = vectors.T[::-1]
values = values[::-1]

print("Updated shape of eigen vectors = ", vectors.shape)

In [None]:
#Projecting the original data sample onto the plane
#spanned by the two selected eigen vectors (a matrix-matrix product).
import matplotlib.pyplot as plt
samples_as_columns = sample_data.T
new_coordinates = vectors @ samples_as_columns

print (" resultant new data points' shape ", vectors.shape, "X", samples_as_columns.shape," = ", new_coordinates.shape)

In [None]:
import pandas as pd

#Stack the labels under the two projected rows, then transpose so that
#every data point becomes one row of (coordinate, coordinate, label).
labelled_rows = np.vstack((new_coordinates, labels))
new_coordinates = labelled_rows.T

#Wrap the labelled points in a DataFrame for plotting.
column_names = ("1st_principal", "2nd_principal", "label")
dataframe = pd.DataFrame(data=new_coordinates, columns=column_names)
print(dataframe.head())

In [None]:
#Plotting the 2d datapoints with seaborn, one colour per digit label
import seaborn as sns

# `height` is the current name of FacetGrid's former `size` keyword;
# recent seaborn releases reject `size`.
sns.FacetGrid(dataframe, hue='label', height=6).map(plt.scatter, '1st_principal', '2nd_principal').add_legend()
plt.show()

# PCA using Scikit-Learn

In [None]:
from sklearn import decomposition

# Create the PCA estimator without fixing the number of components yet;
# n_components is assigned on the instance before each fit.
pca = decomposition.PCA(n_components=None)

In [None]:
#Configure the estimator to keep 2 components, then fit and project in one step

n_kept = 2
pca.n_components = n_kept
pca_data = pca.fit_transform(sample_data)

#pca_data now holds the 2-d projection of the sample data
print("Shape of PCA reduced = ", pca_data.shape)

In [None]:
#Append the labels as a third column next to the two PCA coordinates.
pca_data = np.vstack((pca_data.T, labels)).T

#creating a new dataFrame for plotting the labeled points.
pca_df = pd.DataFrame(data= pca_data, columns=("1st_principal", "2nd_principal", "label"))

# `height` is the current name of FacetGrid's former `size` keyword;
# recent seaborn releases reject `size`.
sns.FacetGrid(pca_df, hue='label', height=6).map(plt.scatter, '1st_principal', '2nd_principal').add_legend()
plt.show()

# PCA for dimensionality Reduction: Variance Explained by dimensions

In [None]:
# Refit PCA keeping every component that can be computed, i.e.
# min(n_samples, n_features), instead of hard-coding the feature count.
pca.n_components = min(sample_data.shape)
pca_data = pca.fit_transform(sample_data)

# Fraction of the total variance carried by each component, and its running sum.
percentage_var_explained = pca.explained_variance_ / np.sum(pca.explained_variance_)

cum_var_explained = np.cumsum(percentage_var_explained)

# cum_var_explained[k] is the variance explained by the first k + 1 components,
# so the x-axis must start at 1 for the 'n_components' label to be accurate.
component_counts = np.arange(1, len(cum_var_explained) + 1)

#Plot PCA spectrum

plt.figure(1, figsize=(6,4))
plt.clf()
plt.plot(component_counts, cum_var_explained, linewidth = 2)
plt.axis("tight")
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_variance')
plt.show()

In [None]:
#End of Notebook