In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Kaggle boilerplate: print the full path of every file found under the input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the MNIST training split; the bare name on the last line renders a preview.
df_train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mnist-in-csv/mnist_train.csv",
)
df_train

In [None]:
# Load the MNIST test split; the bare name on the last line renders a preview.
df_test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mnist-in-csv/mnist_test.csv",
)
df_test

In [None]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

In [None]:
# Concise summary of the training frame (index range, dtypes, memory usage).
df_train.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
df_train.describe()

In [None]:
# Count of missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# Column labels of the test frame (for comparison with the training frame).
df_test.columns

In [None]:
# Separate the digit label (target) from the remaining columns (features).
Y_train = df_train['label']
X_train = df_train.drop(columns=['label'])

In [None]:
# Separate the digit label (target) from the remaining columns (features).
Y_test = df_test['label']
X_test = df_test.drop(columns=['label'])

In [None]:
# X_train

In [None]:
# Render training row 9000 as a 28x28 image.
digit_pixels = X_train.iloc[9000].to_numpy()
plt.imshow(digit_pixels.reshape(28, 28))

In [None]:
# Digit label stored under index label 9000 (the image displayed above).
Y_train.loc[9000]

# Here we see that each image has 784 features (28×28 pixels); such high dimensionality increases training time and can sometimes hurt performance.

# PCA Compression-->

In [None]:
#we will keep track of the time taken to perform each transformation and training.
import time

# Let's compress our data using PCA so that 95% of the variance is preserved and only 5% is lost.

In [None]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
# random_state matches the PCA steps used in the pipelines further down.
pca = PCA(n_components=0.95, random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
X_red = pca.fit_transform(X_train)
end = time.perf_counter()

# Elapsed seconds for fitting PCA and projecting the training set.
end - start

# So PCA took 6 seconds to compress the data..

In [None]:
# Shape of the original (uncompressed) training features.
X_train.shape

In [None]:
# Shape of the PCA-compressed training features.
X_red.shape

In [None]:
# Number of components PCA actually kept to reach the requested 95% variance.
pca.n_components_

# So only 154 principal components (instead of the original 784 features) are enough to preserve 95% of the variance.
# This means that MNIST is very sparse and most of its information lies in a much lower-dimensional space.

In [None]:
# Original training row 9000 again, as a reference for the compressed view.
digit_pixels = X_train.iloc[9000].to_numpy()
plt.imshow(digit_pixels.reshape(28, 28))

In [None]:
# Show the compressed representation of the SAME sample (row 9000) that is
# displayed everywhere else, instead of an unrelated row.
# The values are principal-component coordinates, not pixels; they are folded
# into 7 rows purely for display (the width is inferred rather than hard-coded;
# this still requires the component count to be divisible by 7).
plt.imshow(X_red[9000].reshape(7, -1))

# Nothing recognizable is visible: the compressed values are principal-component coordinates, not pixel intensities.

# Now let's reverse the compression using PCA's inverse transformation. It maps the dataset back to 784 features, but the information lost (5%) during compression is never recovered.

In [None]:
# Map the compressed training data back into the original feature space.
X_return=pca.inverse_transform(X_red)

In [None]:
# Shape of the reconstructed data.
X_return.shape

# See our dimension is back with 784 features..

In [None]:
# Render reconstructed row 9000 as a 28x28 image.
restored_digit = X_return[9000]
plt.imshow(restored_digit.reshape(28, 28))

# The reconstructed image is noticeably blurrier, reflecting the information discarded during compression.

In [None]:
# Original digit (left) next to its PCA reconstruction (right), both for row 9000.
fig, (ax1, ax2) = plt.subplots(1, 2)
original_img = X_train.iloc[9000].to_numpy().reshape(28, 28)
restored_img = X_return[9000].reshape(28, 28)
for panel, img in ((ax1, original_img), (ax2, restored_img)):
    panel.imshow(img)
    panel.axis('off')
fig.suptitle('Compression and Decompression')
plt.show()

# So even after losing 5% of the variance, the image still looks acceptable.
# And we have significantly reduced the number of dimensions, from 784 to 154, at the cost of a 5% loss in information.

# We have seen that PCA reduces the size of the data with little visible harm.
# Now let's check: does the reduced number of dimensions help in faster training?

In [None]:
# First trying for Logistic Regression

from sklearn.linear_model import LogisticRegression

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# (softmax) model for multiclass targets by default.
log_clf = LogisticRegression(solver='lbfgs', random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
log_clf.fit(X_train, Y_train)
t_end = time.perf_counter()

In [None]:
# Elapsed seconds for the timed fit in the previous cell.
t_end-t_start

In [None]:
# Score the classifier on the full-resolution test set.
log_clf.score(X_test, Y_test)

# Now lets check for our reduced set..

In [None]:
# Same model as before, this time fitted on the PCA-compressed features.
# LogisticRegression is already imported in the earlier logistic-regression cell,
# so the import is not repeated here.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# (softmax) model for multiclass targets by default.
log_clf = LogisticRegression(solver='lbfgs', random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
log_clf.fit(X_red, Y_train)
t_end = time.perf_counter()

In [None]:
# Elapsed seconds for the timed fit in the previous cell.
t_end-t_start

In [None]:
# Project the test set with the PCA already fitted on the training data
# (transform only, no refit), then score the classifier trained on the
# compressed features.
X_red_test = pca.transform(X_test)
log_clf.score(X_red_test, Y_test)

In [None]:
# Shape of the PCA-compressed test features.
X_red_test.shape

# Here we see that training was about 3 times faster than for the previous model, and the accuracy is roughly similar.

# Lets try for a Random Forest Model-->

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline trained on the original, uncompressed features.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
rfc.fit(X_train, Y_train)
t_end = time.perf_counter()

In [None]:
# Elapsed seconds for the timed fit in the previous cell.
t_end-t_start

In [None]:
# Score the random forest on the full-resolution test set.
rfc.score(X_test, Y_test)

# Now lets check for our reduced set..

In [None]:
# Same random forest configuration, this time trained on the PCA-compressed features.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
rfc.fit(X_red, Y_train)
t_end = time.perf_counter()

In [None]:
# Elapsed seconds for the timed fit in the previous cell.
t_end-t_start

In [None]:
# Score the random forest on the PCA-compressed test set.
rfc.score(X_red_test, Y_test)

# That's bad! Training took more than twice as long as on the original dataset, and there is a drop in performance as well. So PCA didn't really help in this case.

# FINDINGS-->

# 1. We saw that PCA didn't help the Random Forest; it slowed down training and even worsened performance.
# 2. In the case of Logistic Regression, PCA helped: training was nearly 3 times faster with similar performance.

# CONCLUSION-->

# 1. There you have it: PCA helps, but not always.
# 2. In fact, dimensionality reduction does not always lead to faster training; it depends on the dataset, the model and the training algorithm used.

# Now trying some Non-Linear Dimensionality Reduction--> 

# T-SNE-->
Since TSNE scales very poorly with large datasets, we will not use the full data but rather a sample of just 10000 instances.

In [None]:
# Draw a reproducible 10000-row subset for the manifold methods.
# X_train is deliberately NOT modified here: writing the label into it would
# leak the target into the features if any earlier training cell were re-run.
X = X_train.sample(n=10000, random_state=42)

# Pick the labels for exactly the sampled rows via their index labels.
Y = Y_train.loc[X.index]

In [None]:
# Shape of the sampled feature subset.
X.shape

In [None]:
# Shape of the matching label subset.
Y.shape

In [None]:
# We will use TSNE to reduce the dataset down to 2 dimensions and then plot it using Matplotlib
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
X_reduced = tsne.fit_transform(X)
t_end = time.perf_counter()

In [None]:
# Elapsed seconds for the timed TSNE fit in the previous cell.
t_end-t_start

# So it took around four minutes to compress the data to 2 dimensions with 10000 samples.

# Now plotting with Matplotlib-->

In [None]:
# Scatter of the 2-D TSNE embedding, one point per sampled image, coloured by digit label.
fig, ax = plt.subplots(figsize=(12, 8))
points = ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=Y, cmap='jet')
fig.colorbar(points, ax=ax)
ax.axis('off')
plt.show()


That looks quite nice! We can see clear separation of the clusters, while only a couple of them seem to overlap, like 3s & 5s and 9s & 4s.

# We can now try PCA+TSNE-->

In [None]:
from sklearn.pipeline import Pipeline

# Chain PCA (keep 95% of the variance) in front of TSNE so that TSNE works on
# far fewer input dimensions.
pca_tsne = Pipeline([
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('tsne', TSNE(n_components=2, random_state=42)),
])

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
X_new = pca_tsne.fit_transform(X)
t_end = time.perf_counter()
print(t_end-t_start)

# 2-D embedding coloured by digit label.
plt.figure(figsize=(12, 8))
plt.scatter(X_new[:,0], X_new[:,1], c = Y, cmap='jet')
plt.colorbar()
plt.axis('off')
plt.show()


WOW! Did you notice what happened? The result is very similar to that of using TSNE alone, but the time is reduced to about half of the original time (TSNE alone).

Well, we already saw earlier that PCA is a much faster compressor than the others when it comes to large datasets, while algorithms like TSNE create far better clusters than PCA. So it makes sense to combine PCA (to quickly get rid of useless dimensions) with TSNE (a slower algorithm that then reduces the lighter data to 2 dimensions and forms good clusters). This can significantly reduce the time.

# Now trying some more methods:-->

# LLE: Locally Linear Embedding:-->

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
X_lle = LocallyLinearEmbedding(n_components=2, random_state=42).fit_transform(X)
t_end = time.perf_counter()
print(t_end-t_start)

# 2-D embedding coloured by digit label.
plt.figure(figsize=(12, 8))
plt.scatter(X_lle[:,0], X_lle[:,1], c = Y, cmap='jet')
plt.colorbar()
plt.axis('off')
plt.show()

It took a while, and the visualization is not at all appealing.

Let's now chain this with PCA.

# PCA+LLE:-->

In [None]:
# Chain PCA (keep 95% of the variance) in front of LLE so that LLE works on
# far fewer input dimensions.
pca_lle = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("lle", LocallyLinearEmbedding(n_components=2, random_state=42)),
])

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
X_new1= pca_lle.fit_transform(X)
t_end = time.perf_counter()
print(t_end-t_start)

# 2-D embedding coloured by digit label.
plt.figure(figsize=(12, 8))
plt.scatter(X_new1[:,0], X_new1[:,1], c = Y, cmap='jet')
plt.colorbar()
plt.axis('off')
plt.show()

Well, while the results were the same, the time was considerably reduced. That's what we had expected!

Let's try a last one! LDA

# Linear Discriminant Analysis(LDA):-->

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is supervised: unlike the reductions above, it is fitted with the labels Y.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_start = time.perf_counter()
X_lda = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, Y)
t_end = time.perf_counter()
print(t_end-t_start)

# 2-D projection coloured by digit label.
plt.figure(figsize=(12, 8))
plt.scatter(X_lda[:,0], X_lda[:,1], c = Y, cmap='jet')
plt.colorbar()
plt.axis('off')
plt.show()

# Wow! This was much faster! The clusters, however, are only fair, not good.

# And I guess we have a clear winner here: yes, that's TSNE! It was faster when chained with PCA, and its results were clearly better than the others.

# FINDINGS:-->

# 1. TSNE outperformed the other algorithms at making clear clusters.
# 2. PCA helped the other algorithms perform the reduction faster.
# 3. PCA scales better than the other algorithms but is not as good at creating clusters.
# 4. Manifold-based algorithms scale very poorly with larger datasets and are therefore very slow.

# CONCLUSIONS:-->

# 1. Manifold-based reduction methods scale very poorly with larger datasets and are therefore very slow.
# 2. Chaining PCA with manifold-based reduction methods can help them run faster.