# PCA MNIST Database

### Application - Speeding up machine learning algorithms

MNIST database: 70,000 images of handwritten digits, each a 28 × 28 pixel image flattened into 784 feature columns (784 dimensions). It is conventionally divided into a training set of 60,000 examples and a test set of 10,000 examples.

In [1]:
# Download the MNIST dataset from OpenML (cached locally by scikit-learn after the first call)
from sklearn.datasets import fetch_openml

# Pin the dataset version explicitly: without it scikit-learn picks whichever
# version OpenML currently marks as "active", so results may not be reproducible.
mnist = fetch_openml('mnist_784', version=1)

In [12]:
# Hold out 1/7 of the samples for testing; the remaining 6/7 are used for training.
from sklearn.model_selection import train_test_split

train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data,
    mnist.target,
    test_size=1 / 7.0,
    random_state=0,  # fixed seed so the split is the same on every run
)

In [13]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only, so that the test
# set has no influence on the preprocessing.
scaler = StandardScaler().fit(train_img)

# Rescale both splits with the statistics learned from the training set.
train_img, test_img = scaler.transform(train_img), scaler.transform(test_img)

In [14]:
# Baseline: logistic regression on the standardized features, before any PCA
from sklearn.linear_model import LogisticRegression

# `multi_class` is intentionally not passed: it is deprecated since scikit-learn 1.5
# (passing it, even as 'auto', emits a FutureWarning) and 'auto' has been the
# default since 0.22, so the fitted model is unchanged on those versions.
logisticRegr = LogisticRegression(solver='lbfgs', max_iter=100)

logisticRegr.fit(train_img, train_lbl)

# predict() expects a 2-D array, hence the reshape of the single sample to (1, n_features)
logisticRegr.predict(test_img[0].reshape(1, -1))  # predict single observation



array(['0'], dtype=object)

In [15]:
# Mean accuracy of the classifier on the test split; printed label first,
# then the value as the cell's displayed result.
accuracy_before_pca = logisticRegr.score(test_img, test_lbl)
print("Score before PCA: ")
accuracy_before_pca

Score before PCA: 


0.9126

In [16]:
# Dimensionality reduction with PCA
from sklearn.decomposition import PCA

# A fractional n_components asks scikit-learn to keep the smallest number of
# principal components that together retain 95% of the variance.
# The components are learned from the training set only.
pca = PCA(n_components=.95).fit(train_img)

print("Number of principal components chosen by scikit-learn: ", pca.n_components_, sep="\n")

Number of principal components chosen by scikit-learn: 
327


In [17]:
# Project both the training set and the test set onto the principal components
# learned from the training set.
#
# train_img / test_img are overwritten here, so running this cell a second time
# would pass the already-reduced data back into pca.transform() and fail on the
# feature-count mismatch. Only transform while the data still has the
# dimensionality the PCA was fitted on, which makes the cell safe to re-run.
if train_img.shape[1] == pca.components_.shape[1]:
    train_img = pca.transform(train_img)
    test_img = pca.transform(test_img)

In [18]:
# Refit the same logistic regression configuration on the current (PCA-transformed) features
from sklearn.linear_model import LogisticRegression

# `multi_class` is intentionally not passed: it is deprecated since scikit-learn 1.5
# (passing it, even as 'auto', emits a FutureWarning) and 'auto' has been the
# default since 0.22, so the fitted model is unchanged on those versions.
logisticRegr = LogisticRegression(solver='lbfgs', max_iter=100)

logisticRegr.fit(train_img, train_lbl)

# predict() expects a 2-D array, hence the reshape of the single sample to (1, n_features)
logisticRegr.predict(test_img[0].reshape(1, -1))  # predict single observation



array(['0'], dtype=object)

In [20]:
# Mean accuracy of the refitted classifier on the test split; printed label
# first, then the value as the cell's displayed result.
accuracy_after_pca = logisticRegr.score(test_img, test_lbl)
print("Score after PCA: ")
accuracy_after_pca

Score after PCA: 


0.9201