## Low-dimensional approximation by PCA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Read the training CSV into a DataFrame and print its (rows, columns) shape.
# head() is left as the cell's last expression so the notebook renders the
# first rows as a table.
train = pd.read_csv('../input/digit-recognizer/train.csv')
print(train.shape)
train.head()

In [None]:
# Split the frame by column position: column 0 becomes the target vector
# (presumably the digit label -- it is later compared against a label value),
# and every remaining column becomes the feature matrix.
y_train = train.iloc[:, 0].to_numpy()
X_train = train.iloc[:, 1:].to_numpy()
print(X_train.shape, y_train.shape)

In [None]:
# Ask for as many components as there are feature columns so nothing is
# discarded at fit time; truncation to fewer components is done later, by
# slicing, when images are reconstructed.
pca = PCA(n_components=X_train.shape[1])
# Rebind the result of fit() rather than calling it bare, so the notebook
# cell does not echo the estimator.
pca = pca.fit(X_train)

In [None]:
# Principal axes of the fitted PCA, one component per row. The plotting and
# reconstruction cells below index into this array by row.
comps = pca.components_
print(comps.shape)

In [None]:
# Show the first 100 principal components as 28x28 images on a 10x10 grid.
fig, ax = plt.subplots(10, 10, figsize=(16, 16))

# ax.flat walks the grid row by row, so the running index k is the same
# row-major position (row * 10 + column) used to pick the component.
for k, panel in enumerate(ax.flat):
    panel.imshow(comps[k].reshape(28, 28), cmap='bwr')
    panel.tick_params(left=False, labelleft=False, bottom=False, labelbottom=False)
    panel.set_title(f"PCA{k + 1}")

plt.show()

In [None]:
# Running total of the explained-variance ratios, plotted against the
# zero-based component index.
# The trailing semicolon stops the notebook from echoing plot()'s return value.
plt.plot(pca.explained_variance_ratio_.cumsum());

In [None]:
# Column-wise mean of the training matrix: one average per pixel column.
# draw_digit adds this vector back onto each reconstruction.
m = X_train.mean(axis=0)
print(m.shape)
# Peek at a few entries; left as the last expression so the notebook shows it.
m[10:20]

In [None]:
# Project every training row onto the fitted principal components.
# Row r of X_train2 holds the component coefficients for row r of X_train;
# draw_digit slices the leading coefficients to build partial reconstructions.
X_train2 = pca.transform(X_train)
print(X_train2.shape)

In [None]:
def draw_digit(label, n):
    """Plot one training digit next to its truncated-PCA reconstructions.

    Draws a single row of 13 panels: the original image followed by 12
    reconstructions built from the first 1, 6, 11, ..., 56 principal
    components.

    Reads the notebook-level arrays ``X_train``, ``y_train``, ``X_train2``,
    ``comps`` and ``m``.

    Args:
        label: Value compared against ``y_train`` to select the digit class.
        n: Position of the wanted sample among the rows whose label matches.

    Raises:
        IndexError: If ``n`` is out of range for the matching rows.
    """
    # Resolve the sample's row index once. The label mask never changes inside
    # the loop, and indexing a single row avoids copying every matching row of
    # X_train / X_train2 just to pick one of them.
    row = np.flatnonzero(y_train == label)[n]
    original = X_train[row]
    scores = X_train2[row]

    fig, ax = plt.subplots(1, 13, figsize=(16, 8))

    ax[0].imshow(original.reshape(28, 28), cmap='Greys')
    ax[0].tick_params(left=False, labelleft=False, bottom=False, labelbottom=False)
    ax[0].set_title("orig")

    for i in range(12):
        j = i * 5 + 1
        # Weighted sum of the first j components, then add back the per-pixel
        # mean `m` to return to pixel space.
        approx = np.dot(scores[:j], comps[:j, :]) + m

        ax[i + 1].imshow(approx.reshape(28, 28), cmap='Greys')
        ax[i + 1].tick_params(left=False, labelleft=False, bottom=False, labelbottom=False)
        ax[i + 1].set_title(f"{j} approx")

In [None]:
# One figure per label value 0-9, using the first matching sample (n=0).
for digit in range(10):
    draw_digit(label=digit, n=0)

In [None]:
# Same sweep over label values 0-9, this time with the second matching
# sample (n=1).
for digit in range(10):
    draw_digit(label=digit, n=1)