In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and echo its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import matplotlib.pyplot as plt
from keras import backend as k 
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import os, seaborn as sns, pandas as pd, numpy as np

# Reading Data

In [None]:
# Load both MNIST CSV exports; each frame carries a 'label' column next to the
# feature columns (reshaped into 28x28 images further down).
X_raw_train = pd.read_csv('../input/mnist-in-csv/mnist_train.csv')
X_raw_test = pd.read_csv('../input/mnist-in-csv/mnist_test.csv')

# Split each raw frame into its target column and the remaining feature columns.
# The raw frames themselves are left untouched.
y_train, x_train = X_raw_train['label'], X_raw_train.drop(columns='label')
y_test, x_test = X_raw_test['label'], X_raw_test.drop(columns='label')


### Changing Dim and scaling

In [None]:
img_rows, img_cols = 28, 28

# Always start from the raw DataFrames so this cell can be re-run safely.
# Previously x_train/x_test were re-bound from DataFrame to ndarray in place,
# so a second execution failed on `.to_numpy()`.
train_pixels = X_raw_train.drop('label', axis=1).to_numpy()
test_pixels = X_raw_test.drop('label', axis=1).to_numpy()

# The channel axis goes first or last depending on the Keras backend setting.
if k.image_data_format() == 'channels_first':
    inpx = (1, img_rows, img_cols)
else:
    inpx = (img_rows, img_cols, 1)

# One single-channel image per row; -1 keeps each split's own sample count.
# Dividing by 255 assumes 8-bit pixel intensities (0-255) -- TODO confirm.
x_train = train_pixels.reshape((-1,) + inpx).astype('float32') / 255
x_test = test_pixels.reshape((-1,) + inpx).astype('float32') / 255


In [None]:
# Collapse every per-image axis into one feature vector (one row per image);
# these flat matrices are what the PCA cells consume.
x_train_flat = x_train.reshape(x_train.shape[0], -1)
x_test_flat = x_test.reshape(x_test.shape[0], -1)

for flat in (x_train_flat, x_test_flat):
    print(flat.shape)

# Baseline CNN

In [None]:
num_classes = 10

# One-hot encode the digit labels for the categorical cross-entropy loss.
# y_train is rebuilt from the raw frame's label column (not from itself) so that
# re-running this cell does not one-hot encode an already-encoded array.
y_train = keras.utils.to_categorical(X_raw_train['label'], num_classes)
# y_test keeps the integer labels (used by the reports below); the encoded copy
# gets its own name.
y_test_C = keras.utils.to_categorical(y_test, num_classes)

In [None]:
# Baseline CNN on the 28x28 single-channel images.
# Spatial sizes in the comments below are for that 28x28 input.
model_1 = Sequential([
    # 5x5 convolution, stride 2, 32 filters, 'same' padding: 28x28 -> 14x14x32
    Conv2D(32, (5, 5), strides=(2, 2), padding='same',
           input_shape=x_train.shape[1:]),
    Activation('relu'),

    # Second 5x5 convolution, stride 2, default ('valid') padding: 14x14 -> 5x5x32
    Conv2D(32, (5, 5), strides=(2, 2)),
    Activation('relu'),

    # 2x2 max pooling: 5x5 -> 2x2x32
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Flatten turns 2x2x32 into a 128-element vector for the dense head
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),

    # One softmax output per digit class
    Dense(num_classes),
    Activation('softmax'),
])

model_1.summary()

In [None]:
batch_size = 1000

# RMSprop optimizer. `learning_rate` replaces the deprecated `lr` keyword.
# NOTE(review): newer Keras releases may also reject `decay` -- confirm against
# the installed Keras version.
opt = keras.optimizers.RMSprop(learning_rate=0.0005, decay=1e-6)

# One-hot targets + softmax output -> categorical cross-entropy.
model_1.compile(loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])

# NOTE: the test split is passed as validation data, so the per-epoch
# validation metrics are computed on the same images used for the final report.
model_1.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=20,
            validation_data=(x_test, y_test_C),
            shuffle=True)

### Classification report

In [None]:
# Softmax scores per test image, one column per class.
y_pred_baseline = model_1.predict(x_test)

# Predicted digit = index of the largest score in each row. Vectorised so it
# follows the actual number of test rows instead of a hard-coded 10000.
y_pred = y_pred_baseline.argmax(axis=1)

print(classification_report(y_test, y_pred))

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# sklearn's confusion_matrix puts the true labels on rows and the predictions
# on columns; seaborn draws rows along the y-axis. The axis labels therefore
# have to be y = ground truth, x = prediction (they were swapped before).
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d',
            annot_kws={"size": 10, "weight": "bold"}, ax=ax)
ax.set_ylabel('Ground Truth', fontsize=15)
ax.set_xlabel('Prediction', fontsize=15);

# Using PCA

### Helper function

In [None]:
from sklearn.decomposition import PCA

def mnist_pca(x_data, n_components):
    """Fit a PCA on ``x_data`` and project the same data onto its components.

    Prints the summed explained-variance ratio of the fitted components,
    rounded to two decimals.

    Args:
        x_data: data matrix handed to ``PCA.fit`` / ``PCA.transform``.
        n_components: forwarded to ``PCA(n_components=...)``.

    Returns:
        A ``(fitted_pca, transformed_x_data)`` tuple.
    """
    model = PCA(n_components=n_components).fit(x_data)

    explained = round(sum(model.explained_variance_ratio_), 2)
    print("Variance explained with {0} components:".format(n_components),
          explained)

    projected = model.transform(x_data)
    return model, projected

In [None]:
# Keep every component (one per flattened input feature) so the full
# explained-variance spectrum can be inspected. The count is read from the data
# rather than hard-coded as 784.
pca_full, mnist_data_full = mnist_pca(x_train_flat, x_train_flat.shape[1])

### Curvature of variance explained

In [None]:
# Running total of the per-component explained-variance ratios.
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_title("Proportion of PCA variance\nexplained by number of components")
ax.set_xlabel("Number of components")
ax.set_ylabel("Proportion of variance explained");

### Selecting the number of components and getting the data

In [None]:
# 324 components: the projection is later laid out as an 18x18 grid (18*18 = 324).
pca_324, mnist_data_324 = mnist_pca(x_train_flat, 324)

In [None]:
# Reuse the 324-component PCA already fitted by mnist_pca(...) instead of
# fitting a second model on the same data: this avoids a redundant, expensive
# fit and keeps train and test projections tied to one fitted model.
pca = pca_324
fit_pca = pca_324

# mnist_data_324 is pca_324.transform(x_train_flat), as returned by mnist_pca.
X_train_P = mnist_data_324
# The test split is only transformed (never fitted on).
X_test_P = pca_324.transform(x_test_flat)
X_test_P.shape

In [None]:
# since 18*18 is 324
img_rows, img_cols = 18, 18

# The channel axis goes first or last depending on the Keras backend setting.
if k.image_data_format() == 'channels_first':
    inpx = (1, img_rows, img_cols)
else:
    inpx = (img_rows, img_cols, 1)

# Lay the 324 PCA components of each sample out as a single-channel 18x18 grid.
# -1 lets each array keep its own row count instead of borrowing it from the
# unrelated x_train / x_test image arrays.
x_train_P = X_train_P.reshape((-1,) + inpx)
x_test_P = X_test_P.reshape((-1,) + inpx)

In [None]:
# Same architecture as the baseline, fed with the 18x18 single-channel PCA grids.
# Spatial sizes in the comments below are for that 18x18 input.
model_2 = Sequential([
    # 5x5 convolution, stride 2, 32 filters, 'same' padding: 18x18 -> 9x9x32
    Conv2D(32, (5, 5), strides=(2, 2), padding='same',
           input_shape=x_train_P.shape[1:]),
    Activation('relu'),

    # Second 5x5 convolution, stride 2, default ('valid') padding: 9x9 -> 3x3x32
    Conv2D(32, (5, 5), strides=(2, 2)),
    Activation('relu'),

    # 2x2 max pooling: 3x3 -> 1x1x32
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Flatten turns 1x1x32 into a 32-element vector for the dense head
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),

    # One softmax output per digit class
    Dense(num_classes),
    Activation('softmax'),
])

model_2.summary()

In [None]:
batch_size = 1000

# RMSprop optimizer. `learning_rate` replaces the deprecated `lr` keyword.
# NOTE(review): newer Keras releases may also reject `decay` -- confirm against
# the installed Keras version.
opt = keras.optimizers.RMSprop(learning_rate=0.0005, decay=1e-6)

# One-hot targets + softmax output -> categorical cross-entropy.
model_2.compile(loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])

# NOTE: the (PCA-projected) test split is passed as validation data, so the
# per-epoch validation metrics are computed on the same samples used for the
# final report.
model_2.fit(x_train_P, y_train,
            batch_size=batch_size,
            epochs=20,
            validation_data=(x_test_P, y_test_C),
            shuffle=True)

In [None]:
# Scores of the PCA-based model on the projected test set.
# NOTE: this rebinds y_pred_baseline, which previously held model_1's predictions;
# the following cell reads model_2's output through this name.
y_pred_baseline = model_2.predict(x_test_P)

In [None]:
# y_pred_baseline holds model_2's scores at this point (the name is reused).
# Predicted digit = index of the largest score in each row. Vectorised so it
# follows the actual number of rows instead of a hard-coded 10000.
y_pred_2 = y_pred_baseline.argmax(axis=1)

In [None]:
# Classification report for the PCA-based model: integer test labels vs. y_pred_2.
print(classification_report(y_test, y_pred_2))

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# sklearn's confusion_matrix puts the true labels on rows and the predictions
# on columns; seaborn draws rows along the y-axis. The axis labels therefore
# have to be y = ground truth, x = prediction (they were swapped before).
sns.heatmap(confusion_matrix(y_test, y_pred_2), annot=True, fmt='d',
            annot_kws={"size": 10, "weight": "bold"}, ax=ax)
ax.set_ylabel('Ground Truth', fontsize=15)
ax.set_xlabel('Prediction', fontsize=15);

# Some conclusions

We saw that by applying a dimensionality reduction technique we could **cut the training
time of a CNN by about 50%**. In a more complex problem, a 50% time reduction could mean
saving hours or even days. We also saw that the reduced model had lower performance. In this
case, we had a **trade-off between losing 4% accuracy and reducing training time by 50%**.
Depending on the problem, this trade-off might or might not be worth it. Obviously, in this case,
where we save only around 30 seconds, it is better to use the full model. But a 4% performance reduction
in exchange for saving hours and a lot of energy in training might be a really advantageous
trade-off in many cases. **We saw that PCA performed really well at reducing
dimensionality without losing too much information.**