In [None]:
import time
import itertools
import json
import panel as pn
import numpy as np
import pandas as pd
import holoviews as hv
import folium
import seaborn as sns
import matplotlib.pyplot as plt
import datetime, os

from panel.widgets import Tqdm

from folium.plugins import HeatMap, HeatMapWithTime, MarkerCluster

from bokeh import models, plotting, io
from bokeh.layouts import column
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, RangeTool, BoxAnnotation

from holoviews import opts
from holoviews.streams import Pipe, Buffer
from holoviews.operation.timeseries import rolling, rolling_outlier_std

from scipy.linalg import LinAlgError

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.statespace.sarimax import SARIMAX

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.pipeline import Pipeline

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint


import keras_tuner as kt
import tensorflow as tf
from tensorflow import keras



# Load the bokeh and plotly plotting backends for HoloViews.
hv.extension('bokeh', 'plotly')
# Apply the 'dark_minimal' theme to everything rendered through bokeh.
hv.renderer('bokeh').theme= 'dark_minimal'

%matplotlib inline

<p style= "background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:400%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>CNN</b></p>

# Please give me an UPVOTE if you can. Your UPVOTE will be a great encouragement to me!

In [None]:
# Load the labelled training data from the digit-recognizer input directory.
train_set = pd.read_csv('../input/digit-recognizer/train.csv')

In [None]:
# Preview the first two rows of the training frame.
train_set.head(2)

In [None]:
# (rows, columns) of the training frame.
train_set.shape

In [None]:
# Load the test data from the digit-recognizer input directory.
test_set = pd.read_csv('../input/digit-recognizer/test.csv')

In [None]:
# Preview the first two rows of the test frame.
test_set.head(2)

In [None]:
# (rows, columns) of the test frame.
test_set.shape

In [None]:
# info() prints its summary and returns None, so call it as two statements;
# the original tuple expression also echoed a stray "(None, None)" as cell output.
train_set.info()
test_set.info()

In [None]:
# Separate the target column ('label') from the remaining feature columns.
Y_train = train_set['label']
X_train = train_set.drop(columns=['label'])

---
---

In [None]:
# Count how many training rows carry each label value.
y_count = pd.pivot_table(
    Y_train.reset_index(),
    index='label',
    values='index',
    aggfunc='count',
)

In [None]:
# Bar chart of the per-label row counts computed in y_count.
hv.Bars(y_count).opts(width=600, height=500, tools=['hover'], color='Green')

In [None]:
# Take rows 0 and 3 of the feature frame and fold each flat pixel vector into a 28x28 grid.
image_1 = X_train.iloc[0].values.reshape((28, 28))
image_2 = X_train.iloc[3].values.reshape((28, 28))

In [None]:
# Wrap both sample grids as HoloViews images, then show the first one.
img_1, img_2 = (hv.Image(pixels) for pixels in (image_1, image_2))
img_1.opts(width=600, height=500, tools=['hover'])

In [None]:
# Show the second sample image with the same size and hover tool as the first.
img_2.opts(width=600, height=500, tools=['hover'])

In [None]:
# Divide every pixel value by 255.0 (assumes 8-bit intensities, which would map them into [0, 1]).
X_train, test = X_train / 255.0, test_set / 255.0
print(
    "x_train shape : {} ".format(X_train.shape),
    "test shape: {} ".format(test.shape),
    sep="\n",
)

In [None]:
# Reshape the flat pixel rows into (samples, 28, 28, 1) image tensors.
# The test tensor is built from `test` (the copy already divided by 255.0),
# not from the raw `test_set`; reshaping `test_set` would silently discard the
# scaling and feed unnormalised pixels to the model at prediction time.
# np.asarray accepts both a DataFrame and an ndarray, so the cell can be re-run.
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)
test = np.asarray(test).reshape(-1, 28, 28, 1)
print("x_train shape: {}".format(X_train.shape))
print("test shape: {}".format(test.shape))

In [None]:
# One-hot encode the labels into 10 class columns.
# NOTE(review): the cell overwrites Y_train with its own encoding, so run it only once per kernel session.
Y_train = to_categorical(Y_train, num_classes=10)

In [None]:
# Shape of the encoded labels; the second dimension should be 10 (num_classes).
Y_train.shape

In [None]:
# Hold out 10% of the labelled data for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)
# The held-out arrays are X_val / Y_val, so they are reported as "val" rather than "test".
print("x_train shape : {}, dtype : {}".format(X_train.shape, X_train.dtype))
print("x_val shape : {}, dtype : {}".format(X_val.shape, X_val.dtype))
print("y_train shape : {}, dtype : {}".format(Y_train.shape, Y_train.dtype))
print("y_val shape : {}, dtype : {}".format(Y_val.shape, Y_val.dtype))

In [None]:
# Display the training image at index 2; [:, :, 0] selects the first entry along the last axis, leaving a 2D array.
hv.Image(X_train[2][:,:,0])

<a id="1"></a>
### <p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:250%;text-align:center;border-radius:10px 10px;">Keras-Tuner</p>

* Adjust the number of units in the first Dense layer. Just define an integer hyperparameter in hp.Int('units', min_value=32, max_value=512, step=32). Its range is from 32 to 512. The minimum step to walk the interval is 32 if sampling from it.

* There are many other types of hyperparameters. In this function, you can define multiple hyperparameters. In the following code, we tune whether to use the dropout layer with hp.Boolean(), the activation function with hp.Choice(), and the learning rate of the optimizer with hp.Float().

---
* A typical CNN architecture stacks a few convolutional layers (Conv layers), then a pooling layer, then more convolutional layers, then another pooling layer, and so on. As the data moves forward through the network, the image becomes spatially smaller but also deeper (more feature maps) because of the convolutional layers. At the top of the stack sits a regular feedforward neural network made of a few fully connected layers, and the final layer outputs the prediction. If that final layer outputs class probabilities, use a softmax layer.

* The caveat to using Conv layers with large kernels other than the initial Conv layer is that rather than using a Conv layer with a 5×5 kernel, stacking two 3×3 Conv layers will usually give better performance than using a larger kernel. The reason why the first Conv layer is an exception is that by using a 5×5 kernel with stride 2, it is possible to reduce the dimensionality of the space without losing too much information, and it also solves the cost problem.

* Increasing the number of filters as we move towards the output layer. Although the number of low-level features is often small, the number of features created by combining them is large, so it makes sense to do it this way. The most common method is to double the number of filters each time it passes through the pooling layer.

* A fully connected (Dense) layer expects the features of each instance as a 1D array, so the input has to be flattened before the first fully connected layer.
* The subsequent dropout layer is used to mitigate overfitting; whether it is included is decided by the hyperparameter search.
---
* For the first layer in the following function: if the input image is not very large, it is better to use a larger kernel with no stride for the first filter.

### <p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:250%;text-align:center;border-radius:10px 10px;">optimizer</p>

* optimizer='sgd' : To train the model using a simple stochastic gradient descent method. Run a back-propagation algorithm (a combination of reverse-mode automatic differentiation and gradient descent).
    - When using the SGD optimizer, it is important to tune the learning rate. To set it explicitly, pass an optimizer instance instead of the string name (the default is learning_rate=0.01):
        - optimizer=keras.optimizers.SGD(learning_rate=n)
        - Adam, Nadam...
---
* Efficient optimization
    - Streamline the gradient descent part.
    - Keep the automatic differentiation part.
 
* loss='sparse_categorical_crossentropy' is used when the labels are sparse (each instance has a single target class index, here 0 to 9) and the classes are mutually exclusive.
* loss='categorical_crossentropy' is used instead when each instance has one target probability per class, such as a one-hot vector (e.g. [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.] for class 3).

* In the case of binary classification (with one or more binary labels), use "sigmoid" (the logistic function) instead of "softmax" as the activation function for the output layer, together with loss="binary_crossentropy".
-----
- Here we use loss='categorical_crossentropy' because the labels were one-hot encoded with `Y_train = to_categorical(Y_train, num_classes=10)` (imported via `from tensorflow.keras.utils import to_categorical`).

---
* metrics="accuracy" Since this is a classification problem, we measure the percentage of correct answers during training evaluation.

In [None]:
def build_model(hp):
    """Build and compile the CNN whose hyperparameters KerasTuner searches.

    Architecture: two Conv2D(32) layers -> max-pool -> dropout -> two
    Conv2D(64) layers -> max-pool -> dropout -> Flatten -> Dense(256)
    -> optional dropout -> Dense(10, softmax).

    Tuned hyperparameters:
        rate: dropout rate, one of 0.2, 0.25, 0.3, 0.4, 0.5. The same name is
            requested for every Dropout layer, so the layers are expected to
            share one sampled value per trial -- TODO confirm against the
            KerasTuner documentation.
        dropout: whether a Dropout layer follows the Dense(256) layer.
        learning_rate: Adam learning rate, log-sampled between 1e-4 and 1e-2.

    Args:
        hp: hyperparameter container supplied by the tuner
            (for example ``kt.HyperParameters()``).

    Returns:
        A compiled ``keras.Sequential`` model using categorical cross-entropy
        loss and the accuracy metric.
    """
    dropout_rates = [0.2, 0.25, 0.3, 0.4, 0.5]

    model = keras.Sequential()

    # First convolutional stage: a 5x5 kernel followed by a 3x3 kernel, 32 filters each.
    model.add(Conv2D(32, kernel_size=(5, 5),
                     activation="relu", kernel_initializer='he_normal',
                     input_shape=(28, 28, 1)))
    model.add(Conv2D(32, kernel_size=(3, 3),
                     activation='relu', kernel_initializer='he_normal'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(hp.Choice('rate', dropout_rates)))

    # Second convolutional stage: 64 filters, 'same' padding keeps the spatial size.
    model.add(Conv2D(64, kernel_size=(3, 3), padding='same',
                     activation="relu"))
    model.add(Conv2D(64, kernel_size=(3, 3), padding='same',
                     activation="relu"))
    model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Dropout(hp.Choice('rate', dropout_rates)))

    # Classifier head: flatten the feature maps for the dense layers.
    model.add(Flatten())
    model.add(Dense(256, activation="relu"))
    if hp.Boolean("dropout"):
        model.add(Dropout(hp.Choice('rate', dropout_rates)))
    model.add(Dense(10, activation="softmax"))

    learning_rate = hp.Float("learning_rate", min_value=1e-4, max_value=1e-2, sampling="log")
    # `lr` is a deprecated alias that newer Keras releases reject; `learning_rate`
    # is the supported keyword.
    optimizer = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999)
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [None]:
# Smoke test: build the model once with a fresh HyperParameters container to check that it compiles.
build_model(kt.HyperParameters())

In [None]:
# Random search over build_model's hyperparameters, ranked by validation accuracy.
# Only 2 trials are run.
tuner = kt.RandomSearch(
    build_model,
    max_trials=2,
    objective='val_accuracy',
)

In [None]:
# Run the search (2 epochs per trial), then keep the top-ranked model.
tuner.search(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=2,
)
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
# Print the layer-by-layer summary of the model selected by the tuner.
best_model.summary()

In [None]:
# Draw the layer graph of the selected model, annotated with tensor shapes.
keras.utils.plot_model(best_model, show_shapes=True)

In [None]:
# Settings for the final training run: upper bound on epochs and samples per batch.
epochs, batch_size = 25, 250

### <p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">Data expansion</p>

In [None]:
# Augmentation pipeline: random rotation, zoom and shifts only.
datagen = ImageDataGenerator(
    # Geometric augmentation that is switched on.
    rotation_range=12,
    zoom_range=0.18,
    width_shift_range=5,
    height_shift_range=5,
    # Flips are disabled.
    horizontal_flip=False,
    vertical_flip=False,
    # Centering, std-normalisation and whitening are all disabled.
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
)

datagen.fit(X_train)

### <p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">Early Stopping</p>

* Abort training and optionally rollback to the best model if performance on the validation set does not improve by the number of epochs specified by the patience argument.
* Use checkPoint to save model checkpoints (in case of PC crash) and terminate training early when performance fails (time and computational resources).
* Maintain the best weights and restore them at the end of the training (basically, there is no need to save and restore the best model when using together).

In [None]:
# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=0.0,
    restore_best_weights=True,
)

### <p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">Scheduling by performance</p>


- Multiply the learning rate by 0.5 if the validation loss does not improve for 3 consecutive epochs (never going below 0.00001).

In [None]:
# Halve the learning rate after 3 epochs without val_loss improvement, with a floor of 1e-5.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1,
)

* If you use a validation set during training, you can set save_best_only=True when you create a checkpoint. In this case, you can save the model only when its performance against the validation set is the best.
* Using this, you don't have to worry about overtraining the training set forever.
    - If we simply restore the saved model after training, it will be the best performing model for the validation set. This is the model callback that corresponds to early stopping

In [None]:
# Write the model to "keras_model.h5" during training; save_best_only=True keeps only the best-scoring version on disk.
checkpoint = ModelCheckpoint("keras_model.h5", save_best_only=True)

* Monitoring overfitting with a custom callback

In [None]:
class ValTrainRatioCustomCallback(keras.callbacks.Callback):
    """Print the ratio of validation loss to training loss after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Report val_loss / loss for the epoch that just finished.

        Args:
            epoch: index of the finished epoch (unused).
            logs: metric dict for the epoch; may be None or lack "val_loss"
                when training runs without validation data.
        """
        logs = logs or {}
        train_loss = logs.get("loss")
        val_loss = logs.get("val_loss")
        # Nothing meaningful to print if a value is missing or the ratio is undefined.
        if train_loss is None or val_loss is None or train_loss == 0:
            return
        print("\nval/train: {:.2f}".format(val_loss / train_loss))

custom_call = ValTrainRatioCustomCallback()

In [None]:
# Each run logs to its own timestamped sub-directory of "logs".
logdir = os.path.join("logs", f"{datetime.datetime.now():%Y%m%d-%H%M%S}")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

* X_train: input features, Y_train: target class
    - Number of training epochs: Unspecified, default is 1, but 1 is not enough to converge to a good solution, so it must be specified.
    - validation_data=(X_val, Y_val) : By passing the validation set, it measures and displays the loss and other indicators for the validation set.
        - If the performance on the training set is much better than the performance on the validation set, then the model is probably overfitting the training set. Other causes are also possible, such as a bug or a data mismatch between the training and validation sets.
    - You can also use validation_split=n to specify how much of the training set to use as validation data, without passing the validation set in the validation_data argument. 0.1 will use 10% from the end of the training set.

In [None]:
# Train on augmented batches produced by datagen; validate on the hold-out split.
history = best_model.fit(
    datagen.flow(X_train, Y_train, batch_size=batch_size),
    steps_per_epoch=X_train.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(X_val, Y_val),
    callbacks=[
        learning_rate_reduction,
        early_stopping,
        checkpoint,
        custom_call,
        tensorboard_callback,
    ],
)

In [None]:
# Reload the model file written by the ModelCheckpoint callback.
# NOTE(review): model_h5 is not referenced again in the cells visible here; the later cells use best_model.
model_h5 = keras.models.load_model('keras_model.h5')

# <p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">TensorBoard</p>

In [None]:
# Show the TensorBoard instances that the tensorboard notebook helper knows about.
from tensorboard import notebook
notebook.list()
# Optional alternative to the %tensorboard magic: embed an instance inline.
#notebook.display(port=6006, height=800)

In [None]:
%reload_ext tensorboard

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs

In [None]:
# One labelled curve per metric recorded in the training history.
loss, accuracy, val_loss, val_accuracy = (
    hv.Curve(pd.DataFrame(history.history)[metric], label=metric)
    for metric in ('loss', 'accuracy', 'val_loss', 'val_accuracy')
)

In [None]:
# Overlay the training and validation curves per metric and show the two panels side by side.
overlay_0 = (loss * val_loss).relabel('Loss').opts(xlabel='Epochs', ylabel='Loss')
overlay_1 = (accuracy * val_accuracy).relabel('Accuracy').opts(xlabel='Epochs', ylabel='Accuracy')
(overlay_0 + overlay_1).opts(
    opts.Curve(width=500, tools=['hover']),
    opts.Overlay(legend_position='right'),
)

In [None]:
# Validation loss per epoch on its own.
hv.Curve(history.history['val_loss'])

In [None]:
# True class index per validation sample (position of the largest entry in each one-hot row).
Y_true = np.argmax(Y_val, axis=1)

# Model output for the validation set, and the highest-scoring class per sample.
Y_pred = best_model.predict(X_val)
Y_pred_classes = np.argmax(Y_pred, axis=1)

In [None]:
# Compare integer class labels with integer predictions. Passing the one-hot
# Y_val together with Y_pred.astype(int) truncates the predicted probabilities
# to 0, which makes the report meaningless.
print(classification_report(Y_true, Y_pred_classes, zero_division=1))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(Y_true, Y_pred_classes))
fig, ax = plt.subplots(figsize=(10,10))
disp.plot(cmap='Blues', ax=ax);

In [None]:
# evaluate() needs targets to compute loss and accuracy. `test` holds only
# pixel data, so score the model on the labelled validation split instead.
best_model.evaluate(X_val, Y_val)

In [None]:
# Run the model on the test images; the model's last layer is a 10-unit softmax, so each row holds 10 scores.
results = best_model.predict(test)

In [None]:
# Collapse each row to the index of its largest entry, i.e. the predicted class.
# NOTE(review): overwrites `results`, so re-running this cell alone would fail on the 1D array.
results = np.argmax(results, axis=1)

In [None]:
# Fill the sample submission's 'Label' column with the predictions and write it out without the index.
sub = pd.read_csv('../input/digit-recognizer/sample_submission.csv').assign(Label=results)
sub.to_csv('submission.csv', index=False)