# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import keras.layers as L
import keras.models as M
import keras.initializers as I
import keras.backend as K
from keras import optimizers
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import matplotlib.pyplot as plt

# Importing the data

In [None]:
# Read the digit-recognizer training CSV into a DataFrame and preview its
# first rows (the bare `data.head()` only displays output in a notebook cell).
data=pd.read_csv('../input/digit-recognizer/train.csv')
data.head()

In [None]:
# Features: every column except 'label', as a NumPy array.
X = data.drop(columns='label').values
# Targets: the 'label' column, as a NumPy array.
y2 = data['label'].values

# Making squash function

In [None]:
# Making the squash function
def squash(vectors, axis=-1):
    """Rescale vectors so their length lies in [0, 1) without changing direction.

    The squared L2 norm is computed along ``axis``; each vector is then
    multiplied by ``norm**2 / (1 + norm**2) / (norm + epsilon)``.

    :param vectors: tensor holding the vectors to squash.
    :param axis: axis along which the norm is taken (default: the last one).
    :return: tensor with the same shape as ``vectors``.
    """
    norm_sq = K.sum(K.square(vectors), axis, keepdims=True)
    # The backend epsilon keeps the division finite for zero-length vectors.
    safe_norm = K.sqrt(norm_sq) + K.epsilon()
    shrink = norm_sq / (1 + norm_sq)
    return (shrink / safe_norm) * vectors
    

# Adding the input layer, convolutional layers and primary capsules

In [None]:
# Images are 28x28 with a single channel; the input layer uses a fixed
# batch size of 100.
img_shape = (28, 28, 1)
inp = L.Input(shape=img_shape, batch_size=100)

# First convolution: 256 filters, 2x2 kernel, ReLU.
conv1 = L.Conv2D(
    filters=256,
    kernel_size=(2, 2),
    padding='valid',
    activation='relu',
)(inp)

# Max-pooling layer with a 1x1 window.
maxpool1 = L.MaxPooling2D(pool_size=(1, 1))(conv1)

# Second convolution: 128 filters, 9x9 kernel, ReLU.
conv2 = L.Conv2D(
    filters=128,
    kernel_size=(9, 9),
    padding='valid',
    activation='relu',
)(maxpool1)

# Primary-capsule convolution: 8*16 filters, 9x9 kernel, stride 2, no
# activation (the squash below plays that role).
conv2 = L.Conv2D(
    filters=8 * 16,
    kernel_size=(9, 9),
    strides=2,
    padding='valid',
    activation=None,
)(conv2)

# Regroup the feature map into 8-dimensional vectors and squash each of them.
reshape2 = L.Reshape([-1, 8])(conv2)
squashed_output = L.Lambda(squash)(reshape2)

# Making Capsule Layer

In [None]:
# Making capsule layer from scratch
class CapsuleLayer(L.Layer):
    """Capsule layer that routes input capsules to output capsules.

    For a 3-D input ``[batch, input_num_capsule, input_dim_capsule]`` the
    layer returns ``[batch, num_capsule, dim_capsule]``.

    :param num_capsule: number of output capsules.
    :param dim_capsule: dimension of each output capsule vector.
    :param routing: number of routing iterations (must be > 0).
    :param kernel_initializer: initializer for the transformation weights.
    """

    def __init__(self, num_capsule, dim_capsule, routing=3,
                 kernel_initializer='glorot_uniform', **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routing = routing
        self.kernel_initializer = kernel_initializer

    def build(self, input_shape):
        """Create the transformation weights from the input capsule shape."""
        assert len(input_shape) >= 3
        self.input_num_capsule = input_shape[1]
        self.input_dim_capsule = input_shape[2]

        # One [dim_capsule, input_dim_capsule] matrix per
        # (output capsule, input capsule) pair.
        self.W = self.add_weight(
            shape=[self.num_capsule, self.input_num_capsule,
                   self.dim_capsule, self.input_dim_capsule],
            initializer=self.kernel_initializer,
            name='w',
        )
        self.built = True

    def call(self, inputs, training=None):
        """Run the routing loop and return the output capsules."""
        # [batch, 1, input_num_capsule, input_dim_capsule, 1]
        input_expand = tf.expand_dims(tf.expand_dims(inputs, 1), -1)
        # [batch, num_capsule, input_num_capsule, input_dim_capsule, 1]
        inputs_tiled = K.tile(input_expand, [1, self.num_capsule, 1, 1, 1])
        # Apply W to every sample, then drop only the trailing unit axis:
        # [batch, num_capsule, input_num_capsule, dim_capsule].
        # An axis-less squeeze would also remove the batch axis when the
        # batch holds a single sample.
        input_hat = tf.squeeze(
            tf.map_fn(lambda x: tf.matmul(self.W, x), elems=inputs_tiled),
            axis=-1,
        )
        # Read the batch size at run time so the layer also works when the
        # static batch dimension is unknown.
        batch_size = tf.shape(inputs)[0]
        # Routing logits: [batch, num_capsule, 1, input_num_capsule].
        b = tf.zeros(shape=[batch_size, self.num_capsule, 1,
                            self.input_num_capsule])
        assert self.routing > 0
        for i in range(self.routing):
            # Coupling coefficients: softmax over the output-capsule axis.
            c = tf.nn.softmax(b, axis=1)
            # [batch, num_capsule, 1, dim_capsule]
            output = squash(tf.matmul(c, input_hat))
            if i < self.routing - 1:
                # Add the agreement between the outputs and the predictions;
                # skipped after the last iteration because b is not used again.
                b += tf.matmul(output, input_hat, transpose_b=True)
        # Drop the unit axis kept for the matmuls: [batch, num_capsule, dim_capsule].
        return tf.squeeze(output, axis=-2)

    def compute_output_shape(self, input_shape):
        return tuple([None, self.num_capsule, self.dim_capsule])

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = {
            'num_capsule': self.num_capsule,
            'dim_capsule': self.dim_capsule,
            # The key must match the __init__ parameter name, otherwise
            # from_config() forwards an unknown keyword to the base Layer.
            'routing': self.routing
        }
        base_config = super(CapsuleLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
        

In [None]:
# Digit capsules: 10 output capsules of 16 dimensions each, computed from the
# squashed primary capsules with 3 routing iterations.
digitcaps = CapsuleLayer(num_capsule=10, dim_capsule=16, routing=3, name='digitcaps')(squashed_output)

# Making length layer which will calculate the length of the vectors

In [None]:
class Length(L.Layer):
    """Layer returning the Euclidean norm taken over the last axis of its input."""

    def call(self, inputs, **kwargs):
        squared = tf.square(inputs)
        return tf.sqrt(tf.reduce_sum(squared, -1))

    def compute_output_shape(self, input_shape):
        # The last axis is reduced away; all the others are kept.
        return input_shape[:-1]

    def get_config(self):
        # No constructor arguments of its own: the base config is complete.
        return super(Length, self).get_config()

In [None]:
# Layer 4: auxiliary layer that replaces each digit capsule with its vector
# length, so the output matches the shape of the true labels.
# (Original author's note: "If using tensorflow, this will not be necessary. :)")
# NOTE(review): the layer name 'capsnet' is presumably what the
# 'val_capsnet_accuracy' monitor in the training cell refers to - confirm
# before renaming.
out_caps = Length(name='capsnet')(digitcaps)

# Making the Masking layer

In [None]:
# Making the masking layer
class Mask(L.Layer):
    """Zero out all capsules but one per sample, then flatten the result.

    Called with ``[capsules, mask]`` the given mask is applied as is (used
    with the one-hot true label). Called with the capsule tensor alone, the
    mask is a one-hot encoding of the capsule with the largest length.
    """

    def call(self, inputs, **kwargs):
        # Accept any list/tuple pair, not only an exact `list` instance.
        if isinstance(inputs, (list, tuple)):
            assert len(inputs) == 2
            inputs, mask = inputs
        else:
            # Capsule lengths, then a one-hot mask selecting the longest one.
            x = tf.sqrt(tf.reduce_sum(tf.square(inputs), -1))
            mask = tf.one_hot(indices=tf.argmax(x, 1), depth=x.shape[1])
        # Broadcast the mask over the capsule dimension and flatten per sample.
        masked = K.batch_flatten(inputs * tf.expand_dims(mask, -1))
        return masked

    def compute_output_shape(self, input_shape):
        # A single shape starts with its batch entry (None or an int);
        # anything else means a pair of shapes, i.e. the true label was
        # provided and the capsule shape comes first.
        first = input_shape[0]
        if first is None or isinstance(first, int):
            caps_shape = input_shape
        else:
            caps_shape = input_shape[0]
        return tuple([None, caps_shape[1] * caps_shape[2]])

    def get_config(self):
        config = super(Mask, self).get_config()
        return config

In [None]:
# Second model input: a 10-element label vector per sample.
y = L.Input(shape=(10,))
# Training path: the true label is used to mask the output of the capsule layer.
masked_by_y = Mask()([digitcaps, y])  # The true label is used to mask the output of capsule layer. For training
# Path without a label: Mask keeps the capsule with the largest length.
masked = Mask()(digitcaps)

# Making the decoder model

In [None]:
# Decoder: maps the flattened, masked digit capsules (16 * 10 values) back to
# a 28x28x1 image through two ReLU layers and a sigmoid output layer.
decoder = M.Sequential(
    [
        L.Dense(512, activation='relu', input_dim=16 * 10),
        L.Dense(1024, activation='relu'),
        L.Dense(np.prod((28, 28, 1)), activation='sigmoid'),
        L.Reshape(target_shape=(28, 28, 1), name='out_recon'),
    ],
    name='decoder',
)

# Making models

In [None]:
# Training model: takes the image and the true label, and outputs the capsule
# lengths plus the reconstruction decoded from the label-masked capsules.
train_model = M.Model(
    inputs=[inp, y],
    outputs=[out_caps, decoder(masked_by_y)],
)
# Evaluation model: takes the image only; the reconstruction is decoded from
# the capsules masked without a label.
eval_model = M.Model(
    inputs=inp,
    outputs=[out_caps, decoder(masked)],
)

# Making the loss function

In [None]:
def margin_loss(y_true, y_pred):
    """
    Margin loss: per-class penalties summed per sample, then averaged.

    Classes flagged in ``y_true`` are penalised by ``max(0, 0.9 - y_pred)**2``
    and all others by ``0.5 * max(0, y_pred - 0.1)**2``. Inputs where a row of
    ``y_true`` holds more than one `1` should work too, but are untested.

    :param y_true: [None, n_classes]
    :param y_pred: [None, num_capsule]
    :return: a scalar loss value.
    """
    present_term = y_true * tf.square(tf.maximum(0., 0.9 - y_pred))
    absent_term = 0.5 * (1 - y_true) * tf.square(tf.maximum(0., y_pred - 0.1))
    per_class = present_term + absent_term

    per_sample = tf.reduce_sum(per_class, 1)
    return tf.reduce_mean(per_sample)

# Final Training Model

In [None]:
# Print the layer-by-layer summary of the training model.
train_model.summary()

# Training the model

In [None]:
X = np.array(X)
y2 = np.array(y2)

# Hold out 10% of the labelled data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train2, y_test2 = train_test_split(
    X, y2, test_size=0.1, random_state=42)

# Divide the pixel values by 255 and reshape each row to a 28x28x1 image.
x_train = X_train.astype('float32') / 255.
x_train = x_train.reshape(-1, 28, 28, 1)
# One-hot encode with an explicit width of 10: the label input and the digit
# capsules are fixed at 10 classes, so the width must not depend on which
# labels happen to land in this split.
y_train = np.array(to_categorical(y_train2.astype('float32'), num_classes=10))

x_test = X_test.astype('float32') / 255.
x_test = x_test.reshape(-1, 28, 28, 1)
y_test = np.array(to_categorical(y_test2.astype('float32'), num_classes=10))

# Flattened (784-value) copies of the scaled images.
x_output = x_train.reshape(-1, 784)
X_valid_output = x_test.reshape(-1, 784)

# Preview the first few held-out images with their labels.
n_samples = 5

plt.figure(figsize=(n_samples * 2, 3))
for index in range(n_samples):
    plt.subplot(1, n_samples, index + 1)
    sample_image = x_test[index].reshape(28, 28)
    plt.imshow(sample_image, cmap="binary")
    plt.title("Label:" + str(y_test2[index]))
    plt.axis("off")

plt.show()

In [None]:
import os

# Compile up front so the model is in the same state whether it is trained
# below or restored from disk. `learning_rate` replaces the deprecated `lr`
# keyword.
train_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=[margin_loss, 'mse'],
    loss_weights=[1., 0.0005],
    metrics=['accuracy'],
)

if not os.path.exists('./Capsule net.h5'):
    m = 100
    epochs = 16
    # Stop when val_capsnet_accuracy has not improved for 2 consecutive
    # epochs, and restore the best weights seen.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_capsnet_accuracy', mode='max',
        patience=2, restore_best_weights=True)

    # Halve the learning rate when val_capsnet_accuracy has not improved for
    # 4 consecutive epochs.
    lr_scheduler = keras.callbacks.ReduceLROnPlateau(
        monitor='val_capsnet_accuracy', mode='max', factor=0.5, patience=4)

    # NOTE(review): `epochs` is set to 16 above but a single epoch is
    # requested here, so neither callback can ever trigger. Pass
    # `epochs=epochs` if the longer run is what was intended.
    train_model.fit(
        [x_train, y_train], [y_train, x_train],
        batch_size=m,
        epochs=1,
        validation_data=([x_test, y_test], [y_test, x_test]),
        callbacks=[early_stopping, lr_scheduler],
    )
else:
    # Restore the weights into the graph built above instead of calling
    # load_model(): load_model() cannot rebuild the custom layers and loss
    # without `custom_objects`, and it would rebind `train_model` to a new
    # graph whose weights `eval_model` does not share.
    train_model.load_weights('./Capsule net.h5')

In [None]:
# Write the training model to 'Capsule net.h5' in the working directory; the
# training cell checks for this file name to decide whether to train.
train_model.save('Capsule net.h5')

In [None]:
# The input layer is declared with a fixed batch size of 100, so predict in
# batches of exactly 100 rather than the default of 32 (4000 samples split
# evenly into 40 batches).
label_predicted, image_predicted = train_model.predict(
    [x_test[:4000], y_test[:4000]], batch_size=100)


In [None]:
n_samples = 5

# First figure: the held-out input images, titled with their true labels.
plt.figure(figsize=(n_samples * 2, 3))
for index in range(n_samples):
    plt.subplot(1, n_samples, index + 1)
    sample_image = x_test[index].reshape(28, 28)
    plt.imshow(sample_image, cmap="binary")
    plt.title(f"Label:{y_test2[index]}")
    plt.axis("off")
plt.show()

# Second figure: the decoder's reconstructions, titled with the index of the
# largest capsule output.
plt.figure(figsize=(n_samples * 2, 3))
for index in range(n_samples):
    plt.subplot(1, n_samples, index + 1)
    sample_image = image_predicted[index].reshape(28, 28)
    plt.imshow(sample_image, cmap="binary")
    plt.title(f"Predicted:{np.argmax(label_predicted[index])}")
    plt.axis("off")
plt.show()

# Making Submission File

In [None]:
# Read the digit-recognizer test CSV and preview its first rows (the bare
# `test_file.head()` only displays output in a notebook cell).
test_file=pd.read_csv('../input/digit-recognizer/test.csv')
test_file.head()

In [None]:
# Take the test table as a plain NumPy array.
test_x = np.array(test_file.values)

In [None]:
# Same preprocessing as the training images: float32, divided by 255, then
# reshaped to 28x28x1.
scaled_pixels = test_x.astype('float32') / 255.
test_x = scaled_pixels.reshape(-1, 28, 28, 1)

In [None]:
# The input layer is declared with a fixed batch size of 100, so predict in
# batches of exactly 100 rather than the default of 32.
# NOTE(review): assumes the number of test rows is a multiple of 100 - confirm.
op = eval_model.predict(test_x, batch_size=100)

In [None]:
# op[0] holds the capsule lengths; the predicted digit is the index of the
# largest one in each row.
predictions = [np.argmax(capsule_lengths) for capsule_lengths in op[0]]

In [None]:
# Build the submission table: 1-based image ids next to the predicted labels,
# written to CSV without the index column.
submission = pd.DataFrame({
    'ImageId': list(range(1, len(predictions) + 1)),
    'Label': predictions,
})
submission.to_csv('Submission.csv', index=False)

# Thank you