In [None]:
import keras

In [None]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np

In [None]:
import random
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout,Flatten, Lambda
from keras.layers import Conv2D, Activation,AveragePooling2D,MaxPooling2D
from keras.optimizers import RMSprop

In [None]:
from keras import backend as K

In [None]:
# Number of digit classes (labels 0-9) used when grouping samples by label.
num_classes=10

In [None]:
def create_pairs(x, digit_indices):
  """Build alternating positive/negative sample pairs for siamese training.

  For every class ``d`` and every position ``i`` two pairs are emitted, in
  this order:

  * a positive pair (label 1): samples ``i`` and ``i + 1`` of class ``d``;
  * a negative pair (label 0): sample ``i`` of class ``d`` and sample ``i``
    of a different, randomly chosen class.

  The number of classes is taken from ``len(digit_indices)`` instead of a
  notebook-level global, so the function works for any label set.

  Args:
    x: Indexable collection of samples (e.g. an array of images).
    digit_indices: Sequence with one entry per class; entry ``d`` holds the
      indices into ``x`` of the samples that belong to class ``d``.

  Returns:
    ``(pairs, labels)`` as NumPy arrays. ``pairs[k]`` holds the two samples
    of pair ``k``; ``labels[k]`` is 1 for a same-class pair and 0 otherwise.

  Raises:
    ValueError: If fewer than two classes are supplied, because no negative
      pair can be formed.
  """
  n_classes = len(digit_indices)
  if n_classes < 2:
    raise ValueError('create_pairs needs at least two classes to build negative pairs')

  # Every class contributes the same number of pairs; the "- 1" leaves room
  # for the i + 1 partner of the last positive pair.
  n = min(len(indices) for indices in digit_indices) - 1

  pairs = []
  labels = []
  for d in range(n_classes):
    for i in range(n):
      anchor = digit_indices[d][i]
      pairs.append([x[anchor], x[digit_indices[d][i + 1]]])
      # An offset in [1, n_classes - 1] guarantees dn != d.
      dn = (d + random.randrange(1, n_classes)) % n_classes
      pairs.append([x[anchor], x[digit_indices[dn][i]]])
      labels += [1, 0]
  return np.array(pairs), np.array(labels)

In [None]:
def euclid_dis(x_y):
  """Return the Euclidean distance between two tensors along axis 1.

  ``x_y`` is a two-element sequence ``(x, y)``. The squared differences are
  summed over axis 1 (keeping that axis), and the sum is clamped from below
  at ``K.epsilon()`` before the square root is taken.
  """
  left, right = x_y
  squared_distance = K.sum(K.square(left - right), axis=1, keepdims=True)
  # Clamp before sqrt so the argument is never exactly zero
  # (presumably to keep the gradient finite -- confirm).
  clamped = K.maximum(squared_distance, K.epsilon())
  return K.sqrt(clamped)

def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: ``(first_dim_of_first_input, 1)``.

    ``shapes`` must hold exactly two shapes; only the leading dimension of
    the first one is used.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

def contrastive_loss(y_true, y_pred):
    """Contrastive loss with a fixed margin of 1.

    Computes ``mean(y_true * y_pred**2 + (1 - y_true) * max(margin - y_pred, 0)**2)``,
    where ``y_pred`` is the model output and ``y_true`` the pair label.
    """
    margin = 1
    # Term weighted by y_true: penalises a large y_pred.
    similar_term = y_true * K.square(y_pred)
    # Term weighted by (1 - y_true): penalises y_pred falling below the margin.
    dissimilar_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(similar_term + dissimilar_term)
  
def regularize_cross_entropy(p, q, ets=1e-15):
    """Cross-entropy ``-sum_i p[i] * log(q[i] + ets)`` of two sequences.

    Args:
        p: Sequence of target values.
        q: Sequence of predicted values, same length as ``p``.
        ets: Small constant added to every ``q[i]`` so the logarithm is never
            evaluated at exactly zero.

    Returns:
        The cross-entropy as a scalar.

    Raises:
        ValueError: If ``p`` and ``q`` have different lengths.
    """
    if len(p) != len(q):
        raise ValueError('p and q must have the same length')
    # The original called a bare `log`, which is not defined anywhere in the
    # notebook; use NumPy's natural logarithm (numpy is imported as np).
    return -sum(p_i * np.log(q_i + ets) for p_i, q_i in zip(p, q))

In [None]:
def compute_accuracy(y_true, y_pred):
    """Fraction of entries where the thresholded prediction equals the label.

    ``y_pred`` is flattened and each value below 0.5 is treated as ``True``;
    the result is compared element-wise with ``y_true`` and averaged.
    """
    flat_predictions = y_pred.ravel()
    below_threshold = flat_predictions < 0.5
    hits = below_threshold == y_true
    return np.mean(hits)


def accuracy(y_true, y_pred):
    """Keras metric version of ``compute_accuracy``, using a fixed 0.5 threshold.

    ``y_pred < 0.5`` is cast to the dtype of ``y_true`` and compared with it;
    the mean of the matches is returned.
    """
    below_threshold = K.cast(y_pred < 0.5, y_true.dtype)
    return K.mean(K.equal(y_true, below_threshold))

# Load the MNIST train/test splits shipped with Keras.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Add a trailing channel axis (N, 28, 28, 1), convert to float32 and divide by 255
# (presumably mapping 8-bit pixel values into [0, 1]).
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1).astype('float32') / 255
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1).astype('float32') / 255

# Per-sample shape (everything after the batch axis), fed to Input(...) later.
input_shape = x_train.shape[1:]

In [None]:
def siamese_model(input_shape):
  """Build the embedding network that both siamese branches share.

  Architecture: Conv(32, relu) -> AvgPool -> Conv(64, tanh) -> MaxPool ->
  Dropout(0.25) -> Flatten -> Dense(128, tanh) -> Dropout(0.5) ->
  Dense(64, tanh) -> Dropout(0.5) -> Dense(10, tanh).

  Args:
    input_shape: Shape of one input sample, without the batch axis.

  Returns:
    The Keras ``Model`` mapping an input sample to a 10-dimensional
    embedding. The model summary is printed as a side effect.
  """
  inputs = Input(shape=input_shape)

  # Layers are instantiated in the order they are applied.
  stack = [
    Conv2D(32, (3, 3), activation='relu'),
    AveragePooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='tanh'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='tanh'),
    Dropout(0.5),
    Dense(64, activation='tanh'),
    Dropout(0.5),
    Dense(10, activation='tanh'),
  ]

  embedding = inputs
  for layer in stack:
    embedding = layer(embedding)

  model = Model(inputs, embedding)
  model.summary()
  return model


In [None]:
# Group training-sample indices by label: digit_indices[i] holds the positions where y_train == i.
digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
# NOTE(review): y_train is rebound here from the per-image labels to the per-pair labels, so
# re-running this cell without reloading the data would group indices by pair label instead.
pairs_train, y_train = create_pairs(x_train, digit_indices)

# Same grouping and pairing for the test split (digit_indices and y_test are rebound as well).
digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
pairs_test, y_test = create_pairs(x_test, digit_indices)

# Build the embedding network once; siamese_model also prints its summary.
base_network = siamese_model(input_shape)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 26, 26, 32)        320       
_________________________________________________________________
average_pooling2d (AveragePo (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 5, 5, 64)          0         
_________________________________________________________________
dropout (Dropout)            (None, 5, 5, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 1600)              0     

In [None]:
# One input tensor per element of a pair.
input_a, input_b = Input(shape=input_shape), Input(shape=input_shape)

# Both inputs go through the same base_network instance, so the two branches share weights.
processed_a, processed_b = base_network(input_a), base_network(input_b)

# The model output is the Euclidean distance between the two embeddings.
distance = Lambda(euclid_dis, output_shape=eucl_dist_output_shape)(
    [processed_a, processed_b]
)

In [None]:
# Cast the pair labels to float32 (presumably so they match the float model output in the loss).
y_test, y_train = (
    np.array(pair_labels, dtype='float32') for pair_labels in (y_test, y_train)
)

In [None]:
def fit_model(optim, n_epochs, loss_func):
    """Train a freshly initialised siamese model and return it.

    The original version wrapped the notebook-level ``input_a``/``input_b``/
    ``distance`` tensors, which all route through the single global
    ``base_network``. Every call therefore continued training the weights left
    behind by the previous call, so the loss-function and optimizer
    comparisons were not independent. This version clones ``base_network``
    (same architecture, newly initialised weights) and wires a new siamese
    graph around the clone on every call.

    Args:
        optim: Optimizer name or instance passed to ``Model.compile``.
        n_epochs: Number of training epochs.
        loss_func: Loss callable passed to ``Model.compile``.

    Returns:
        The trained Keras ``Model`` taking ``[pair_left, pair_right]`` and
        producing the distance between the two embeddings.
    """
    # clone_model creates new layers (and thus new weights) with the same architecture.
    branch = keras.models.clone_model(base_network)

    in_a = Input(shape=input_shape)
    in_b = Input(shape=input_shape)
    # Both inputs go through the same `branch`, so the two sides share weights.
    dist = Lambda(euclid_dis, output_shape=eucl_dist_output_shape)(
        [branch(in_a), branch(in_b)]
    )

    model = Model([in_a, in_b], dist)
    model.compile(loss=loss_func, optimizer=optim, metrics=[accuracy])
    model.fit(
        [pairs_train[:, 0], pairs_train[:, 1]],
        y_train,
        batch_size=128,
        epochs=n_epochs,
        validation_data=([pairs_test[:, 0], pairs_test[:, 1]], y_test),
    )
    return model

In [None]:
def predict(model):
    """Print train and test pair accuracy for ``model`` and return the test accuracy.

    Uses the notebook-level ``pairs_train``/``y_train`` and
    ``pairs_test``/``y_test`` arrays and the 0.5 threshold applied by
    ``compute_accuracy``.
    """
    def pair_accuracy(pairs, labels):
        # Feed the two halves of each pair to the two model inputs.
        distances = model.predict([pairs[:, 0], pairs[:, 1]])
        return compute_accuracy(labels, distances)

    tr_acc = pair_accuracy(pairs_train, y_train)
    te_acc = pair_accuracy(pairs_test, y_test)

    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
    return te_acc

**Trying different loss functions**

In [None]:
# LOSS FUNCTION : "regularized_cross_entropy"
# NOTE(review): the object assigned below is Keras' stock BinaryCrossentropy (no extra
# regularisation term is added here), and it is applied to the model's distance output.
opt = 'adam'
n_epochs = 2
regularized_cross_entropy = keras.losses.BinaryCrossentropy()
siam_model = fit_model(opt, n_epochs, regularized_cross_entropy)
# NOTE(review): the result is stored in test_accuracy_adm, the same name the later
# contrastive-loss/Adam cells assign, so it is overwritten when they run.
test_accuracy_adm = predict(siam_model)

Epoch 1/2
Epoch 2/2
* Accuracy on training set: 50.00%
* Accuracy on test set: 50.00%


In [None]:
# LOSS FUNCTION : "contrastive_loss"
# Train for 2 epochs with Adam and the contrastive loss, then print train/test pair accuracy.
opt = 'adam'
n_epochs = 2
siam_model = fit_model(opt, n_epochs, contrastive_loss)
# predict() returns the test accuracy as a fraction.
test_accuracy_adm = predict(siam_model)

Epoch 1/2
Epoch 2/2
* Accuracy on training set: 97.29%
* Accuracy on test set: 96.77%


**Trying different optimizers**

In [None]:
# optimizer : ADAM
# Same call as the contrastive-loss cell above; test_accuracy_adm is reassigned with this run's value.
opt = 'adam'
n_epochs = 2
siam_model = fit_model(opt, n_epochs, contrastive_loss)
test_accuracy_adm = predict(siam_model)

Epoch 1/2
Epoch 2/2
* Accuracy on training set: 94.72%
* Accuracy on test set: 94.45%


In [None]:
# optimizer : RMSprop
# The optimizer is passed by name, so Keras' default RMSprop settings are used.
opt = 'RMSprop'
n_epochs = 2
siam_model = fit_model(opt, n_epochs, contrastive_loss)
test_accuracy_rmsprop = predict(siam_model)

Epoch 1/2
Epoch 2/2
* Accuracy on training set: 95.96%
* Accuracy on test set: 95.75%


In [None]:
# optimizer : Mini-Batch Gradient Descent
# The optimizer is passed by name, so Keras' default SGD settings are used;
# fit_model fixes the mini-batch size at 128.
opt = 'SGD' # batch-size is given in "fit" method
n_epochs = 2
siam_model = fit_model(opt, n_epochs, contrastive_loss)
test_accuracy_mbgd = predict(siam_model)

Epoch 1/2
Epoch 2/2
* Accuracy on training set: 96.54%
* Accuracy on test set: 96.14%


In [None]:
import pandas as pd
# Collect the test accuracies (fractions returned by predict) of the three optimizer runs in one table.
data = [['ADAM', test_accuracy_adm], ['RMSProp', test_accuracy_rmsprop], ['Mini-Batch Gradient Descent', test_accuracy_mbgd]]
df = pd.DataFrame(data, columns = ['Optimizer', 'Accuracy'])
# print dataframe.
print(df)

                     Optimizer  Accuracy
0                         ADAM  0.944501
1                      RMSProp  0.957464
2  Mini-Batch Gradient Descent  0.961448


**REASON :**

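In this run the table above shows **"Mini-Batch Gradient Descent" (96.14%) and "RMSProp" (95.75%) scoring slightly higher on the test set than "Adam" (94.45%)**, so these results do not show Adam performing better; with only 2 epochs per optimizer the differences are small and may not be significant. Adam (adaptive moment estimation) is nevertheless a common default because, like AdaGrad and RMSProp, it adapts the step size of each parameter using running averages of past gradients.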
Adam is a replacement optimization algorithm for stochastic gradient descent for training deep learning models.
Adam is relatively easy to configure where the default configuration parameters do well on most problems.

**Hyper Parameter Optimization** 

---
Trying to improve the accuracy of "Mini-Batch Gradient Descent" model by :


*   Increasing epochs
*   Using learning rate
*   Using decay rate
*   Using momentum



In [None]:
# optimizer : Mini-Batch Gradient Descent
# Explicit SGD configuration: learning rate, momentum and a decay derived from the epoch count.
epochs=2
learning_rate = 0.02
decay_rate = learning_rate / epochs
momentum = 0.8

# NOTE(review): `lr` and `decay` are legacy SGD argument names, and in legacy Keras `decay`
# is applied per update step rather than per epoch -- confirm against the installed version.
opt = keras.optimizers.SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
siam_model = fit_model(opt, epochs, contrastive_loss)
# Overwrites the test_accuracy_mbgd value from the default-SGD run.
test_accuracy_mbgd = predict(siam_model)

Epoch 1/2
Epoch 2/2
* Accuracy on training set: 88.39%
* Accuracy on test set: 88.44%


In [None]:
# optimizer : Mini-Batch Gradient Descent
# Longer run: 10 epochs, a higher learning rate and a smaller decay (learning_rate / (1.5 * epochs)).
epochs=10
learning_rate = 0.04
decay_rate = learning_rate / (1.5*epochs)
momentum = 0.8

# NOTE(review): `lr` and `decay` are legacy SGD argument names -- confirm they are accepted
# by the installed Keras version.
opt = keras.optimizers.SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
siam_model = fit_model(opt, epochs, contrastive_loss)
test_accuracy_mbgd = predict(siam_model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
* Accuracy on training set: 98.22%
* Accuracy on test set: 97.49%


# ***Improved Accuracy***
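**We observe that the test accuracy of the "Mini-Batch Gradient Descent" model increased from 96.14% (default SGD, 2 epochs) to 97.49% (tuned learning rate, decay and momentum, 10 epochs).**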

**Pros and Cons of Siamese Networks**

PROS :

*More robust to class imbalance*: With the aid of one-shot learning, a few images per class are sufficient for a Siamese Network to recognize those classes in the future.

*Works well in an ensemble with a strong classifier*: Because its learning mechanism differs from ordinary classification, simply averaging its output with a classifier's can do much better than averaging two correlated supervised models (e.g. GBM and RF classifiers).

*Learning from Semantic Similarity*: Siamese focuses on learning embeddings (in the deeper layer) that place the same classes/concepts close together. Hence, can learn semantic similarity.

**CONS:**

*Needs more training time than normal networks:* Since a Siamese Network learns from pairs, and the number of possible pairs grows quadratically with the number of samples (if all available information is to be seen), it is slower than normal classification-style (pointwise) learning.

*Doesn’t output probabilities:* Since training involves pairwise learning, the network does not output class probabilities for a prediction, only the distance between the two inputs (and hence the distance to examples of each class).