In [1]:
# POPULAR METHOD TO GENERATE WORD EMBEDDINGS
# WORD2VEC MAPS WORDS TO A VECTOR SPACE OF GIVEN DIMENSION
# CALCULATIONS CAN BE PERFORMED ON THE VECTORS TO FIND RELATIONSHIPS BETWEEN THE WORDS
# CBOW - CONTINUOUS BAG OF WORDS ( TARGET WORD PREDICTED FROM CONTEXT)
#        TAKES A SENTENCE, AND USES A WINDOW SIZE FOR ITERATING OVER THE SENTENCE
#        THEN WE TRY TO PREDICT THE CENTRE WORD FROM THE CONTEXT WORDS
#        WE HAVE TO CHOOSE SIZE OF EACH VECTOR AS WELL
#        HIDDEN LAYER IS ALSO OF SAME SIZE
#        WE PASS THE ONE HOT VECTOR OF THE CONTEXT WORDS AND TRY TO PREDICT THE CENTRE WORD
#        THIS IS DONE USING NEURAL NETWORK AND ERROR BACKPROPAGATION
#        THE CONTEXT WORDS VECTORS SHARE THE PREDICTED WORD VECTOR AND HIDDEN LAYER
#        THE PROBABILITIES OF THE PREDICTED WORD ARE COMPARED WITH THE VECTOR OF THE TARGET WORD
#        THEN THE WEIGHTS ARE UPDATED. THE WINDOW SLIDES AND THE PROCESS CONTINUES
#        ONCE DONE, THESE UPDATED WEIGHTS BECOME OUR SET OF VECTORS
# SKIP GRAM - (CONTEXT WORDS PREDICTED FROM TARGET WORD)
#            WE CHOOSE WINDOW SIZE AND GIVE CENTRE WORD AS INPUT
#            TRY TO PREDICT CONTEXT WORDS AND UPDATE WEIGHTS
#            HIDDEN LAYER SHARED BY THE CONTEXT WORDS
#            THEN WE TAKE THE INPUT WEIGHT MATRIX AND MULTIPLY IT WITH THE ONE HOT VECTOR TO OBTAIN ITS WORD VECTOR
# USUALLY WINDOW SIZE 5-10 VECTOR SIZE AROUND 300

In [43]:
import numpy as np
import pandas as pd
import keras

In [44]:
from sklearn.datasets import load_digits
# Load the digits dataset via sklearn.datasets.load_digits; later cells read
# digits.images (reshaped into the network input) and digits.target (labels).
digits=load_digits()

In [45]:
from keras.models import Sequential

In [46]:
# Import the convolution/pooling layers from the public `keras.layers` namespace.
# The `keras.layers.convolutional` submodule path is not available in newer Keras
# releases, whereas `keras.layers` exports both classes in old and new versions.
from keras.layers import Conv2D,MaxPooling2D

In [47]:
from keras.layers import Dense,Dropout,ReLU,Flatten

In [72]:
# Start an empty Sequential model; the following cells append its layers one by one.
model=Sequential()

In [73]:
# First convolution block: 64 filters with a 3x3 kernel over the 8x8
# single-channel input, 'valid' padding, ReLU activation.
model.add(
    Conv2D(
        filters=64,
        kernel_size=(3, 3),
        padding='valid',
        activation='relu',
        input_shape=(8, 8, 1),
    )
)

In [74]:
# Downsample the first convolution's feature maps with a 2x2 max-pooling window.
model.add(MaxPooling2D(pool_size=(2,2)))

In [75]:
# Second convolution block: 64 filters with a 2x2 kernel and ReLU activation
# (padding is left at the layer's default).
model.add(
    Conv2D(
        filters=64,
        kernel_size=(2, 2),
        activation='relu',
    )
)

In [76]:
# Second 2x2 max-pooling step, applied to the output of the second convolution.
model.add(MaxPooling2D(pool_size=(2,2)))

In [77]:
# Flatten the pooled feature maps into a 1-D vector so Dense layers can follow.
model.add(Flatten())

In [78]:
# Classifier head: one 64-unit ReLU hidden layer followed by a 10-unit
# softmax output layer.
hidden_units = dict(units=64, activation='relu')
output_units = dict(units=10, activation='softmax')
for layer_kwargs in (hidden_units, output_units):
    model.add(Dense(**layer_kwargs))
del hidden_units, output_units, layer_kwargs

In [79]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy reported as the tracked metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [80]:
# Take the image stack from the dataset and append a trailing channel axis of
# length 1, giving each sample the (8, 8, 1) shape declared as the model's input_shape.
df = digits.images
df_ = df.reshape((df.shape[0], 8, 8, 1))

In [82]:
from sklearn.model_selection import train_test_split
# Stratified split: 70% of the samples for training, 30% held out in (x, y)
# to be divided into validation and test sets by the next cell.
x_train, x, y_train, y = train_test_split(
    df_,
    digits.target,
    test_size=.3,
    stratify=digits.target,
    random_state=42,
)

In [83]:
# Halve the held-out (x, y) samples into validation and test sets,
# stratified on the held-out labels.
x_validate, x_test, y_validate, y_test = train_test_split(
    x,
    y,
    test_size=.5,
    stratify=y,
    random_state=42,
)

In [84]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
# The value returned by fit() is bound to its own name, `history`: reusing `y`
# would overwrite the held-out labels produced by the first train_test_split,
# and re-running the validation/test split cell after training would then break.
history=model.fit(x_train,y_train,epochs=10,validation_data=(x_validate,y_validate))

Train on 1257 samples, validate on 270 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [85]:
# Score the trained model on the test split; the cell displays the
# [loss, accuracy] pair returned by evaluate().
model.evaluate(x=x_test, y=y_test)



[0.08194828938554835, 0.9777777777777777]

In [86]:
# Score the trained model on the training split, for comparison with the
# test-split result; the cell displays the [loss, accuracy] pair.
model.evaluate(x=x_train, y=y_train)



[0.02606237841460353, 0.9976133651551312]

In [87]:
# Regularization is a process to reduce overfitting. It is a
# mathematical way of inducing a warning into the model’s learning process
# when it accommodates noise. To give a more realistic definition, it is a
# method to penalize the model weights in the event of overfitting.
# The process of regularization adds the weights of the edges to
# the defined loss function and holistically represents a higher loss. The
# network then tunes itself to reduce the loss and thereby makes the weight
# updates in the right direction; this works by ignoring the noise rather than
# accommodating it in the learning process.
# The process of regularization can be demonstrated as
# Cost Function = Loss (as defined for the model) + Hyperparameter × [Weights]
# L1 Regularization
# In L1 regularization, the absolute values of the weights are added to the loss function. To
# make the model more generalized, the values of the weights are reduced
# to 0, and therefore this method is strongly preferred when we are trying to
# compress the model for faster computation.
# L2 Regularization
# In L2 regularization, the squared weights are added to the loss function. To
# make the model more generalized, the values of the weights are reduced to
# near 0 (but not actually 0), and hence this is also called the “weight decay”
# method. In most cases, L2 is highly recommended over L1 for reducing
# overfitting
# model.add(Dense(256, input_dim=128,
# kernel_regularizer=regularizers.l2(0.01))) where 0.01 is the hyperparameter lambda
# Dropout Regularization
# In dropout, the model arbitrarily drops or deactivates a few neurons of a layer during each iteration.
# The process repeats for each iteration, with a different random selection of neurons each time.
# It is efficient due to the reduced computation and works intuitively in reducing overfitting.
# Dropout(rate, noise_shape=None, seed=None)
# model.add(Dropout(0.25))

In [88]:
# HYPERPARAMETER TUNING
# i) NUMBER OF INPUT NODES
# A simple rule of thumb for selecting the number of neurons in the first layer is to refer to the number of input dimensions.
# If the final number of input dimensions in a given training dataset (this includes the one-hot encoded features also) is x,
# we should use at least the power of 2 that is closest to 2x.
# ii)NUMBER OF LAYERS
# The problem is that with an increased number of layers, the training time and computation increase significantly.
# You would need a higher number of epochs to see promising results
# For the last hidden layer (not the output layer), try keeping the number of neurons to at least around 30–40% 
# of the input size.
# In case of large network use tapering size architecture (eg:1st 8-512; 2nd 8-256 ; 3rd 8-128 and so on...)
# For wider networks always use L2 Regularization
# iii)WEIGHT INITIALIZATION
# By default, the Keras framework uses glorot uniform initialization, also called Xavier uniform initialization.
# Other popular options to select are ‘He Normal’ and ‘He Uniform’ initialization and ‘lecun normal’ and ‘lecun uniform’
# initialization.
# model.add(Dense(64,activation="relu", input_dim = 32, kernel_initializer = "random_uniform",bias_initializer = "zeros"))
# iv)BATCH SIZE
# usually go for 32 or 64 as they give a smooth learning curve

In [92]:
# Skip Gram works well with small amount of data and is found to represent rare words well
# CBOW is faster and has better representations for more frequent words.

In [None]:
# A regression model that uses the L1 regularization technique is called Lasso regression.
# A regression model that uses the L2 regularization technique is called Ridge regression.