# Exercise 4 - LSTM
Shahar Michaeli


## Imports and Configurations

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
!pip install tensorboardX
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# !git clone https://github.com/pbloem/language-models.git
!git clone https://github.com/GuyKabiri/language_models

Collecting tensorboardX
  Downloading tensorboardX-2.4.1-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 4.0 MB/s 
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.4.1
Found GPU at: /device:GPU:0
Cloning into 'language_models'...
remote: Enumerating objects: 106, done.[K
remote: Total 106 (delta 0), reused 0 (delta 0), pack-reused 106[K
Receiving objects: 100% (106/106), 19.38 MiB | 8.82 MiB/s, done.
Resolving deltas: 100% (55/55), done.


In [None]:
import keras
import keras.backend as K
from keras.datasets import imdb
from keras.layers import  LSTM, Embedding, TimeDistributed, Input, Dense
from keras.models import Model
from tensorflow.python.client import device_lib
from tqdm import tqdm
import os, random
from argparse import ArgumentParser
import numpy as np
from tensorboardX import SummaryWriter
from language_models import util
import string
from copy import deepcopy
CHECK = 5

## Helper Functions

In [None]:
def generate_seq(model : Model, seed, size, temperature=1.0):
    """
    :param model: The complete RNN language model
    :param seed: The first few words of the sequence to start generating from
    :param size: The total size of the sequence to generate
    :param temperature: This controls how much we follow the probabilities provided by the network. For t=1.0 we just
        sample directly according to the probabilities. Lower temperatures make the high-probability words more likely
        (providing more likely, but slightly boring sentences) and higher temperatures make the lower probabilities more
        likely (resulting in weirder sentences). For temperature=0.0, the generation is _greedy_, i.e. the word with the
        highest probability is always chosen.
    :return: A list of integers representing a samples sentence
    """

    ls = seed.shape[0] # Length of seed

    # Due to the way Keras RNNs work, we feed the model a complete sequence each time. At first it's just the seed,
    # zero-padded to the right length. With each iteration we sample and set the next character.

    tokens = np.concatenate([seed, np.zeros(size - ls)]) # Padding for the rest of the sentence
    for i in range(ls, size):
        probs = model.predict(tokens[None,:])

        # Extract the i-th probability vector and sample an index from it
        next_token = util.sample_logits(probs[0, i-1, :], temperature=temperature)
        tokens[i] = next_token

    return [int(t) for t in tokens]

In [None]:
def sparse_loss(y_true, y_pred):
  return K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

def encode(seq,w2i):
  words = [word.lower() for word in seq.split(' ')] 
  return np.array([w2i[word] if w2i.get(word) is not None else w2i['<UNK>'] for word in words])

def decode(seq):
  return ' '.join(i2w[id] for id in seq)

In [None]:
from scipy.special import softmax
def calculate_prob_of_sentence(model : Model, sentence,w2i):
    seq_encoded = encode(sentence,w2i)

    prob = 1
    tokens = np.concatenate([[seq_encoded[0]], np.zeros(len(seq_encoded) - 1)]) 
    for i in range(0, len(seq_encoded)):
      probs = model.predict(tokens[None,:])
      word_i_prob = softmax(softmax(probs[0, i-1, :]))[int(tokens[i])] 
      prob *= word_i_prob
      tokens[i] = seq_encoded[i]

    return prob

### Defining Hyper-Parameters For Each Model

In [None]:
class Args:
  epochs = 20 # Number of epochs
  embedding_size = 300 # Size of the word embeddings on the input layer.
  out_every = 1 # Output every n epochs.
  lr = 0.001 # Learning rate
  batch = 128 # Batch size
  task = 'wikisimple'
  data = './data' # Data file. Should contain one sentence per line.
  lstm_capacity = 256
  max_length = None # Sentence max length.
  top_words = 10000 # Word list size.
  limit = None # Character cap for the corpus - not relevant in our exercise.
  tb_dir = './runs/words' # Tensorboard directory
  seed = 3 # RNG seed. Negative for random (seed is printed for reproducability).
  extra = None # Number of extra LSTM layers.
  reverse = False
  name = ""

options1 = Args() 
options1.name = "1 LSTM Layer without Reverse"
options2 = deepcopy(options1)
options2.reverse = True
options2.name = "1 LSTM Layer with Reverse"
options3 = deepcopy(options1) 
options3.extra = 1 
options3.name = "2 LSTM Layers without Reverse"
options4 = deepcopy(options2) 
options4.extra = 1 
options4.name = "2 LSTM Layers with Reverse"
options4.reverse = True


options_list = [options1, options2, options3, options4] 

### Loading Data

In [None]:
def split_data(data,train_size=0.8,val_size=0.1,test_size=0.1):
  random.shuffle(data)
  train_len = int(train_size*len(x))
  val_len = int((train_size+val_size)*len(x))
  return data[:train_len], data[train_len:val_len],data[val_len:]

In [None]:
if options_list[0].seed < 0: # Same for all options..
    seed = random.randint(0, 1000000)
    print('random seed: ', seed)
    np.random.seed(seed)
else:
    np.random.seed(options_list[0].seed)


x, w2i, i2w = util.load_words(util.DIR + '/datasets/wikisimple.txt', vocab_size=options_list[0].top_words, limit=options_list[0].limit)
x_max_len = max([len(sentence) for sentence in x])
numwords = len(i2w)
print('max sequence length ', x_max_len)
print(numwords, 'distinct words')
train_data,val_data,test_data = split_data(x)
train_x = util.batch_pad(train_data, options_list[0].batch, add_eos=True) # Batching the train data
val_x = util.batch_pad(val_data, options_list[0].batch, add_eos=True) # Batching the validation data
test_x = util.batch_pad(test_data, options_list[0].batch, add_eos=True) # Batching the test data

raw data read
max sequence length  132
10000 distinct words
max length per batch:  [15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 37, 38, 38, 39, 39, 40, 41, 41, 42, 44, 45, 46, 47, 49, 52, 55, 62, 133]
max length per batch:  [17, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 27, 28, 30, 32, 34, 37, 42, 56, 88]
max length per batch:  [17, 18, 19, 19, 20, 20, 21,

### Create Models for Each Options

In [None]:
def createModel(options):
  input = Input(shape=(None, ))
  embedding = Embedding(numwords, options.embedding_size, input_length=None)
  embedded = embedding(input)

  decoder_lstm = LSTM(options.lstm_capacity, return_sequences=True, go_backwards=options.reverse)
  h = decoder_lstm(embedded)

  if options.extra is not None:
      for _ in range(options.extra):
          h = LSTM(options.lstm_capacity, return_sequences=True)(h)

  fromhidden = Dense(numwords, activation='linear')
  out = TimeDistributed(fromhidden)(h)

  model = Model(input, out)

  opt = tf.keras.optimizers.Adam(learning_rate=options.lr)
  lss = sparse_loss

  model.compile(opt, lss)
  return model


In [None]:
models = [createModel(options) for options in options_list]

### Training

In [None]:
def train(model,options,train_data):
  epoch = 0
  instances_seen = 0
  
  while epoch < options.epochs:
      for batch in tqdm(train_data):
          n, l = batch.shape

          batch_shifted = np.concatenate([np.ones((n, 1)), batch], axis=1)  # prepend start symbol
          batch_out = np.concatenate([batch, np.zeros((n, 1))], axis=1)     # append pad symbol

          loss = model.train_on_batch(batch_shifted, batch_out[:, :, None]) # sum_over_batch_size - mean of mean for each class - [Sum over all samples {(Sum of differences between y_pred and y_target vector of each sample / No of element in y_target of the sample )}] / Batch_size

          instances_seen += n
          # tbw.add_scalar('lm/batch-loss', float(loss), instances_seen)
      print(loss)
      epoch += 1



In [None]:
for i, model in enumerate(models):
  print(f"\nTraining model {i + 1}, Type: {options_list[i].name}\n")
  train(model, options_list[i],train_x)
  print("-----------------------------------------------\n\n")


Training model 1, Type: 1 LSTM Layer without Reverse



100%|██████████| 186/186 [00:26<00:00,  7.13it/s]


4.085677146911621


100%|██████████| 186/186 [00:11<00:00, 16.00it/s]


3.6336264610290527


100%|██████████| 186/186 [00:11<00:00, 15.77it/s]


3.5214264392852783


100%|██████████| 186/186 [00:11<00:00, 15.98it/s]


3.4688212871551514


100%|██████████| 186/186 [00:11<00:00, 15.91it/s]


3.407072067260742


100%|██████████| 186/186 [00:11<00:00, 15.84it/s]


3.36873197555542


100%|██████████| 186/186 [00:12<00:00, 15.45it/s]


3.3429222106933594


100%|██████████| 186/186 [00:11<00:00, 15.74it/s]


3.3400943279266357


100%|██████████| 186/186 [00:11<00:00, 15.71it/s]


3.286438465118408


100%|██████████| 186/186 [00:11<00:00, 15.82it/s]


3.236335515975952


100%|██████████| 186/186 [00:11<00:00, 15.65it/s]


3.2047863006591797


100%|██████████| 186/186 [00:11<00:00, 15.75it/s]


3.1972529888153076


100%|██████████| 186/186 [00:11<00:00, 15.68it/s]


3.1585071086883545


100%|██████████| 186/186 [00:11<00:00, 15.83it/s]


3.134404420852661


100%|██████████| 186/186 [00:11<00:00, 15.68it/s]


3.114643096923828


100%|██████████| 186/186 [00:11<00:00, 15.77it/s]


3.1019845008850098


100%|██████████| 186/186 [00:11<00:00, 15.54it/s]


3.061394453048706


100%|██████████| 186/186 [00:11<00:00, 15.64it/s]


3.084258794784546


100%|██████████| 186/186 [00:11<00:00, 15.72it/s]


3.0581746101379395


100%|██████████| 186/186 [00:11<00:00, 15.67it/s]


3.0297162532806396
-----------------------------------------------



Training model 2, Type: 1 LSTM Layer with Reverse



100%|██████████| 186/186 [00:16<00:00, 11.35it/s]


5.57320499420166


100%|██████████| 186/186 [00:11<00:00, 15.80it/s]


5.993616580963135


100%|██████████| 186/186 [00:11<00:00, 15.77it/s]


5.503299713134766


100%|██████████| 186/186 [00:11<00:00, 15.63it/s]


5.188401699066162


100%|██████████| 186/186 [00:11<00:00, 15.61it/s]


4.836703300476074


100%|██████████| 186/186 [00:11<00:00, 15.54it/s]


4.575553894042969


100%|██████████| 186/186 [00:12<00:00, 15.23it/s]


4.674558162689209


100%|██████████| 186/186 [00:12<00:00, 14.76it/s]


4.356936931610107


100%|██████████| 186/186 [00:12<00:00, 15.27it/s]


4.50731086730957


100%|██████████| 186/186 [00:12<00:00, 15.07it/s]


4.934467315673828


100%|██████████| 186/186 [00:12<00:00, 14.90it/s]


4.313580513000488


100%|██████████| 186/186 [00:12<00:00, 14.37it/s]


4.307985782623291


100%|██████████| 186/186 [00:12<00:00, 15.50it/s]


4.340573787689209


100%|██████████| 186/186 [00:12<00:00, 15.39it/s]


4.3197150230407715


100%|██████████| 186/186 [00:12<00:00, 15.50it/s]


4.437882423400879


100%|██████████| 186/186 [00:12<00:00, 15.32it/s]


4.197131156921387


100%|██████████| 186/186 [00:12<00:00, 15.26it/s]


4.394332408905029


100%|██████████| 186/186 [00:12<00:00, 15.38it/s]


4.058804988861084


100%|██████████| 186/186 [00:12<00:00, 15.03it/s]


3.999204635620117


100%|██████████| 186/186 [00:12<00:00, 15.45it/s]


3.926196575164795
-----------------------------------------------



Training model 3, Type: 2 LSTM Layers without Reverse



100%|██████████| 186/186 [00:22<00:00,  8.26it/s]


5.1035027503967285


100%|██████████| 186/186 [00:14<00:00, 12.81it/s]


5.008177757263184


100%|██████████| 186/186 [00:14<00:00, 13.04it/s]


4.868311882019043


100%|██████████| 186/186 [00:14<00:00, 13.13it/s]


4.999430179595947


100%|██████████| 186/186 [00:14<00:00, 13.13it/s]


4.354943752288818


100%|██████████| 186/186 [00:14<00:00, 13.15it/s]


4.014673233032227


100%|██████████| 186/186 [00:14<00:00, 13.01it/s]


3.8791215419769287


100%|██████████| 186/186 [00:14<00:00, 13.18it/s]


3.8351285457611084


100%|██████████| 186/186 [00:14<00:00, 13.14it/s]


3.832550525665283


100%|██████████| 186/186 [00:14<00:00, 13.17it/s]


3.6989619731903076


100%|██████████| 186/186 [00:14<00:00, 13.13it/s]


3.667938709259033


100%|██████████| 186/186 [00:14<00:00, 13.16it/s]


3.653604507446289


100%|██████████| 186/186 [00:14<00:00, 13.04it/s]


3.6253721714019775


100%|██████████| 186/186 [00:14<00:00, 13.18it/s]


3.5792863368988037


100%|██████████| 186/186 [00:14<00:00, 13.05it/s]


3.5390210151672363


100%|██████████| 186/186 [00:14<00:00, 12.97it/s]


3.5512499809265137


100%|██████████| 186/186 [00:14<00:00, 13.02it/s]


3.4846174716949463


100%|██████████| 186/186 [00:14<00:00, 13.00it/s]


3.462157964706421


100%|██████████| 186/186 [00:14<00:00, 13.07it/s]


3.468632459640503


100%|██████████| 186/186 [00:14<00:00, 13.12it/s]


3.4160122871398926
-----------------------------------------------



Training model 4, Type: 2 LSTM Layers with Reverse



100%|██████████| 186/186 [00:23<00:00,  8.07it/s]


5.14104700088501


100%|██████████| 186/186 [00:14<00:00, 13.07it/s]


5.076297760009766


100%|██████████| 186/186 [00:14<00:00, 12.89it/s]


4.931604862213135


100%|██████████| 186/186 [00:14<00:00, 12.79it/s]


4.941627025604248


100%|██████████| 186/186 [00:14<00:00, 12.88it/s]


5.009622097015381


100%|██████████| 186/186 [00:14<00:00, 12.96it/s]


5.015004634857178


100%|██████████| 186/186 [00:14<00:00, 13.04it/s]


5.058128833770752


100%|██████████| 186/186 [00:14<00:00, 12.81it/s]


5.085804462432861


100%|██████████| 186/186 [00:14<00:00, 13.10it/s]


5.1164116859436035


100%|██████████| 186/186 [00:14<00:00, 13.02it/s]


5.181401252746582


100%|██████████| 186/186 [00:14<00:00, 12.98it/s]


5.0715508460998535


100%|██████████| 186/186 [00:14<00:00, 12.93it/s]


5.025958061218262


100%|██████████| 186/186 [00:14<00:00, 13.07it/s]


4.787221431732178


100%|██████████| 186/186 [00:14<00:00, 12.97it/s]


4.6461262702941895


100%|██████████| 186/186 [00:14<00:00, 12.96it/s]


4.844838619232178


100%|██████████| 186/186 [00:14<00:00, 12.96it/s]


4.569586753845215


100%|██████████| 186/186 [00:14<00:00, 12.92it/s]


4.220826625823975


100%|██████████| 186/186 [00:14<00:00, 12.95it/s]


4.30557107925415


100%|██████████| 186/186 [00:14<00:00, 12.95it/s]


4.115988254547119


100%|██████████| 186/186 [00:14<00:00, 12.84it/s]

4.0285749435424805
-----------------------------------------------







## Preplexity based on Cross Entropy 
I used these articles [The relationship between Perplexity and Entropy in NLP](https://towardsdatascience.com/the-relationship-between-perplexity-and-entropy-in-nlp-f81888775ccc), [Perplexity in Language Models](https://towardsdatascience.com/perplexity-in-language-models-87a196019a94) and [Perplexity](https://en.wikipedia.org/wiki/Perplexity) wikipedia page to define the equation. 

In [None]:
def perplexity(model,data):
  loss = 0
  count = 0
  for batch in tqdm(data):
    n, l = batch.shape

    batch_shifted = np.concatenate([np.ones((n, 1)), batch], axis=1)  # prepend start symbol
    batch_out = np.concatenate([batch, np.zeros((n, 1))], axis=1)     # append pad symbol

    loss += model.evaluate(batch_shifted, batch_out[:, :, None],verbose=0) # sum_over_batch_size - mean of mean for each class - [Sum over all samples {(Sum of differences between y_pred and y_target vector of each sample / No of element in y_target of the sample )}] / Batch_size
    count += 1 # count batch size
  return 2**(loss/count) # return the mean perplexity for all batches in dataset.



In [None]:
res = 0
for i, model in enumerate(models):
  print(f"*** Mean Perplexity On Model #{i + 1}, Type: {options_list[i].name} *** ")
  print("Mean Perpelxity on Train:", perplexity(model, train_x), "")
  print("Mean Perpelxity on Validation:", perplexity(model, val_x), "")
  print("Mean Perpelxity on Test:", perplexity(model, test_x), "")
  print("-----------------------------------------------")

*** Mean Perplexity On Model #1, Type: 1 LSTM Layer without Reverse *** 


100%|██████████| 186/186 [00:23<00:00,  8.02it/s]


Mean Perpelxity on Train: 17.353149501403344 


100%|██████████| 24/24 [00:02<00:00,  8.75it/s]


Mean Perpelxity on Validation: 22.569151613170135 


100%|██████████| 24/24 [00:02<00:00,  9.04it/s]


Mean Perpelxity on Test: 21.87317105854881 
-----------------------------------------------
*** Mean Perplexity On Model #2, Type: 1 LSTM Layer with Reverse *** 


100%|██████████| 186/186 [00:23<00:00,  7.98it/s]


Mean Perpelxity on Train: 36.529504191141605 


100%|██████████| 24/24 [00:02<00:00,  9.00it/s]


Mean Perpelxity on Validation: 50.82263765030222 


100%|██████████| 24/24 [00:02<00:00,  9.19it/s]


Mean Perpelxity on Test: 49.214314395915615 
-----------------------------------------------
*** Mean Perplexity On Model #3, Type: 2 LSTM Layers without Reverse *** 


100%|██████████| 186/186 [00:29<00:00,  6.29it/s]


Mean Perpelxity on Train: 42.98207399526106 


100%|██████████| 24/24 [00:03<00:00,  7.45it/s]


Mean Perpelxity on Validation: 40.18241899530519 


100%|██████████| 24/24 [00:03<00:00,  7.27it/s]


Mean Perpelxity on Test: 39.01121797010973 
-----------------------------------------------
*** Mean Perplexity On Model #4, Type: 2 LSTM Layers with Reverse *** 


100%|██████████| 186/186 [00:29<00:00,  6.32it/s]


Mean Perpelxity on Train: 75.17726010657634 


100%|██████████| 24/24 [00:02<00:00,  8.25it/s]


Mean Perpelxity on Validation: 74.27502421370822 


100%|██████████| 24/24 [00:02<00:00,  8.32it/s]

Mean Perpelxity on Test: 72.49408848243262 
-----------------------------------------------





In [None]:
# Part 6 Generating sentences and calculating probabilities 
sentence = "I love"
seed = encode(sentence,w2i)
seed = np.insert(seed, 0, 1)
for temp in [0, 1, 10]:
  print('EMP ', temp)
  for i, model in enumerate(models):
    print(f"\tModel #{i}, name: {options_list[i].name}")
    gen = generate_seq(models[0], seed,  8, temperature=temp)
    start_sen = decode(seed)
    end_sen = decode(gen[len(seed):])
    print("\t\tGenerated Sentence:",start_sen + ' ' + end_sen)
    print("\t\tProbability:", calculate_prob_of_sentence(models[0],start_sen + ' ' + end_sen,w2i), "\n")

# Part 9
for i, model in enumerate(models):
  print(f"Model #{i}, name: {options_list[i].name}")
  print("\tSentence:", "<START> I love cupcakes")
  print("\tProbability:", calculate_prob_of_sentence(model, "I love cupcakes", w2i), "\n")

EMP  0
	Model #0, name: 1 LSTM Layer without Reverse
		Generated Sentence: <START> i love lrb <UNK> rrb is a
		Probability: 1.0064280845696018e-32 

	Model #1, name: 1 LSTM Layer with Reverse
		Generated Sentence: <START> i love lrb <UNK> rrb is a
		Probability: 1.0064280845696018e-32 

	Model #2, name: 2 LSTM Layers without Reverse
		Generated Sentence: <START> i love lrb <UNK> rrb is a
		Probability: 1.0064280845696018e-32 

	Model #3, name: 2 LSTM Layers with Reverse
		Generated Sentence: <START> i love lrb <UNK> rrb is a
		Probability: 1.0064280845696018e-32 

EMP  1
	Model #0, name: 1 LSTM Layer without Reverse
		Generated Sentence: <START> i love the <UNK> <UNK> husky lrb
		Probability: 1.007162582375041e-32 

	Model #1, name: 1 LSTM Layer with Reverse
		Generated Sentence: <START> i love played the which must identical
		Probability: 1.0114295224984641e-32 

	Model #2, name: 2 LSTM Layers without Reverse
		Generated Sentence: <START> i love lrb <UNK> yellow rrb also
		Probabilit

## Get The Next Word - UI

In [None]:
sentence = input('Enter a sentence : ')
seed = encode(sentence,w2i)
gen = generate_seq(models[0], seed, len(seed)+1, temperature=1.0)
print(f"The New Sentence is : {decode(gen)}")

Enter a sentence : he is a former English football
The New Sentence is : he is a former english football player
