# **Natural Language Processing:**
NLP is the field that focuses on how computers can understand and/or process natural/human languages.

# Recurrent Neural Networks
 It is a new kind of neural network that is much more capable of processing **sequential data** such as text or characters called a recurrent neural network.

Applications of RNN:

*   Sentiment Analysis
*   Character Generation



# **Bag of Words** : 
used to convert to numeric form
Very simple but not very accurate implementation.
It just contains the vocabulary of distinct words in a text and assign them a numeric value and keep track of their frequency.
Loses the ordering of the words in a text.

Bag of words fails to capture the meaning or the context of sentences. For instance:

**I thought the movie was going to be bad, but it was actually amazing!**

**I thought the movie was going to be amazing, but it was actually bad!**

Although these two setences are very similar we know that they have very different meanings. This is because of the ordering of words, a very important property of textual data.

In [1]:
vocab = {}  # maps word to integer representing it
word_encoding = 1
def bag_of_words(text):
  global word_encoding

  words = text.lower().split(" ")  # create a list of all of the words in the text, well assume there is no grammar in our text for this example
  bag = {}  # stores all of the encodings and their frequency

  for word in words:
    if word in vocab:
      encoding = vocab[word]  # get encoding from vocab
    else:
      vocab[word] = word_encoding
      encoding = word_encoding
      word_encoding += 1
    
    if encoding in bag:
      bag[encoding] += 1
    else:
      bag[encoding] = 1
  
  return bag

text = "this is a test to see if this test will work is is test a a"
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


Instance where Bag of Words fails to correctly classify the review

In [2]:
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

pos_bag = bag_of_words(positive_review)
neg_bag = bag_of_words(negative_review)

print("Positive:", pos_bag)
print("Negative:", neg_bag)

Positive: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1}
Negative: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 21: 1, 18: 1, 19: 1, 20: 1, 17: 1}


Sentiment Analysis on Movie Dataset

In [3]:
%tensorflow_version 2.x 
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels),(test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [4]:
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

# Preprocessing
Here, the reviews are of different lengths. We cannot pass different length data into the Neural Network.
Therefore, we must make each review the same length. To do this we will follow the procedure below:



*   If the review is greater than 250 words then trim off the extra words
*   If the review is less than 250 words add the necessary amount of 0's to make it equal to 250.

In [5]:
# Keras function for padding
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)
print(len(test_data))

25000


In [6]:
model = tf.keras.Sequential([ 
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1,activation="sigmoid")
])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2834688   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


# Training
Now it's time to compile and train the model.

In [8]:
model.compile(loss="binary_crossentropy", optimizer="rmsprop",metrics = ['acc'])
history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2) # splitin the training data 80-20 for training and testing

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# evaluating the model
models = model.evaluate(test_data, test_labels)



# Making Predictions on unseen/new data

Since our reviews are encoded well need to convert any review that we write into that form so the network can understand it. To do that well load the encodings from the dataset and use them to encode our own data.



In [10]:
word_index = imdb.get_word_index() # get the lookup table used while training the dataset


def encode_text(text):
  tokens = keras.preprocessing.text.text_to_word_sequence(text) # convert the words to tokens the smallest unit of data transfer in NLP
  tokens = [ word_index[word] if word in word_index else 0 for word in tokens] # if token is present in lookup keep it else replace it with 0
  return sequence.pad_sequences([tokens], MAXLEN)[0] # return the padded list of tokens, in here for now displaying only the first padded token


text = "that movie was just amazing, really amazing."
encoded = encode_text(text)
print(encoded)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0 

In [11]:
# Since we have successfulyy encoded it, let's decode the same set of integers to get back the original string

reverse_word_index = {value: key for(key,value) in word_index.items()} # the new word index should contain the integer encoding as key, and text as value

def decode_integers(integers):
  PAD=0
  text=""
  for num in integers:
    if num!=PAD:
      text+= reverse_word_index[num]+" ";
    
  return text[:-1] #return the entire sentence except the last space

print(decode_integers(encoded))

that movie was just amazing really amazing


In [12]:
  # now time to make a prediction

  def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1, 250)) # vector or array filled with zeroes with the shape (1, 250) , 250 because our review length is at max 250, we need to pass our input keeping the movie review length in mind
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])

positive_review = "That movie was amazing! really loved it and would great watch it again because it was great."
predict(positive_review)

negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)



[0.84257317]
[0.21013223]


##RNN Play Generator

Now time for one of the coolest examples we've seen so far. We are going to use a RNN to generate a play. We will simply show the RNN an example of something we want it to recreate and it will learn how to write a version of it on its own. We'll do this using a character predictive model that will take as input a variable length sequence and predict the next character. We can use the model many times in a row with the output from the last predicition as the input for the next call to generate a sequence.

In [13]:
%tensorflow_version 2.x
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np


In [14]:
# Dataset: Using an extract from a shakespeare play

path_to_file = tf.keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


###Loading Your Own Data
To load your own data, you'll need to upload a file from the dialog below. Then you'll need to follow the steps from above but load in this new file instead.

In [15]:
# from google.colab import files
# path_to_file2 = list(files.upload().keys())[0]

###Read Contents of File
Let's look at the contents of the file.

In [16]:
# Read, then decode for py2 compatable format

text = open(path_to_file, 'rb').read().decode(encoding = 'utf-8')
# Length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))


Length of text: 1115394 characters


In [17]:
# Taking a look at the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



###Encoding
Since this text isn't encoded yet well need to do that ourselves. We are going to encode each unique character as a different integer.


In [18]:
vocab = sorted(set(text))
# Create a mapping from unique characters to indices

# for i,u in enumerate(vocab):
#   print(str(i)+" "+u)

char2idx = {u:i for i,u in enumerate(vocab)} # save it as value: index pair
print(char2idx)
idx2char = np.array(vocab) # given to us
print(idx2char)

def text_to_int(text):  # convert the text into integer char by char
  return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [19]:
# Quick look at how a part of the text is encoded
print("Text: ", text[:13])
print("Encoded: ", text_to_int(text[:13]))

Text:  First Citizen
Encoded:  [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [20]:
# Extra a function to convert the numeric values to text
def int_to_text(ints):
  try:
    # ensure the integer is a list of arrays
    ints = ints.numpy()

  except:
    pass
    return " ".join(idx2char[ints])

  print(int_to_text(text_to_int(text[:15])))

###Creating Training Examples
Remember our task is to feed the model a sequence and have it return to us the next character. This means we need to split our text data from above into many shorter sequences that we can pass to the model as training examples. 

The training examples we will prepapre will use a *seq_length* sequence as input and a *seq_length* sequence as the output where that sequence is the original sequence shifted one letter to the right. For example:

```input: Hell | output: ello```

Our first step will be to create a stream of characters from our text data.

In [21]:
# creating sequences
seq_length = 100 #Length of sequence for a training example
examples_per_epoch = len(text)//(seq_length+1) # coz for a seq of len 100, we are predicting the output as 101th char, next sequence starts from 102 position

# Creating training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [22]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True) # creating batches of sequences for training and testing, and say if we 105 chars, and one batch is 101, we can create one batch and drop the remaining 4 chars


Now we need to use these sequences of length 101 and split them into input and output.



In [23]:
def split_input_target(chunk): # for the example: hello
    input_text = chunk[:-1] # all characters except the last one as: hell 
    target_text = chunk[1: ] #starting from the second char: ello
    return input_text, target_text # hell, ello

dataset = sequences.map(split_input_target) # we use map to apply the above function to every entry

In [24]:
for x,y in dataset.take(2):
  print("\n\nEXAMPLE\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOUTPUT")
  print(int_to_text(y))



EXAMPLE

INPUT
F i r s t   C i t i z e n : 

None

OUTPUT
F i r s t   C i t i z e n : 

None


EXAMPLE

INPUT
F i r s t   C i t i z e n : 

None

OUTPUT
F i r s t   C i t i z e n : 

None


Finally we need to make training batches.

In [25]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab) # vocab is the number of unique characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).

BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)


###Building the Model
Now it is time to build the model. We will use an embedding layer a LSTM and one dense layer that contains a node for each unique character in our training data. The dense layer will give us a probability distribution over all nodes.

In [26]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape =[batch_size, None]),
      tf.keras.layers.LSTM(rnn_units, 
                           return_sequences=True,
                           stateful=True,
                           recurrent_initializer ='glorot_uniform'),
      tf.keras.layers.Dense(vocab_size)
  ])

  return model

  model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
  model.summary()

###Creating a Loss Function
Now we are going to create our own loss function for this problem. This is because our model will output a (64, sequence_length, 65) shaped tensor that represents the probability distribution of each character at each timestep for every sequence in the batch. 

However, before we do that let's have a look at a sample input and the output from our untrained model. This is so we can understand what the model is giving us.

In [27]:
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch)  # asking our model for a prediction on our first batch of training data (64 entries)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")  # print out the output shape

(64, 1) # (batch_size, sequence_length, vocab_size)


In [28]:
# we can see that the predicition is an array of 64 arrays, one for each entry in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[0.5873356 ]
 [0.01302982]
 [0.45807937]
 [0.293234  ]
 [0.10850812]
 [0.4098307 ]
 [0.21393013]
 [0.42327464]
 [0.14374235]
 [0.79076946]
 [0.0727241 ]
 [0.05004625]
 [0.10504515]
 [0.38153136]
 [0.00250143]
 [0.28558058]
 [0.026544  ]
 [0.7227878 ]
 [0.56315774]
 [0.6308337 ]
 [0.00437637]
 [0.01628184]
 [0.54029995]
 [0.38616526]
 [0.36530066]
 [0.18527201]
 [0.26341587]
 [0.2885506 ]
 [0.04039811]
 [0.22783206]
 [0.17417468]
 [0.34492415]
 [0.1992487 ]
 [0.3064144 ]
 [0.45019743]
 [0.31051287]
 [0.01514038]
 [0.40109664]
 [0.4705461 ]
 [0.1917153 ]
 [0.01176617]
 [0.09259679]
 [0.41080594]
 [0.7255513 ]
 [0.8973392 ]
 [0.54077715]
 [0.19179335]
 [0.5077768 ]
 [0.58573776]
 [0.48261404]
 [0.47749743]
 [0.26879132]
 [0.3824802 ]
 [0.13805911]
 [0.01339161]
 [0.38233545]
 [0.06661414]
 [0.68933606]
 [0.06153879]
 [0.39788797]
 [0.01983021]
 [0.57793033]
 [0.234968  ]
 [0.56597793]], shape=(64, 1), dtype=float32)


In [29]:
# lets examine one prediction
pred = example_batch_predictions[0]
print(len(pred))

print(pred)

# notice this is a 2d array of length 100, where each interior array is the prediction for the next character at each time step

1
tf.Tensor([0.5873356], shape=(1,), dtype=float32)


In [30]:
# and finally well look at a prediction at the first timestep
time_pred = pred[0]
# print(len(time_pred))
print(time_pred)
# and of course its 65 values representing the probabillity of each character occuring next

tf.Tensor(0.5873356, shape=(), dtype=float32)


In [31]:
# If we want to determine the predicted character we need to sample the output distribution( pick a value based on probability)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# now we can reshape that array and convert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices,(1,-1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars  # and this is what the model predicted for training sequence 1

InvalidArgumentError: ignored

So now we need to create a loss function that can compare that output to the expected output and give us some numeric value representing how close the two were.

In [None]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
  

###Compiling the Model
At this point we can think of our problem as a classification problem where the model predicts the probabillity of each unique letter coming next. 


In [None]:
model.compile(optimizer='adam', loss=loss)

###Creating Checkpoints
Now we are going to setup and configure our model to save checkpoinst as it trains. This will allow us to load our model from a checkpoint and continue training it.

In [None]:
#Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only=True
)

###Training
Finally, we will start training the model. 

**If this is taking a while go to Runtime > Change Runtime Type and choose "GPU" under hardware accelerator.**

In [None]:
history = model.fit(data, epochs=50, callbacks=[checkpoint_callback])

###Loading the Model
We'll rebuild the model from a checkpoint using a batch_size of 1 so that we can feed one peice of text to the model and have it make a prediction.

In [None]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1) # building the model with batch_size=1 by calling user defined function build_model()


Once the model is finished training, we can find the lastest checkpoint that stores the models weights using the following line.

In [None]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

We can load **any checkpoint** we want by specifying the exact file to load.

In [None]:
checkpoint_num = 10
model.load_weights(tf.train.load_checkpoint("./training_checkpoints/ckpt_"+str(checkpoint_num)))
model.build(tf.TensorShape([1, None]))

###Generating Text
Now we can use the lovely function provided by tensorflow to generate some text using any starting string we'd like.

In [None]:
def generate_text(model, start_string):
  #Evaluation step (generating text using the learned model)

  #Number of characters to generate
  num_generate = 800

  # Converting our start string to numbers aka vectorizing
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0) #expanding a 1D array to a nested 2D array

  # Empty string to store our results
  text_generated = []

  #Low temperatures results in more predictable text.
  #Higher temperatures results in more surprising text.
  #Experiment to find the best setting.
  temperature = 1.0

# Here batch size == 1
  model.reset_states()

  for i in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension

    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model
    predictions = predictions/temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ' '.join(text_generated))



    

In [None]:
inp = input("Type a starting string: ")
print(generated_tex(model, inp))

And that's pretty much it for this module!