<a href="https://colab.research.google.com/github/shubhu1026/AI-ML/blob/main/NLP_Ass9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Name:** Shubham Patel  
**Student No.:** n01624539

# Import the libraries

In [None]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.optimizers import Adam
import tensorflow as tf

# Define the parameters

In [None]:
# Model hyper-parameters (tune here).
hidden_size = 25      # LSTM units; also the width of the hidden/cell state vectors
embedding_size = 50   # dimensionality of each word embedding
vocab_size = 3000     # tokenizer word cap; also sizes the embedding table and the softmax output

# Read the data and create input and target sentences

In [None]:
X = [] # input text: every poem line prefixed with the '<start>' token
y = [] # target text: the same line followed by the '<end>' token

# Use a context manager so the file handle is closed deterministically, and
# read with an explicit encoding so the result does not depend on the locale.
with open('/content/YiLei_Poem.txt', encoding='utf-8') as poem_file:
  for line in poem_file:
    line = line.rstrip()
    if not line:
      continue  # skip blank lines (stanza separators)

    input_line = '<start> ' + line
    target_line = line + ' <end>'

    X.append(input_line)
    y.append(target_line)

# Inputs and targets together, so the tokenizer sees both special tokens.
total_lines = X + y

# Tokenization

In [None]:
# filters='' (an empty string) tells the Tokenizer to strip no characters, so
# the angle brackets of the '<start>' and '<end>' tokens are retained.
tokenizer = Tokenizer(filters='', num_words=vocab_size)
tokenizer.fit_on_texts(total_lines)

# Convert the input and target sentences to lists of word indices.
input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X, y)
)

# Get the sequence length

In [None]:
# Longest tokenized input line; used below as the common padded length T.
seq_len = max(map(len, input_sequences))
print('Maximum seq length:', seq_len)

Maximum seq length: 8


# Word2index mapping

In [None]:
# Word -> integer index mapping learned by the tokenizer.
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

# Sanity check: both special tokens must have survived tokenization.
assert '<start>' in word2idx
assert '<end>' in word2idx

Found 122 unique tokens.


# Padding the sequence to get N x T

In [None]:
# Pad at the end of each sequence (padding='post') up to seq_len, so inputs
# and targets become equally shaped N x T integer arrays.
input_sequences = pad_sequences(input_sequences, padding='post', maxlen=seq_len)
target_sequences = pad_sequences(target_sequences, padding='post', maxlen=seq_len)
print('Data Shape:', input_sequences.shape)

Data Shape: (34, 8)


# Create one-hot targets, since we cannot use sparse cross-entropy in Keras here: there are T targets for each input sequence (one per time step)

In [None]:
# One-hot encode the targets into an N x T x vocab_size array.
one_hot_targets = np.zeros((len(input_sequences), seq_len, vocab_size))

# Positions holding a real word (index > 0); padding positions keep an
# all-zero row.
sample_idx, step_idx = np.nonzero(target_sequences > 0)
one_hot_targets[sample_idx, step_idx, target_sequences[sample_idx, step_idx]] = 1

# Create an LSTM Model

In [None]:
# Training model: a whole padded sequence goes in, and a distribution over the
# vocabulary comes out at every time step.
input_ = Input(shape=(seq_len,))     # padded sequence of word indices
h_i = Input(shape=(hidden_size,))    # initial hidden state
c_i = Input(shape=(hidden_size,))    # initial cell state
# The initial hidden and cell states are explicit model inputs because we want
# to control them ourselves: zeros are fed during training, and the same layers
# are later reused to step through a sentence one word at a time.

# No `input_length` argument: it is deprecated in recent Keras versions, and
# this layer is later reused on length-1 inputs, so its sequence length must
# not be pinned to seq_len. The Input shape above already fixes it here.
embedding_layer = Embedding(vocab_size, embedding_size)
x = embedding_layer(input_)

# return_sequences=True: we need an output at every time step.
# return_state=True: the final states are needed later when sampling.
lstm = LSTM(hidden_size, return_sequences=True, return_state=True)
x, _, _ = lstm(x, [h_i, c_i]) # only the per-step outputs are needed for training

dense = Dense(vocab_size, activation='softmax')
output = dense(x)
model = Model([input_, h_i, c_i], output)

# Compile the model

In [None]:
# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Note: accuracy is hard to interpret here, because many different words can validly follow any particular word.

# Train the model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Zero initial states, one row per sequence, fed to the model's h_i / c_i inputs.
hidden_state = np.zeros((len(input_sequences), hidden_size)) # Creating initial hidden state
cell_state = np.zeros((len(input_sequences), hidden_size)) # Creating initial cell state

early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=300,         # Stop if val_loss doesn't improve for 300 consecutive epochs
    restore_best_weights=True,  # Restore best weights after stopping
    verbose=1
)

hist = model.fit([input_sequences, hidden_state, cell_state], one_hot_targets,
  batch_size = 64,
  epochs = 3000, # upper bound of 3000 epochs; early stopping may end training sooner
  validation_split = 0.2, # hold out 20% of the samples to compute val_loss
  callbacks=[early_stopping]
)


Epoch 1/3000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 5.5600 - val_accuracy: 0.1250 - val_loss: 6.5691
Epoch 2/3000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - accuracy: 0.1250 - loss: 5.5434 - val_accuracy: 0.1250 - val_loss: 6.5588
Epoch 3/3000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step - accuracy: 0.1250 - loss: 5.5207 - val_accuracy: 0.1250 - val_loss: 6.5425
Epoch 4/3000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step - accuracy: 0.1250 - loss: 5.4841 - val_accuracy: 0.1250 - val_loss: 6.5140
Epoch 5/3000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step - accuracy: 0.1250 - loss: 5.4209 - val_accuracy: 0.1250 - val_loss: 6.4635
Epoch 6/3000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step - accuracy: 0.1250 - loss: 5.3121 - val_accuracy: 0.1250 - val_loss: 6.3788
Epoch 7/3000
[1m1/1[0m [

## Make a text generator model for prediction. To generate text, we need to pass one word at a time. We reuse the same layers that we used earlier: if we created new layers, their weights would be initialized randomly, so we have to use the existing layers with their trained weights.

In [None]:
# Sampling model: built from the same (already trained) embedding, LSTM and
# dense layers, but fed a single word per call.
input2 = Input(shape=(1,)) # Only input one word at a time
x = embedding_layer(input2)
x, h, c = lstm(x, [h_i, c_i]) # This time the returned states are kept as well. The LSTM takes
                                                        # three inputs: the current input (here a single embedded word
                                                        # index), the previous hidden state and the previous cell state.
output2 = dense(x)
sampling_model = Model([input2, h_i, c_i], [output2, h, c])
# h_i, c_i are the hidden and cell states fed in; h, c are the states returned after this step.

In [None]:
# Inverse of word2idx: maps an index back to its word when decoding samples.
idx2word = {index: word for word, index in word2idx.items()}

# Write a function to generate one line at a time

In [None]:
def generate_line():
  """Sample one line of text from the trained model, one word at a time.

  Generation starts from the '<start>' token with all-zero hidden and cell
  states (consistent with the zero states fed during training). At each step
  `sampling_model` predicts a distribution over the next word; a word is
  sampled from it and fed back in together with the updated states. The loop
  stops when '<end>' is sampled or after `seq_len` steps.

  Only indices that map to a real word in `idx2word` can be sampled, so the
  padding index 0 and unused output slots are never emitted.

  Returns:
    The sampled words joined by single spaces (an empty string if '<end>' is
    sampled first).
  """
  np_input = np.array([[ word2idx['<start>'] ]]) # the first input word is the start token
  h = np.zeros((1, hidden_size)) # h and c start at zero, as in training
  c = np.zeros((1, hidden_size))

  # so we know when to quit
  end = word2idx['<end>']

  # Indices that correspond to actual vocabulary entries. The softmax layer can
  # be wider than the vocabulary, so the remaining slots must be masked out.
  known_ids = np.array([i for i in idx2word if i > 0], dtype=int)

  # store the output here
  output_sentence = []

  for _ in range(seq_len):
    # verbose=0 silences the per-call progress output (the verbosity is a
    # number, not the string 'False').
    o, h, c = sampling_model.predict([np_input, h, c], verbose=0)
    # o holds the next-word probabilities; h and c are the next states.

    raw = np.asarray(o)[0, 0] # distribution for the single sample / single time step
    valid = known_ids[known_ids < raw.shape[0]]

    # Copy into float64 (the prediction array is left untouched) and keep only
    # the probabilities of real words.
    probs = np.zeros(raw.shape[0], dtype=np.float64)
    probs[valid] = raw[valid]

    total = probs.sum()
    if total <= 0: # no probability mass on any known word: nothing to sample
      break
    probs /= total # renormalize to a valid probability distribution

    idx = int(np.random.choice(len(probs), p=probs)) # sample the next word
    if idx == end: # end-of-line token: stop generating
      break

    # Accumulate output, mapping the index back to its word via idx2word.
    output_sentence.append(idx2word[idx])

    # The sampled word becomes the next input to the model.
    np_input[0,0] = idx

  return ' '.join(output_sentence)

# Generate New lines of Text

In [None]:
# Print six independently sampled lines.
for line_no in range(6):
    generated = generate_line()
    print(generated)

by your saplings. soul in is
it, living scatter chaste
my hand chases
young what blossoms,
1504

