<a href="https://colab.research.google.com/github/samsomsabu/NATURAL-LANGUAGE-PROCESSING/blob/main/2348549_LAB9poem_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras.utils as ku
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [2]:
# Reading the text data file
data = open('/content/poem.txt', encoding="utf8").read()

# Plotting the WordCloud
data[:100]

'Stay, I said\nto the cut flowers.\nThey bowed\ntheir heads lower.\nStay, I said to the spider,\nwho fled.'

In [3]:
corpus = data.lower().split("\n")
corpus[:15]

['stay, i said',
 'to the cut flowers.',
 'they bowed',
 'their heads lower.',
 'stay, i said to the spider,',
 'who fled.',
 'stay, leaf.',
 'it reddened,',
 'embarrassed for me and itself.',
 'stay, i said to my body.',
 'it sat as a dog does,',
 'obedient for a moment,',
 'soon starting to tremble.',
 'stay, to the earth',
 'of riverine valley meadows,']

In [4]:
len(corpus)

2478

In [5]:
# Fitting the Tokenizer on the Corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary count of the corpus
total_words = len(tokenizer.word_index)

print("Total Words:", total_words)

Total Words: 3807


In [6]:
tokenizer.word_index['i']

3

In [7]:

# Converting the text into embeddings
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]

    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences,
                                         maxlen=max_sequence_len,
                                         padding='pre'))
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = ku.to_categorical(label, num_classes=total_words+1)

In [8]:
input_sequences

array([[   0,    0,    0, ...,    0,  266,    3],
       [   0,    0,    0, ...,  266,    3,   59],
       [   0,    0,    0, ...,    0,    4,    1],
       ...,
       [   0,    0,    0, ..., 3807,   15,    4],
       [   0,    0,    0, ...,   15,    4,  203],
       [   0,    0,    0, ...,    4,  203,   85]], dtype=int32)

In [9]:
# Building a Bi-Directional LSTM Model
model = Sequential()
model.add(Embedding(total_words+1, 100,
                    input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words+1/2, activation='relu',
                kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words+1, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 100)           380800    
                                                                 
 bidirectional (Bidirection  (None, 15, 300)           301200    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 15, 300)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 3807)              384507    
                                                                 
 dense_1 (Dense)             (None, 3808)              14500864  
                                                        

In [11]:
history = model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
seed_text = "The world"
next_words = 25
ouptut_text = ""

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1,
      padding='pre')
    predicted = np.argmax(model.predict(token_list,
                                        verbose=0), axis=-1)
    output_word = ""

    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break

    seed_text += " " + output_word

print(seed_text)

The world of the heart of the heart of the heart of the heart of the heart of the heart of the heart of the heart of
