<a href="https://colab.research.google.com/github/shreyasbhojane10/Artificial-Intelligence/blob/main/LSTM_Shakespeareplays_Text_Gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install gensim



In [3]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense,Input,LSTM,Embedding
from keras.callbacks import EarlyStopping

In [4]:
with open("/content/sample_data/alllines.txt","r") as file:
  df = file.read()
print(df[:500])

"ACT I"
"SCENE I. London. The palace."
"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others"
"So shaken as we are, so wan with care,"
"Find we a time for frighted peace to pant,"
"And breathe short-winded accents of new broils"
"To be commenced in strands afar remote."
"No more the thirsty entrance of this soil"
"Shall daub her lips with her own children's blood,"
"Nor more shall trenching war channel her fields,"
"Nor bruise her flowerets with the ar


In [5]:
type(df)

str

## Tokenize the text

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([df])
total_words = len(tokenizer.word_index) + 1
total_words

25576

In [7]:
index_word = {i: word for word, i in tokenizer.word_index.items()}
index_word

{1: 'the',
 2: 'and',
 3: 'i',
 4: 'to',
 5: 'of',
 6: 'a',
 7: 'you',
 8: 'my',
 9: 'in',
 10: 'that',
 11: 'is',
 12: 'not',
 13: 'me',
 14: 'it',
 15: 'with',
 16: 'for',
 17: 'be',
 18: 'his',
 19: 'your',
 20: 'this',
 21: 'he',
 22: 'but',
 23: 'have',
 24: 'as',
 25: 'thou',
 26: 'him',
 27: 'will',
 28: 'so',
 29: 'what',
 30: 'her',
 31: 'thy',
 32: 'all',
 33: 'no',
 34: 'do',
 35: 'by',
 36: 'shall',
 37: 'if',
 38: 'are',
 39: 'we',
 40: 'thee',
 41: 'our',
 42: 'on',
 43: 'now',
 44: 'good',
 45: 'lord',
 46: 'from',
 47: 'sir',
 48: 'at',
 49: 'come',
 50: 'they',
 51: 'enter',
 52: 'or',
 53: 'would',
 54: 'she',
 55: 'more',
 56: 'which',
 57: 'well',
 58: 'was',
 59: 'o',
 60: 'how',
 61: 'am',
 62: 'then',
 63: 'here',
 64: 'let',
 65: 'their',
 66: 'them',
 67: 'love',
 68: 'when',
 69: 'hath',
 70: 'than',
 71: 'man',
 72: 'there',
 73: 'like',
 74: 'one',
 75: "i'll",
 76: 'an',
 77: 'go',
 78: 'upon',
 79: 'know',
 80: 'us',
 81: 'say',
 82: 'may',
 83: 'make',
 8

# Prepare input sequences

In [8]:
input_sequences = []
for line in df.split('\n'):       #picking senteces line by line
    token_list = tokenizer.texts_to_sequences([line])[0]   #picking indexes
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)          #sentence in index form
print(input_sequences[:15])

[[315, 3], [135, 3], [135, 3, 802], [135, 3, 802, 1], [135, 3, 802, 1, 571], [51, 88], [51, 88, 338], [51, 88, 338, 45], [51, 88, 338, 45, 397], [51, 88, 338, 45, 397, 5], [51, 88, 338, 45, 397, 5, 1199], [51, 88, 338, 45, 397, 5, 1199, 1], [51, 88, 338, 45, 397, 5, 1199, 1, 879], [51, 88, 338, 45, 397, 5, 1199, 1, 879, 5], [51, 88, 338, 45, 397, 5, 1199, 1, 879, 5, 2367]]


picking senteces line by line in index form -
example :
            [315, 3] : ACT I
            [135, 3] : SCENE I

# Pad sequences to bring all the data to same length

In [9]:
max_sequence_length = max([len(x) for x in input_sequences]) # consider the maximum sequence length from the input sequences
max_sequence_length

167

167 is max len, so for 1st line 167-2=165(zero's will add)

In [10]:
input_sequences = sequence.pad_sequences(input_sequences,maxlen=max_sequence_length)
input_sequences[:10]

array([[  0,   0,   0, ...,   0, 315,   3],
       [  0,   0,   0, ...,   0, 135,   3],
       [  0,   0,   0, ..., 135,   3, 802],
       ...,
       [  0,   0,   0, ...,  88, 338,  45],
       [  0,   0,   0, ..., 338,  45, 397],
       [  0,   0,   0, ...,  45, 397,   5]], dtype=int32)

In [11]:
# Separate x and y features
X, y = input_sequences[:,:-1], input_sequences[:,-1]
print(y[0])
y = to_categorical(y, num_classes=total_words)

3


Here, we are taking out last word in 'y'. Because likewise our model
will understand next word.[last word -> next word, last word -> next word]

In [12]:
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#pip install gensim

In [13]:
# word embedding
import gensim.downloader as api

# Download pre-trained Word2Vec (takes some time)
word2vec_model = api.load("word2vec-google-news-300")  # 300-dim vectors

embedding_dim = 300
embedding_matrix = np.zeros((total_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]


 embedding matrix = {i:word}
 embedding matrix = {7:[0.9,0.8,0.7,0.56,.........,]}
Here basically we are fecting actual words by using indexes
3 = I
135 = scene

In [14]:
# Model Building
# Initalize the model
model = Sequential()
# Add the Input layer
model.add(Input(shape=(max_sequence_length,)))
# Add the Embedding layer
model.add(Embedding(input_dim=total_words,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    trainable=False))  # Can set True to fine-tune
# Add LSTM layer
model.add(LSTM(150, return_sequences=True))
# Add another LSTM layer | 150, 100 neurons/units
model.add(LSTM(100))
# Add one Hidden layer
model.add(Dense(100, activation='tanh'))
# Add output layer
model.add(Dense(total_words, activation='softmax'))

In [15]:
model.summary()

In [16]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=["accuracy"])

In [None]:
history = model.fit(X, y, epochs=15, batch_size=64)

In [None]:

nltk.download('punkt_tab')

In [None]:
def generate_text(user_text, next_words=50):
    for _ in range(next_words):
        # preparing our user text ready for the model
        token_list = [tokenizer.word_index.get(w, 0) for w in word_tokenize(user_text.lower())]
        token_list = sequence.pad_sequences([token_list],maxlen=max_sequence_length-1)
        # give the proceesed text to model for prediction of next 50words
        predicted_probs = model.predict(token_list.reshape(1, max_sequence_length-1), verbose=0)
        # [0.98,0.08,0.001,0.35]
        #np.max([0.98,0.08,0.001,0.35]) = 0.98
        #np.argmax([[0.98,0.08,0.001,0.35]]) = 1
        predicted = np.argmax(predicted_probs, axis=-1) # it gives you the index of next word
        output_word = index_word.get(predicted[0], '') # convert the index into word using index_word
        user_text += ' ' + output_word
        # user_text = user_text + output_word
    return user_text

In [None]:
print(generate_text("Enter KING HENRY"))