In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical

# Generation limits.
# NOTE(review): MAX_WORDS_FOR_PREDICTION is reassigned to 1000 in the text-generation
# cell at the bottom of the notebook, where it is used as the number of words to generate.
MAX_WORDS_FOR_PREDICTION = 4
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
MAX_WORDS_TO_OUTPUT = 10

In [None]:
!rm *.txt
!wget 'https://raw.githubusercontent.com/r0ckYr/GenerativeAI/main/game_of_thrones.txt'
!wget 'https://raw.githubusercontent.com/r0ckYr/GenerativeAI/main/text.txt'
!cat text.txt game_of_thrones.txt >> input.txt

In [None]:
# Show the on-disk size of every entry in the working directory.
!du -sh *

In [None]:
# Read the whole combined corpus into memory as a single string.
# NOTE: the name `input` shadows Python's built-in input() for the rest of the notebook.
with open('input.txt') as corpus_file:
  input = corpus_file.read()

In [None]:
# Split the raw text into a list of lines. `input` changes from str to list here,
# so re-running this cell without re-running the load cell raises AttributeError.
input = input.split('\n')

In [None]:
import random

# Shuffle in place with a dedicated, seeded generator so the lines that end up
# at the front of the list (and are kept by the later `input[:100]` slice) are
# the same on every Restart & Run All. The global `random` state is left untouched.
random.Random(42).shuffle(input)

In [None]:
# Peek at the first line of the list.
input[:1]

In [None]:
# Keep only the first 100 lines; everything below works on this subset.
input = input[:100]

In [None]:
# Build the word -> integer-id vocabulary from the selected lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input)
# saving the tokenizer for predict function
# (context manager so the file is flushed and closed deterministically)
with open('token.pkl', 'wb') as token_file:
  pickle.dump(tokenizer, token_file)
# +1 leaves room for one index beyond the entries of word_index.
vocab_length = len(tokenizer.word_index) + 1

In [None]:
# Build (context, next-word) training pairs from every line.
X = []
y = []
max_sequence_len = 0
print("Unique words : ", vocab_length)
for name in input:
  # Skip empty / single-character lines.
  if len(name) <= 1:
    continue
  tokens = tokenizer.texts_to_sequences([name])[0]
  tokens_length = len(tokens)
  # Track the longest tokenized line.
  max_sequence_len = max(max_sequence_len, tokens_length)

  # For every start position j, emit each prefix tokens[j:i] with tokens[i] as its target.
  # NOTE(review): j stops at tokens_length-3, so a two-token line yields no samples and the
  # pair (tokens[-2] -> tokens[-1]) is never emitted as a standalone context — confirm intended.
  for j in range(tokens_length - 2):
    for i in range(j + 1, tokens_length):
      X.append(tokens[j:i])
      y.append(tokens[i])


# Preview up to 50 samples; slicing avoids an IndexError when fewer were generated.
for context, target in zip(X[:50], y[:50]):
  print(context, target)

In [None]:
# Left-pad every context to max_sequence_len so the samples form one rectangular array.
X = np.array(pad_sequences(X, maxlen=max_sequence_len, padding='pre'))
# Show one padded sample (index 50 when available, otherwise the last one) so a
# small corpus does not crash the cell with an IndexError.
if len(X) > 0:
  print(X[min(50, len(X) - 1)])

In [None]:
# One-hot encode the target word ids, using vocab_length as the number of classes.
y = to_categorical(y, num_classes=vocab_length)
# Inspect one encoded target.
y[1]



In [None]:
# Length (in tokens) of the longest tokenized line; also used as the padded input length.
print(max_sequence_len)

In [None]:
# Check the container type of X after padding.
type(X)

In [None]:
# Next-word model: token embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential([
  Embedding(vocab_length, 100, input_length=max_sequence_len),
  Bidirectional(LSTM(1000)),
  Dense(vocab_length, activation='softmax'),
])

In [None]:
from tensorflow import keras
# Import from the public `keras.utils` namespace: the internal
# `keras.utils.vis_utils` module path is not importable in newer Keras releases.
from keras.utils import plot_model

# Write a diagram of the layer stack to plot.png.
plot_model(model, to_file="plot.png", show_layer_names=True)

In [None]:
from keras.callbacks import ModelCheckpoint

# Configure the optimisation objective for next-word classification.
model.compile(
  loss="categorical_crossentropy",
  optimizer=Adam(learning_rate=0.001),
  metrics=['accuracy'],
)

# Save to disk only when the monitored training loss improves.
checkpoint_file = "word_names_model1.h5"
checkpoint = ModelCheckpoint(
  checkpoint_file,
  monitor='loss',
  verbose=1,
  save_best_only=True,
)

model.fit(X, y, epochs=20, batch_size=4, callbacks=[checkpoint], verbose=1)

In [None]:
from keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('word_names_model1.h5')
# NOTE: pickle.load can execute arbitrary code from the file — only load a
# token.pkl that this notebook produced itself.
# The context manager closes the file handle instead of leaking it.
with open('token.pkl', 'rb') as token_file:
  tokenizer = pickle.load(token_file)

In [None]:
def extract_last_words(input_string, max_words=20):
    """Return at most the last ``max_words`` whitespace-separated words of a string.

    Args:
        input_string: Text to trim.
        max_words: Maximum number of trailing words to keep (default 20).
            Must be a positive integer.

    Returns:
        ``input_string`` unchanged (original whitespace preserved) when it has
        ``max_words`` words or fewer; otherwise the last ``max_words`` words
        joined by single spaces.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # words[-0:] would silently return *every* word, so reject it explicitly.
        raise ValueError("max_words must be a positive integer")

    words = input_string.split()
    if len(words) <= max_words:
        return input_string
    return ' '.join(words[-max_words:])

In [None]:
def predict_word(model, tokenizer, text, maxlen=None):
  """Predict the single most likely next word for ``text``.

  Args:
    model: Object exposing ``predict(sequence, verbose=0)`` that returns one
      score per vocabulary index for the given batch of one sequence.
    tokenizer: Object exposing ``texts_to_sequences`` and a ``word_index``
      mapping of word -> integer id.
    text: Prompt whose continuation is wanted.
    maxlen: Fixed sequence length to left-pad / left-truncate the prompt to.
      When ``None`` it is taken from ``model.input_shape[1]`` if the model
      declares an integer length there; otherwise the sequence is passed as-is.

  Returns:
    The word whose id has the highest score, or ``""`` when that id has no
    entry in ``tokenizer.word_index`` (e.g. a padding index).
  """
  tokens = tokenizer.texts_to_sequences([text])[0]

  if maxlen is None:
    declared_shape = getattr(model, "input_shape", None)
    if (isinstance(declared_shape, (tuple, list)) and len(declared_shape) > 1
        and isinstance(declared_shape[1], int)):
      maxlen = declared_shape[1]

  if maxlen:
    # Mirror 'pre' padding/truncation: keep the most recent tokens and fill the
    # front with zeros, so ever-growing prompts still match the model's input length.
    tokens = tokens[-maxlen:]
    tokens = [0] * (maxlen - len(tokens)) + tokens

  sequence = np.array([tokens], dtype="int32")
  preds = int(np.argmax(model.predict(sequence, verbose=0)))

  # Reverse lookup id -> word; default to "" instead of hitting an unbound local
  # when the winning index is absent from word_index.
  return next(
      (word for word, index in tokenizer.word_index.items() if index == preds),
      "",
  )

In [None]:
# Seed prompt; each predicted word is appended to it and fed back into the model.
text = "The morning had dawned"
# Number of words to generate in this run.
MAX_WORDS_FOR_PREDICTION = 1000
print(text, end=' ')
for i in range(MAX_WORDS_FOR_PREDICTION):
  next_word = predict_word(model, tokenizer, text)
  text = " ".join((text, next_word))
  # Break the output line every 30 words (but not before the very first word).
  if i > 0 and i % 30 == 0:
    print()
  print(next_word, end=' ')