In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import re
from nltk.corpus import stopwords
import string

import keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.optimizers import Adam

import pickle

# Load Data

In [None]:
def load_data(filename, encoding='utf-8'):
  """Read an entire text file and return its contents as one string.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale settings.

  Returns:
    The complete contents of the file as a single ``str``.

  Raises:
    OSError: If the file cannot be opened or read.
    UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r', encoding=encoding) as file:
    return file.read()
# Read the corpus and keep only its first 1,200,000 characters
# (presumably to bound memory use and training time -- confirm).
data = load_data(r"/kaggle/input/gameofthrones/got1.txt")[:1_200_000]

In [None]:
# Preview the first 1000 characters of the raw corpus.
data[:1000]

# Prepare Corpus

In [None]:
def clean_text(text):
  """Reduce raw text to lowercase, space-separated alphabetic words.

  Every character in ``string.punctuation`` is deleted (not replaced by a
  space, so ``well-known`` becomes ``wellknown``). The text is then split on
  whitespace, tokens that are not purely alphabetic are discarded, and the
  survivors are lowercased.

  Args:
    text: The raw input string.

  Returns:
    The kept words joined by single spaces.
  """
  punct_pattern = '[%s]' % re.escape(string.punctuation)
  stripped = re.sub(punct_pattern, '', text)
  kept_words = (token.lower() for token in stripped.split() if token.isalpha())
  return " ".join(kept_words)

In [None]:
# Normalise the raw corpus: punctuation removed, alphabetic words only, lowercased.
cleaned_data = clean_text(data)

In [None]:
# Preview the first 100 characters of the cleaned corpus.
cleaned_data[:100]

In [None]:
# Compare the corpus size with its vocabulary size.
# Split once and reuse the list for both bars instead of re-tokenising.
corpus_words = cleaned_data.split()
plt.bar(x=["Total words", "Unique words"],
        height=[len(corpus_words), len(set(corpus_words))],
        color=sns.color_palette('pastel'))
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.ylabel("Word count")
plt.title("Corpus size vs. vocabulary size")
plt.show()

In [None]:
# Report how many words the cleaned corpus holds and how many are distinct.
word_list = cleaned_data.split()
total_count, unique_count = len(word_list), len(set(word_list))
print('Total Tokens: %d' % total_count)
print('Unique Tokens: %d' % unique_count)

In [None]:
# Slide a window of seq_len + 1 consecutive words over the corpus; each window
# is stored as one space-joined string in sequence_doc.
sequence_doc = []
seq_len = 50
l = seq_len + 1
tokens = cleaned_data.split()

# `i` is the exclusive end index of the window, so it has to reach len(tokens)
# itself; stopping at len(tokens) - 1 would drop the window that ends on the
# final word of the corpus.
for i in range(l, len(tokens) + 1):

    seq = tokens[i-l:i]

    line = ' '.join(seq)
    sequence_doc.append(line)

In [None]:
# Show only a few windows; displaying the whole list would flood the cell
# output with one line per window in the corpus.
sequence_doc[:5]

In [None]:
# Fit a Keras Tokenizer on the window strings, then encode every window as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequence_doc)
sequences = tokenizer.texts_to_sequences(sequence_doc)

# One more than the number of distinct words -- presumably because Tokenizer
# ids start at 1 and id 0 is left unused (TODO confirm against the Keras docs).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Stack the encoded windows into a 2-D array: every column except the last is
# model input, the last column is the word id to predict.
sequences = np.array(sequences)
X = sequences[:, :-1]
# One-hot encode the target ids over vocab_size classes.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

# Number of input ids per sample, taken from the input matrix width.
seq_length = X.shape[1]

In [None]:
# Print the input and target array shapes, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# Display the matrix of input word ids.
X

In [None]:
# Display the one-hot encoded target matrix.
y

# Prepare Model

In [None]:
def define_model(vocab_size, seq_length):
    """Build, compile and summarise the next-word prediction network.

    The stack is an Embedding layer (output size 100), two LSTM layers of 200
    units each (the first returns its full sequence so the second can consume
    it), a 200-unit ReLU Dense layer, and a softmax Dense layer with one
    output per vocabulary entry.

    Args:
        vocab_size: Number of classes; used as the Embedding input dimension
            and as the width of the softmax output.
        seq_length: Number of word ids in each input sample.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    layer_stack = [
        Embedding(vocab_size, 100, input_length=seq_length),
        LSTM(200, return_sequences=True),
        LSTM(200),
        Dense(200, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    ]
    network = Sequential(layer_stack)

    # Targets are one-hot vectors, hence categorical cross-entropy.
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [None]:
# Build and compile the network; define_model also prints its summary.
model = define_model(vocab_size, seq_length)

In [None]:
# Train on all of (X, y) for 10 epochs with batches of 128 samples.
# No validation data is passed, so only training metrics are reported.
model.fit(X, y, batch_size=128, epochs=10)

In [None]:
# Persist the trained network to 'text_gen_model.h5'.
model.save('text_gen_model.h5')
# save the tokenizer
# The context manager closes (and therefore flushes) the pickle file
# deterministically instead of leaving the handle to the garbage collector.
with open('tokenizer_text_gen.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Generate Text Sequence

In [None]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    On every step the running text is encoded with ``tokenizer``, padded or
    truncated to ``seq_length`` ids (the earliest ids are dropped when it is
    too long), passed to ``model.predict``, and the highest-scoring class
    index is mapped back to a word and appended to the running text.

    Args:
        model: Object exposing ``predict(batch, verbose=0)`` that returns one
            row of class scores per input row.
        tokenizer: Object exposing ``texts_to_sequences`` and a ``word_index``
            mapping of word -> integer id.
        seq_length: Number of ids the model expects per input row.
        seed_text: Text the generation starts from.
        n_words: How many words to generate.

    Returns:
        The generated words joined by single spaces; the seed text is not
        included. A predicted index with no entry in ``word_index``
        contributes an empty string.
    """
    # Build the id -> word lookup once rather than scanning word_index for
    # every generated word. setdefault keeps the first word seen for an id,
    # which is what a first-match linear scan would return.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    result = []
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict(encoded, verbose=0)
        # argmax over axis 1 yields one index per batch row; the batch has a
        # single row, so take that element as a plain int for the lookup.
        predicted_index = int(np.argmax(yhat, axis=1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# Pick a random training window as the seed, echo it, then show 50 generated
# words that continue it.
seed_idx = np.random.randint(0, len(sequence_doc))
seed_text = sequence_doc[seed_idx]
print(seed_text + '\n')
generate_seq(model, tokenizer, seq_length, seed_text, 50)

In [None]:
# Pick another random training window as the seed, echo it, and show only the
# first 60 characters of the 50-word continuation.
seed_idx = np.random.randint(0, len(sequence_doc))
seed_text = sequence_doc[seed_idx]
print(seed_text + '\n')
generated_text = generate_seq(model, tokenizer, seq_length, seed_text, 50)
generated_text[:60]