In [None]:
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical, plot_model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import random

In [None]:
# Decode explicitly instead of relying on the platform's default encoding.
# The cleaning step strips characters such as U+FEFF and U+202A-U+202C, which
# only exist in a Unicode encoding, so the corpus is read as UTF-8
# (NOTE(review): confirm sample1.txt is UTF-8).
with open("sample1.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [None]:
# Invisible formatting marks (bidi embedding controls, BOM, zero-width
# joiner/space, right-to-left mark) and private-use code points. These are
# deleted outright so the letters on both sides stay in one token.
_INVISIBLE_CHARS = re.compile(
    r"[\u202a\u202b\u202c\ufeff\u200d\uf089\u200f\u200b\uf08b\uf08c]"
)

# Digits and every kind of whitespace become a plain space.
_DIGITS_AND_WHITESPACE = re.compile(r"[\d\s]")

# Punctuation and symbols that become a plain space. The hyphen is escaped
# (`\-`): written as " - " inside a character class it is parsed as the range
# from space to space, which silently leaves hyphens in the text.
_PUNCTUATION = re.compile(
    r'[/(){}\[\]\|@,;!٪×،*ـ+؟؛"" ... .. . <> _ \- :]'
    r'|[!٬٫﷼٪×*)(ـ+}|؛؟<>‌ ÷؛«» "" \- �]'
)


def cleaning_data(data):
    """Normalize raw corpus text and split it into word tokens.

    Steps:
      1. delete invisible formatting / private-use characters,
      2. replace digits and whitespace with a space,
      3. replace the listed punctuation and symbols (including ``-``)
         with a space,
      4. split on whitespace.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens; empty for empty or
        punctuation-only input.
    """
    data = _INVISIBLE_CHARS.sub("", data)
    data = _DIGITS_AND_WHITESPACE.sub(" ", data)
    data = _PUNCTUATION.sub(" ", data)
    return data.split()

In [None]:
# Clean the raw corpus and split it into a flat list of word tokens.
tokens = cleaning_data(data)
# Bare last expression: the notebook displays the token count.
len(tokens)

In [None]:
# Corpus statistics: overall length and number of distinct tokens.
n_total, n_unique = len(tokens), len(set(tokens))
print("total tokens: {}".format(n_total))
print("unique tokens: {}".format(n_unique))

In [None]:
# Each sample is a sliding window of 25 context words followed by the one
# target word the model has to predict.
train_len = 25+1

# `end` is the exclusive end of each window, so it must run up to and
# including len(tokens); stopping one short would drop the final window.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

In [None]:
# Number of training windows built from the corpus.
len(text_sequences)

In [None]:
# Fit a Keras Tokenizer on the token windows to build the word <-> index
# vocabulary, then convert every window into its list of integer indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Number of distinct words the tokenizer counted while fitting.
vocabulary_size = len(tokenizer.word_counts)
print("Vocabulary size: ", vocabulary_size)

In [None]:
# Convert the list of index windows into a NumPy array so it can be sliced
# by column below (note: rebinds `sequences` from a list to an ndarray).
sequences = np.array(sequences)

In [None]:
def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=150,
                 dense_units=150, plot_file="model.png"):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM (returning sequences) -> LSTM ->
    Dense(relu) -> Dense(softmax over the vocabulary). The model is compiled
    with categorical cross-entropy loss, the Adam optimizer and an accuracy
    metric; its summary is printed as a side effect.

    Parameters
    ----------
    vocabulary_size : int
        Input dimension of the embedding and width of the softmax output.
    seq_len : int
        Number of word indices in each input sequence.
    embedding_dim : int, optional
        Size of each word embedding vector (default 25).
    lstm_units : int, optional
        Units in each of the two LSTM layers (default 150).
    dense_units : int, optional
        Units in the hidden Dense layer (default 150).
    plot_file : str or None, optional
        Path for the architecture diagram (default "model.png"). Pass a
        falsy value to skip drawing it.

    Returns
    -------
    The compiled Sequential model.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    if plot_file:
        # The diagram needs the optional pydot/graphviz tooling; a missing
        # installation should not prevent the model from being built.
        try:
            plot_model(model, to_file=plot_file)
        except (ImportError, OSError) as exc:
            print("Skipping model diagram ({}): {}".format(plot_file, exc))

    return model

In [None]:
# Model inputs: every column except the last one of each window (the context words).
X = sequences[:,:-1]

In [None]:
# Targets: the last column of each window (the word to predict).
y = sequences[:,-1]

In [None]:
# One-hot encode the targets. One extra class is allocated beyond the
# vocabulary size -- presumably because Keras tokenizer indices start at 1,
# leaving index 0 unused (TODO confirm).
y = to_categorical(y, num_classes=vocabulary_size+1)

In [None]:
# Input sequence length = number of context columns in X.
seq_len = X.shape[1]

In [None]:
# Build the network with the same class count (vocabulary_size + 1) used for
# the one-hot targets so the softmax width matches y.
model = create_model(vocabulary_size+1, seq_len)

In [None]:
# Write the model to "model.h5" whenever the monitored training loss reaches
# a new minimum, then train for 250 epochs with that callback attached.
checkpoint = ModelCheckpoint(
    "model.h5",
    monitor='loss',
    mode='min',
    save_best_only=True,
)
model.fit(
    X,
    y,
    batch_size=128,
    epochs=250,
    verbose=1,
    callbacks=[checkpoint],
)

In [None]:
# Save the model as it stands after the last epoch (separate from the
# best-loss checkpoint written to "model.h5" during training).
model.save("final_model.h5")

In [None]:
# Reload the saved model from disk; generation below uses this copy.
final_model = load_model("final_model.h5")

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate `num_gen_words` words, one at a time, continuing `seed_text`.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    The generated words joined by single spaces (the seed is not included);
    an empty string when num_gen_words is 0.
    '''

    output_text = []

    # Encode the seed once and keep extending the list of integer ids.
    # Re-tokenizing a growing string on every step is wasteful, and a
    # predicted word is not guaranteed to map back to the same id.
    encoded_text = list(tokenizer.texts_to_sequences([seed_text])[0])

    for _ in range(num_gen_words):

        # Only the most recent seq_len ids can reach the model.
        encoded_text = encoded_text[-seq_len:]

        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # `predict_classes` has been removed from Keras models; take the
        # argmax of the predicted distribution instead.
        probs = np.asarray(model.predict(pad_encoded, verbose=0))[0]

        # Index 0 is the padding id and has no entry in index_word, so it is
        # excluded from the argmax (hence the +1 offset).
        pred_word_ind = int(np.argmax(probs[1:])) + 1

        encoded_text.append(pred_word_ind)

        output_text.append(tokenizer.index_word[pred_word_ind])

    return ' '.join(output_text)

In [None]:
# Fixed seed so the same seed window is picked on every run.
random.seed(101)
# randrange excludes its upper bound, so the result is always a valid index
# into text_sequences. randint(0, len(...)) includes the upper bound and can
# return len(text_sequences), which is out of range.
random_pick = random.randrange(len(text_sequences))

In [None]:
# The randomly chosen training window (a list of word tokens) used as the seed.
random_seed_text = text_sequences[random_pick]

In [None]:
# generate_text expects the seed as one raw string, so join the tokens with spaces.
seed_text = ' '.join(random_seed_text)

In [None]:
# Generate 10 words continuing the seed; as the last expression of the cell
# the resulting string is displayed.
generate_text(final_model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=10)