In [None]:
### import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf 
import string 
import re 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,LSTM ,Embedding
from keras.models import Sequential
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
import pickle



In [None]:
## load data 
def load_doc(filename):
    """Read the text file at `filename` and return its full contents as one string."""
    with open(filename, 'r') as handle:
        return handle.read()

In [None]:
## clean data
def clean(text):
    """Turn raw text into a list of lowercase, purely alphabetic word tokens.

    Double hyphens are treated as word separators, the text is split on
    whitespace, ASCII punctuation is removed from every token, and any token
    that is not entirely alphabetic afterwards (numbers, empty strings) is
    dropped.
    """
    # translation table that deletes every character in string.punctuation
    strip_punct = str.maketrans('', '', string.punctuation)
    stripped = (raw.translate(strip_punct) for raw in text.replace('--', ' ').split())
    return [word.lower() for word in stripped if word.isalpha()]

In [None]:
## save data 
def save_doc(lines, filename):
    """Write `lines` to `filename`, newline-separated, with no trailing newline."""
    # join first so a bad item fails before the target file is opened/truncated
    joined = '\n'.join(lines)
    with open(filename, 'w') as out_file:
        out_file.write(joined)

In [None]:
## load the whole corpus file into memory as one string
data=load_doc('republic_clean.txt')
# preview the first 200 characters as the cell output
data[:200]

In [None]:
## turn the raw text into a list of lowercase alphabetic word tokens
tokens=clean(data)
# preview the first 10 tokens as the cell output
tokens[:10]

In [None]:
# corpus size: every token, then the number of distinct tokens
print('Total Tokens %d' % len(tokens))
print('Unique Tokens %d' % len(set(tokens)))

In [None]:
## build overlapping training windows: 50 input words followed by 1 output word
length = 50 + 1
# Slide a window of `length` tokens forward one token at a time and store each
# window as a space-joined line. The upper bound is len(tokens) + 1 so the
# final window (the one ending on the last token) is included as well.
sequences = [
    ' '.join(tokens[end - length:end])
    for end in range(length, len(tokens) + 1)
]
    

In [None]:
# show the first training line and how many lines were built
# NOTE(review): sequences[0] raises IndexError if no sequences were produced
print(sequences[0])
print('Total number of sequences %d' %len(sequences))

In [None]:
## persist the training lines, one sequence per line, for later reloading
save_doc(sequences,'republic_sequences.txt')

In [None]:
## define model
def model(vocab_size,sequence_length,embedding_dim=50,lstm_units=100,dense_units=100):
    """Build, compile, summarize and plot the word-level LSTM language model.

    Parameters
    ----------
    vocab_size : int
        Size of the embedding input range and of the softmax output layer.
    sequence_length : int
        Number of word indices in each input sequence.
    embedding_dim : int, optional
        Width of the word embedding vectors (default 50).
    lstm_units : int, optional
        Units in each of the two stacked LSTM layers (default 100).
    dense_units : int, optional
        Units in the hidden ReLU layer before the softmax (default 100).

    Returns
    -------
    The compiled Sequential network.

    Side effects: prints the layer summary and writes the architecture
    diagram to 'model.png'.
    """
    # local name differs from the function name so the builder is not shadowed here
    net=Sequential()
    net.add(Embedding(vocab_size,embedding_dim,input_length=sequence_length))
    # first LSTM returns the full sequence so the second LSTM can consume it
    net.add(LSTM(lstm_units,return_sequences=True))
    net.add(LSTM(lstm_units))
    net.add(Dense(dense_units,activation='relu'))
    # one output probability per vocabulary index
    net.add(Dense(vocab_size,activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    net.summary()
    plot_model(net,to_file='model.png',show_shapes=True)
    return net

In [None]:
## tokenizer: learn the vocabulary from the training lines, then encode each line
tokenizer=Tokenizer()
tokenizer.fit_on_texts(sequences)
# one encoded sequence per training line, in the same order as `sequences`
encoded_sequences=tokenizer.texts_to_sequences(sequences)

In [None]:
# number of distinct words known to the tokenizer, plus one
# NOTE(review): the +1 presumably accounts for index 0 being unused by the
# Tokenizer's word_index — confirm against the Keras Tokenizer docs
vocab_size=len(tokenizer.word_index) +1
vocab_size

In [None]:
# stack the encoded lines into a 2-D array (the column slicing below needs
# every line to have the same length)
input_data=np.array(encoded_sequences)
# inputs are all columns except the last; the target is the last column
x,y=input_data[:,:-1],input_data[:,-1]
# one-hot encode the target over the whole vocabulary
y=to_categorical(y,num_classes=vocab_size)
x.shape

In [None]:
# build the network for this vocabulary size and input length (x.shape[1])
# NOTE(review): this rebinds the name `model` from the builder function to the
# object it returns, so re-running this cell first requires re-running the
# cell that defines `model`
model=model(vocab_size,x.shape[1])

In [None]:
# plot the layer graph of the built model, including tensor shapes
plot_model(model,show_shapes=True)

In [None]:
# train on the full data set: 100 epochs with batches of 128 samples
model.fit(x,y,batch_size=128,epochs=100)

In [None]:
## save the trained model to disk so the generation cells can reload it
model.save('model.h5')

In [None]:
## save the fitted tokenizer so generation can reuse the same word-to-index mapping
with open('tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer,f)
    

In [None]:
## def generate 
def generate_text(model,tokenizer,seq_length,seed_text,n_words):
    """Generate `n_words` words that continue `seed_text`.

    Parameters
    ----------
    model : object with a `predict(x, verbose=0)` method that returns one row
        of per-class scores for each input row.
    tokenizer : object exposing `texts_to_sequences` and `word_index`.
    seq_length : int
        Number of word indices fed to the model per prediction.
    seed_text : str
        Text the generation starts from.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    # reverse lookup built once, instead of scanning word_index for every word
    index_to_word={index:word for word,index in tokenizer.word_index.items()}
    result=[]
    in_text=seed_text
    for _ in range(n_words):
        encoded_text=tokenizer.texts_to_sequences([in_text])[0]
        # keep only the most recent seq_length indices, left-padding short input
        padded_text=pad_sequences([encoded_text],maxlen=seq_length, padding='pre', truncating='pre')
        # predict_classes was removed from Keras; take the argmax of predict instead
        scores=model.predict(padded_text,verbose=0)
        predicted_index=int(np.argmax(scores,axis=-1)[0])
        # an index with no vocabulary entry yields an empty word
        output_word=index_to_word.get(predicted_index,'')
        # feed the prediction back in as context for the next step
        in_text +=' '+output_word
        result.append(output_word)
    return ' '.join(result)



In [None]:
from keras.models import load_model
from random import randint
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# each saved line holds the input words plus one output word
seq_length = len(lines[0].split()) - 1
model = load_model('model.h5')
# load the tokenizer; the context manager closes the file handle promptly
# NOTE(review): pickle.load can execute arbitrary code — only load files this notebook wrote
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
# select a seed text; randint includes both endpoints, so the upper bound must
# be len(lines) - 1 to stay inside the list
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')

In [None]:
from keras.preprocessing.sequence import pad_sequences
# generate 50 new words that continue the randomly chosen seed line
generated = generate_text(model, tokenizer, seq_length, seed_text, 50)
print(generated)