In [1]:
import spacy
import random
import re
# Load the medium English spaCy pipeline with the parser, tagger and NER
# components disabled; the notebook only uses it to split text into tokens.
nlp = spacy.load('en_core_web_md',disable = ['parser','tagger','ner'])

In [2]:
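# Removing the Roman numerals from the raw text (one-off preprocessing, kept for reference).
# NOTE: the pattern below matches any run of the capitals I, V, X, L, C, D, M, so it also
# replaces those letters inside ordinary words (e.g. "Like" -> "\nike"), not only numerals.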
# with open('shakespeare.txt','r') as f:
#     raw_text = f.read()

# with open('sheakesphere_noroman.txt','w') as f:
#     raw_text_without_roman_int = re.sub("[IVXLCDM]+",'\n',raw_text)
#     f.write(raw_text_without_roman_int)

In [3]:
# Reading in files as a string text
def read_file(filepath, encoding=None):
    '''
    Read a whole text file and return its contents as one string.

    INPUTS:
    filepath : path of the text file to read
    encoding : optional text encoding handed to open(); the default (None)
               keeps the platform default encoding

    RETURNS:
    the complete file contents as a str
    '''
    with open(filepath, encoding=encoding) as f:
        return f.read()

# Load the whole corpus into memory as a single string.
# (The file name matches the one written by the commented-out preprocessing cell.)
word_doc = read_file('sheakesphere_noroman.txt')

In [4]:
# Clean Text and Tokenize
def remove_punc(word_doc):
    '''
    Tokenize word_doc with the module-level spaCy pipeline `nlp` and return
    the lower-cased token texts, leaving out punctuation / whitespace tokens.

    A token is left out when its text occurs as a substring of the filter
    string below (`in` on a str is a substring test, not a per-character one),
    which is why the string lists whole runs of newlines and spaces.
    '''
    unwanted = '\n\n\n\n  \n\n\n  \n    \n\n  \n\n\n  \n    \n    \n\n\n\n  \n\n\n  \n\n  \n\n\n  \n"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'
    kept = []
    for tok in nlp(word_doc):
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

# Tokenize the whole corpus once; `tokens` is a flat list of lower-cased token strings.
tokens = remove_punc(word_doc)

In [5]:
# Drop the first 56 tokens (presumably front matter before the poems begin — TODO confirm).
# NOTE(review): `tokens` is reassigned from itself, so re-running this cell drops
# another 56 tokens each time; re-run the tokenizing cell first when repeating it.
tokens = tokens[56:]

In [6]:
# Create Sequences of Tokens
def token_sequence(tokens,train_len):
    '''
    Slide a window of train_len tokens over tokens, one position at a time.

    INPUTS:
    tokens : list of tokens (any sliceable sequence works)
    train_len : number of tokens in every window

    RETURNS:
    list of windows ordered by start position; empty when tokens holds
    fewer than train_len items
    '''
    # The stop value is len(tokens) + 1 so that the final window, the one
    # ending on the very last token, is produced as well.
    return [tokens[end - train_len:end] for end in range(train_len, len(tokens) + 1)]

# Every window holds 25 input tokens plus 1 final token; the last column is
# split off as the label further down (X = sequences[:,:-1], y = sequences[:,-1]).
train_len = 25 + 1   # 25 inputs + 1 target
text_sequences = token_sequence(tokens,train_len)

In [7]:
# Keras Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the word -> integer id vocabulary from the token windows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
# Replace every token by its integer id: one list of ids per window.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [8]:
# Convert to Numpy Matrix
import numpy as np
# The id lists should all have the same length (train_len), giving a 2-D array with
# one row per window; the column slicing in the feature/label split relies on that.
sequences = np.array(sequences)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.optimizers import Adam

In [10]:
# Creating an LSTM based model function
def create_model(vocabulary_size,seq_len):
    '''
    Build, compile and return the next-word prediction network.

    INPUTS:
    vocabulary_size : size of the embedding input range and width of the softmax output
    seq_len : number of token ids in every input sequence

    RETURNS:
    the compiled Sequential model; its layer summary is printed as a side effect
    '''
    optimizer = Adam()

    layer_stack = [
        Embedding(vocabulary_size,25,input_length = seq_len),
        LSTM(150,return_sequences=True),  # passes the full sequence on to the second LSTM
        LSTM(150),
        Dense(150,activation='relu'),
        Dense(vocabulary_size,activation='softmax'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    network.summary()
    return network

In [11]:
# Feature / Label Split (no separate test set is held out in this notebook)
from tensorflow.keras.utils import to_categorical
# Features: every token id of a window except the last one; label: the last token id.
X = sequences[:,:-1]
y = sequences[:,-1]

# Number of distinct words the tokenizer has counted.
vocabulary_size = len(tokenizer.word_counts)
# Input length fed to the model (number of columns in X).
seq_len = X.shape[1]

# One-hot encode the labels. The extra class (+1) is presumably needed because Keras
# Tokenizer ids start at 1 and index 0 stays unused — confirm against the Tokenizer docs.
y = to_categorical(y,num_classes=vocabulary_size+1)

In [12]:
#callbacks
from tensorflow.keras.callbacks import EarlyStopping,LearningRateScheduler,ModelCheckpoint
import tensorflow as tf

# Stop training once the monitored 'loss' has not improved (decreased) for 15 epochs,
# and restore the weights from the best epoch.
early_stoping = EarlyStopping(monitor='loss',patience=15,restore_best_weights=True,mode='min')


def scheduler(epoch, lr, hold_epochs=220, decay_rate=0.1):
    '''
    Learning-rate schedule for the LearningRateScheduler callback.

    INPUTS:
    epoch : index of the current epoch, as passed in by the callback
    lr : learning rate currently in use
    hold_epochs : number of initial epochs during which lr is returned unchanged
    decay_rate : from epoch hold_epochs on, lr is multiplied by exp(-decay_rate)

    RETURNS:
    lr itself while epoch < hold_epochs, otherwise lr * exp(-decay_rate)
    '''
    # Function-scope import so this cell does not depend on another cell's imports.
    import math

    if epoch < hold_epochs:
        return lr
    # math.exp yields a plain Python float; a tensor result (tf.math.exp) is
    # rejected as a schedule output by newer Keras versions.
    return lr * math.exp(-decay_rate)

# Wrap the schedule function in the Keras callback that applies it during training.
learning_rate_scheduler = LearningRateScheduler(scheduler)

# Write weights only, and only when the monitored 'loss' reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    filepath='./tmp/checkpoint',
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Callbacks referenced by the (currently commented-out) model.fit call.
callback_list = [early_stoping, learning_rate_scheduler, model_checkpoint]

In [13]:
# Train the model
# vocabulary_size+1 matches the one-hot width used for y (num_classes=vocabulary_size+1).
model = create_model(vocabulary_size+1,seq_len)
# The training call is left disabled; a later cell loads a saved model from disk instead.
# history = model.fit(X,y,callbacks=callback_list,epochs=300,batch_size=128)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            80100     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 150)           105600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense (Dense)                (None, 150)               22650     
_________________________________________________________________
dense_1 (Dense)              (None, 3204)              483804    
Total params: 872,754
Trainable params: 872,754
Non-trainable params: 0
_________________________________________________________________


In [18]:
#load model
from pickle import dump,load
from tensorflow.keras.models import load_model

# Load the saved network; compile=False skips restoring the training configuration,
# which is not needed for the prediction-only use below.
model = load_model('epoch300colab.h5',compile=False)

# NOTE(review): unpickling can execute arbitrary code — only load tokenizer files you trust.
# The with-block makes sure the file handle is closed once the tokenizer is loaded.
with open('tokenizer300','rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [19]:
### Generating New Text
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate num_gen_words new words that continue seed_text, always taking
    the most probable next word.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    the generated words joined by single spaces (the seed text is not included)
    '''
    output_text = []
    input_text = seed_text
    for _ in range(num_gen_words):
        # Encode the running text (seed plus everything generated so far) as token ids.
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Bring the ids to exactly seq_len entries; truncating='pre' drops the
        # oldest ids so the most recent words are the ones the model sees.
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')

        # Sequential.predict_classes no longer exists in current Keras releases;
        # the arg-max over predict() gives the same class index.
        class_probs = model.predict(pad_encoded,verbose=0)
        pred_word_ind = int(class_probs.argmax(axis=-1)[0])

        # NOTE(review): raises KeyError if the model predicts an index missing
        # from tokenizer.index_word (e.g. 0) — confirm this cannot happen.
        pred_word = tokenizer.index_word[pred_word_ind]

        # Feed the prediction back in as context for the next step.
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    return ' '.join(output_text)

In [55]:
# Pick a random training window as the seed. randrange excludes its upper bound, so the
# index is always valid; randint(0, n) includes n and could raise IndexError.
rand_ind = random.randrange(len(text_sequences))
random_seed_text = text_sequences[rand_ind]
# Join the window's tokens back into one space-separated string for generate_text.
seed_text = ' '.join(random_seed_text)

In [60]:
# Generate 45 words continuing the random seed; the bare name on the last
# line makes the notebook display the resulting string.
generated_text = generate_text(model,tokenizer,seq_len,seed_text,45)
generated_text

"arise you live in this and dwell in lovers ' eyes sweet longer part in my deeds breast but all this sort as thou particulars not shall mine eyes for eyes have done ine say thy praise and most idol show since wherein like her"

In [49]:
# Sample text with "\n" markers inserted so it prints as separate verse-like lines
# (presumably copied by hand from an earlier generation run — TODO confirm).
gen_text = '''feeling but by others ' seeing for thou art \n true ike for the swallow'd bait \n on purpose laid and rude oubting the tillage \n of thy brand for almost change \n my poverty and yet those cause yet few ' grew a prey'''

In [61]:
# The start of the generated text displayed above, with "\n" markers inserted so it
# prints as separate lines.
# NOTE(review): the double quote at the very end of the string looks like a leftover
# from copying the displayed output — confirm whether it is intended.
gen_text2 = '''arise you live in this and dwell in lovers \n ' eyes sweet longer part in my deeds \n breast but all this sort as thou particulars \n not shall mine eyes for eyes \n have done ine say thy praise and most idol show"'''

### End

In [50]:
# print() renders the "\n" markers as real line breaks.
print(gen_text)

feeling but by others ' seeing for thou art 
 true ike for the swallow'd bait 
 on purpose laid and rude oubting the tillage 
 of thy brand for almost change 
 my poverty and yet those cause yet few ' grew a prey


In [62]:
# print() renders the "\n" markers as real line breaks.
print(gen_text2)

arise you live in this and dwell in lovers 
 ' eyes sweet longer part in my deeds 
 breast but all this sort as thou particulars 
 not shall mine eyes for eyes 
 have done ine say thy praise and most idol show"
