<a href="https://colab.research.google.com/github/psikrishna/shakespeare-shiptoasts/blob/master/shiptoasts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
#importing keras etc for lstm
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
import keras.utils as ku 
#seeds
import tensorflow
tensorflow.random.set_seed(2)
from numpy.random import seed
seed(1)
import pandas as pd
import numpy as np
import string, os 

In [12]:
#dataset
# Colab sample-data directory; kept as a module-level name.
curr_dir = '/content/sample_data/'
# Load the Shakespeare CSV from the Drive path below.
play_df = pd.read_csv('/content/drive/My Drive/colab/Shakespeare_data.csv')
# One entry per row of the PlayerLine column, in file order.
all_lines = play_df.PlayerLine.tolist()

In [13]:
#clean data
def clean_text(txt):
    """Normalise one line of text before tokenisation.

    Removes every character listed in ``string.punctuation``, lower-cases
    what is left, and drops any character that cannot be represented in
    ASCII.

    Parameters
    ----------
    txt : str
        Raw line of text.

    Returns
    -------
    str
        The cleaned, lower-case, ASCII-only line.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so non-ASCII characters are silently discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only
# Clean every dataset line; the trailing expression previews the first ten results.
corpus = [clean_text(x) for x in all_lines]
corpus[:10]

['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant',
 'and breathe shortwinded accents of new broils',
 'to be commenced in strands afar remote',
 'no more the thirsty entrance of this soil',
 'shall daub her lips with her own childrens blood',
 'nor more shall trenching war channel her fields']

In [14]:
#tokenise
#build n-gram prefix sequences of token ids for next-word prediction
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus, max_lines=7000):
    """Fit the shared ``tokenizer`` and build n-gram training sequences.

    Note: this fits the module-level ``tokenizer`` object, so repeat calls
    fit that same tokenizer again rather than starting from a fresh one.

    Parameters
    ----------
    corpus : list of str
        Cleaned text lines.
    max_lines : int or None, default 7000
        Only the first ``max_lines`` lines are used. Pass ``None`` to use
        the whole corpus.

    Returns
    -------
    input_sequences : list of list of int
        For a line tokenised as ``[t0, t1, ..., tn]`` this contains the
        prefixes ``[t0, t1]``, ``[t0, t1, t2]``, ..., ``[t0, ..., tn]``.
        Lines with fewer than two tokens contribute nothing.
    total_words : int
        ``len(tokenizer.word_index) + 1``; the extra slot covers index 0,
        which the tokenizer does not assign to any word.
    """
    lines = corpus[:max_lines]
    tokenizer.fit_on_texts(lines)
    total_words = len(tokenizer.word_index) + 1
    #data->tokens
    input_sequences = []
    # Tokenise all lines in one call, then expand each line into its prefixes.
    for token_list in tokenizer.texts_to_sequences(lines):
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )
    return input_sequences, total_words
# Build the n-gram sequences and vocabulary size; the trailing expression previews ten sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[523, 4],
 [142, 4],
 [142, 4, 339],
 [142, 4, 339, 1],
 [142, 4, 339, 1, 670],
 [53, 41],
 [53, 41, 84],
 [53, 41, 84, 29],
 [53, 41, 84, 29, 124],
 [53, 41, 84, 29, 124, 3]]

In [15]:
#padding
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into inputs and one-hot labels.

    Every sequence is left-padded to the length of the longest one. The last
    column becomes the label and the remaining columns the predictors. Labels
    are one-hot encoded using the module-level ``total_words``.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences of varying length.

    Returns
    -------
    predictors : numpy.ndarray
        All columns except the last of the padded matrix.
    label : numpy.ndarray
        One-hot encoding of the last column, ``total_words`` classes wide.
    max_sequence_len : int
        Length of the longest input sequence.
    """
    longest = max(len(seq) for seq in input_sequences)
    padded = pad_sequences(input_sequences, maxlen=longest, padding='pre')
    padded = np.array(padded)
    features = padded[:, :-1]
    # The final token of each padded row is the word to predict.
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest
# Build the training arrays; the trailing expression shows their shapes.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
predictors.shape, label.shape

((45584, 33), (45584, 6543))

In [16]:
#design lstm
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=512, dropout_rate=0.4):
    """Build and compile the next-word prediction network.

    The stack is Embedding -> LSTM -> Dropout -> softmax Dense, compiled with
    categorical cross-entropy and the Adam optimiser.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded sequences including the label token; the model
        input is one shorter because the last token is the label.
    total_words : int
        Vocabulary size; used as both the embedding input dimension and the
        width of the softmax output.
    embedding_dim : int, default 10
        Size of each word embedding vector.
    lstm_units : int, default 512
        Number of units in the LSTM layer.
    dropout_rate : float, default 0.4
        Dropout rate applied to the LSTM output.

    Returns
    -------
    A compiled ``Sequential`` model with freshly initialised weights.
    """
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model
# NOTE: this rebinds `model` to a newly built network, so any weights trained on a
# previous `model` are discarded. Run the model.fit cells after this one and before
# generating text.
model = create_model(max_sequence_len, total_words)

In [7]:
# First training run: 20 epochs; verbose=2 prints one line per epoch.
model.fit(predictors, label, epochs=20, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
 - 151s - loss: 6.8352
Epoch 2/20
 - 148s - loss: 6.5221
Epoch 3/20
 - 149s - loss: 6.3837
Epoch 4/20
 - 148s - loss: 6.2285
Epoch 5/20
 - 149s - loss: 6.0558
Epoch 6/20
 - 148s - loss: 5.8748
Epoch 7/20
 - 149s - loss: 5.6698
Epoch 8/20
 - 148s - loss: 5.4506
Epoch 9/20
 - 150s - loss: 5.1983
Epoch 10/20
 - 148s - loss: 4.9331
Epoch 11/20
 - 149s - loss: 4.6593
Epoch 12/20
 - 148s - loss: 4.4028
Epoch 13/20
 - 149s - loss: 4.1425
Epoch 14/20
 - 149s - loss: 3.9092
Epoch 15/20
 - 150s - loss: 3.7043
Epoch 16/20
 - 149s - loss: 3.5133
Epoch 17/20
 - 150s - loss: 3.3376
Epoch 18/20
 - 148s - loss: 3.1830
Epoch 19/20
 - 148s - loss: 3.0486
Epoch 20/20
 - 148s - loss: 2.9147


<keras.callbacks.callbacks.History at 0x7ff0902aab00>

In [8]:
# Second run: another 20 epochs on the same `model`, continuing from its current weights.
model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20
 - 148s - loss: 2.7958
Epoch 2/20
 - 149s - loss: 2.7060
Epoch 3/20
 - 149s - loss: 2.6049
Epoch 4/20
 - 149s - loss: 2.5147
Epoch 5/20
 - 149s - loss: 2.4369
Epoch 6/20
 - 149s - loss: 2.3659
Epoch 7/20
 - 149s - loss: 2.3064
Epoch 8/20
 - 149s - loss: 2.2434
Epoch 9/20
 - 149s - loss: 2.1793
Epoch 10/20
 - 149s - loss: 2.1275
Epoch 11/20
 - 149s - loss: 2.0730
Epoch 12/20
 - 153s - loss: 2.0416
Epoch 13/20
 - 153s - loss: 1.9993
Epoch 14/20
 - 151s - loss: 1.9656
Epoch 15/20
 - 150s - loss: 1.9234
Epoch 16/20
 - 149s - loss: 1.8971
Epoch 17/20
 - 149s - loss: 1.8655
Epoch 18/20
 - 149s - loss: 1.8286
Epoch 19/20
 - 148s - loss: 1.8062
Epoch 20/20
 - 148s - loss: 1.7798


<keras.callbacks.callbacks.History at 0x7ff0901b2a20>

In [9]:
# Third run: another 20 epochs on the same `model`, continuing from its current weights.
model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20
 - 149s - loss: 1.7710
Epoch 2/20
 - 150s - loss: 1.7517
Epoch 3/20
 - 149s - loss: 1.7114
Epoch 4/20
 - 150s - loss: 1.7035
Epoch 5/20
 - 150s - loss: 1.6779
Epoch 6/20
 - 149s - loss: 1.6584
Epoch 7/20
 - 150s - loss: 1.6546
Epoch 8/20
 - 150s - loss: 1.6471
Epoch 9/20
 - 149s - loss: 1.7130
Epoch 10/20
 - 150s - loss: 1.6254
Epoch 11/20
 - 149s - loss: 1.5871
Epoch 12/20
 - 149s - loss: 1.6056
Epoch 13/20
 - 149s - loss: 1.6345
Epoch 14/20
 - 149s - loss: 1.6507
Epoch 15/20
 - 149s - loss: 1.5732
Epoch 16/20
 - 149s - loss: 1.5643
Epoch 17/20
 - 149s - loss: 1.5871
Epoch 18/20
 - 149s - loss: 1.5730
Epoch 19/20
 - 149s - loss: 1.5572
Epoch 20/20
 - 149s - loss: 1.5906


<keras.callbacks.callbacks.History at 0x7ff09025a9b0>

In [10]:
# Fourth run: another 20 epochs requested on the same `model`.
model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20
 - 149s - loss: 1.6425
Epoch 2/20
 - 149s - loss: 1.5973
Epoch 3/20


KeyboardInterrupt: ignored

In [17]:
#generate text
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` with ``next_words`` predicted words.

    On each step the current text is tokenised with the module-level
    ``tokenizer``, left-padded to the model's input length, and the highest
    scoring class of the model's output is appended as the next word.

    Parameters
    ----------
    seed_text : str
        Starting text.
    next_words : int
        Number of words to append.
    model
        Trained model exposing ``predict``; its output is expected to hold one
        score per vocabulary index.
    max_sequence_len : int
        Padded sequence length used in training (including the label token);
        inputs are padded to ``max_sequence_len - 1``.

    Returns
    -------
    str
        The seed followed by the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # `Sequential.predict_classes` is not available in newer Keras/TensorFlow
        # releases; arg-max over the predicted scores gives the same class index.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])
        # An index with no vocabulary entry (e.g. 0) appends an empty word,
        # matching the previous behaviour.
        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text.title()

In [18]:
# Generate 20 words from the seed "Julius".
print(generate_text("Julius", 20, model, max_sequence_len))

Julius Rouen Rouen Assistance Leg Repugn Hearkend Pick Hundred Hearkend Pick Hearkend Bawdyhouse Guise Sway Cowardly Resolutely Faults Faults Faults Faults


In [19]:
# Generate 20 words from the seed "Mercutio".
print(generate_text("Mercutio", 20, model, max_sequence_len))

Mercutio Blush Blush Conversing Haunts Conversing Haunts Herself Herself Lath Lath Bounds Bounds Stroke Outward Stonyhearted Outward Outward Heaviness Outward Outward


In [20]:
# Generate 20 words from the seed "Caesar".
print(generate_text("Caesar", 20, model, max_sequence_len))

Caesar Sequestration Stir Sequestration Cuckold Stonyhearted Pranks Flamecoloured Parings Husband Thumb Happiness Haunts Wanted Stir Variest Injurious Assistance Solemnly Tale Fiend
