In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
import requests
import string 

In [3]:
response = requests.get('https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt')

In [4]:
data=response.text.split('\n')
data[0]

'This is the 100th Etext file presented by Project Gutenberg, and'

In [5]:
data=data[253:]

In [6]:
data[0],len(data)
for index,line in enumerate(data):
    if index==5:
        break
    print(line)

  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,


In [7]:
df=data.copy()

In [8]:
df=" ".join(data)


In [9]:
def clean_text(doc):
    tokens=doc.split()
    table=str.maketrans('','',string.punctuation)
    tokens=[w.translate(table) for w in tokens]
    tokens=[word for word in tokens if word.isalpha()]
    tokens=[word.lower() for word in tokens]
    return tokens

    

In [10]:
tokens=clean_text(df)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [11]:
print('total words',len(tokens))
print('Total unique words',len(set(tokens)))

total words 898199
Total unique words 27956


In [12]:
# how many words wwe going to use the words
length=50+1
lines=[]
for i in range(length,len(tokens)):
    seq=tokens[i-length:i]
    line=' '.join(seq)
    lines.append(line)
    if i>150000:
        break
print(len(lines))
        
    

149951


In [13]:
lines[0]

'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self'

In [14]:
lines[1]

'fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self thy'

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(lines)
sequences=tokenizer.texts_to_sequences(lines)

In [17]:
sequences=np.array(sequences)

In [18]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# loading
with open('/kaggle/working/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [28]:
X,y=sequences[:,:-1],sequences[:,-1]

In [29]:
vocab_size=len(tokenizer.word_index)+1


In [30]:
# the vocab size is tokens or classes
y=to_categorical(y,num_classes=vocab_size)
X.shape[0]

149951

In [31]:
seq_length=X.shape[1]
print(seq_length)

50


In [32]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras import optimizers

In [33]:
model=Sequential()
model.add(Embedding(vocab_size,50,input_length=seq_length))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100,activation='relu'))
model.add(Dense(vocab_size,activation='softmax'))

2022-12-09 14:31:26.176867: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-09 14:31:26.273849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-09 14:31:26.274701: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-09 14:31:26.276554: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            547100    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10942)             1105142   
Total params: 1,803,142
Trainable params: 1,803,142
Non-trainable params: 0
_________________________________________________________________


In [35]:
with tf.device('/device:GPU:0'):
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.fit(X,y,batch_size=256,epochs=1000)
model.save('final_model.h5')

2022-12-09 14:31:31.681404: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 6563055368 exceeds 10% of free system memory.
2022-12-09 14:31:40.679268: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 6563055368 exceeds 10% of free system memory.
2022-12-09 14:31:45.934320: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/1000


2022-12-09 14:31:49.106751: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


In [36]:
from tensorflow import keras
model_load = keras.models.load_model('/kaggle/working/final_model.h5')

In [37]:
def generate_text_seq(model,tokenizer,text_seq_length,seed_text,n_words):
    text=[]
    #n_words=how many words i need to generate
    
    for _ in range(n_words):
        encoded=tokenizer.texts_to_sequences([seed_text])[0]
        encoded=pad_sequences([encoded],maxlen=text_seq_length,truncating='pre')
        
        y_predict=model.predict(encoded)
        
        
        predicted_word=''
        
        for word ,index in tokenizer.word_index.items():
            if index==np.argmax(y_predict):
                predicted_word=word
                break
        seed_text=seed_text+ ' '+predicted_word
        text.append(predicted_word)
    return ' '.join(text)

In [38]:
seed_text = lines[14343]
print(seed_text)
generate_text_seq(model_load,tokenizer,seq_length,seed_text,100)

is black beautys successive heir and beauty slandered with a bastard shame for since each hand hath put on natures power fairing the foul with arts false borrowed face sweet beauty hath no name no holy bower but is profaned if not lives in disgrace therefore my mistress eyes are raven


'black her eyes so suited and they mourners seem at such who not born fair no weakness art thine spring were as who croppd as thou blessed praise in sorrow give and nor by where still is mad my lord that from thy colour thy soldiership so almost by rude barrenly show was love and sweetly doth reason and play not sweets may dot froms soon that i am eye thou lovst rivers in the world each fits but hard mortality spurn him and take his take to th fire of what time i bade thee for your leave like'

In [39]:
l2='hey what are you doing here'
generate_text_seq(model_load,tokenizer,100,l2,100)

'and thou mayst thou perceive o what she pays himself before the help that lies upon a oath and every man ins kingdom is bred to learn to have extreme abhorrd with feasting upon the nurses plebeians what he cannot come to be that fear let him ten quoth mine the fellow force touchstone now like our inventory o jove a mother are against his horse antony their bosom is wondrous hotter nothing to both rind her i shall do stretch stubbornness o tears exeunt servants which touchstone menenius go forth with us virgilia cannot not speak nothing the power'