In [1]:
import numpy as np
import pandas as pd

In [2]:
from tensorflow import keras

In [15]:
import tensorflow

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer #tokenization
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences #padding~equallingText
from keras.models import Sequential

In [19]:
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Data Ingestion

In [20]:
# Download the Shakespeare lines dataset (a CSV hosted on GitHub) into a DataFrame.
plays = pd.read_csv(
    "https://raw.githubusercontent.com/densaflorativa/shakespeare/refs/heads/master/Shakespeare_data.csv"
)

In [21]:
# Preview the first 10 rows of the loaded frame.
plays.head(10)

Unnamed: 0,Data-line,Play,Player Line number,Act-Scene-Line,Player,Player-Line
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"


In [23]:
# (number of rows, number of columns) of the loaded frame.
plays.shape

(111396, 6)

In [26]:
# Column labels of the loaded frame.
plays.columns


Index(['Data-line', 'Play', 'Player Line number', 'Act-Scene-Line', 'Player',
       'Player-Line'],
      dtype='object')

In [28]:
# Keep only the spoken-text column. Selecting it by name (instead of dropping
# the other five columns) leaves the same single-column frame but makes this
# cell safe to re-run: a second `drop` would raise KeyError because the
# columns are already gone.
plays = plays[["Player-Line"]]

In [None]:
# Take 100 random lines from the Player-Line column and join them into a single string,
# then tokenize it and train the model.

In [30]:
# Confirm that only the Player-Line column is left.
plays.head(3)

Unnamed: 0,Player-Line
0,ACT I
1,SCENE I. London. The palace.
2,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."


In [31]:
# Draw 100 lines at random from the text column; the fixed seed keeps the
# draw repeatable between runs.
sample_lines = plays["Player-Line"].sample(
    n=100,
    random_state=42,
)

In [32]:
# Display the sampled lines (the index shows each line's original row number).
sample_lines

Unnamed: 0,Player-Line
49287,"That hath deprived me of your grace and favour,"
76725,"Their bodies, even to loathing, for they so st..."
46485,Men at some time are masters of their fates:
76934,"Disgorges such a tempest forth,"
35083,"That monster, custom, who all sense doth eat,"
...,...
40598,Proceed.
95234,SCENE VI. The same. A banqueting-room in Timon...
21982,Will kneel to him with thanks.
30834,"And a demand who is't shall die, I'd say"


In [33]:
# Concatenate the sampled lines into one space-separated training text.
data = " ".join(sample_lines.tolist())

In [34]:
# Display the joined training text.
data

"That hath deprived me of your grace and favour, Their bodies, even to loathing, for they so stunk, Men at some time are masters of their fates: Disgorges such a tempest forth, That monster, custom, who all sense doth eat, To this chair bind him. Villain, thou shalt find-- Dexterity so obeying appetite Hector, in view of Trojans and of Greeks, Who know the world, see heaven, but, feeling woe, I should my tears let fall upon your cheek, Let not that doctor e'er come near my house: Arise, and say how thou camest here. Have I not heard these islanders shout out and one thing more, that you be never so hardy to And say I am Revenge, sent from below To furnish me upon my longing journey. All his revenue. And thus the native hue of resolution Why, art thou mad, old fellow? porringer fell off her head, for kindling such a That you shall stifle in your own report Is not this suit of mine, that thou declare It is as easy to count atomies as to resolve the But let this same be presently perform'

# Tokenization

In [35]:
# Fit a tokenizer on the training text so that every word gets an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index



In [36]:
# Number of output classes: one per known word, plus one extra slot for id 0,
# which the padded sequences use as filler.
total_word = 1 + len(word_index)

# Creating Sequences

In [38]:
# Encode the whole training text as one flat list of word ids.
token_list = tokenizer.texts_to_sequences([data])[0]

# Container for the n-gram prefixes built from `token_list`.
input_sequences = []

In [40]:
# Build one training sequence per prefix of the text: the first 2 token ids,
# the first 3, ... up to the complete token list. The last id of each prefix
# is later split off as the prediction target.
#
# The list is rebuilt from scratch (rather than appended to) so that
# re-running this cell neither duplicates sequences nor fails once the padding
# cell has replaced `input_sequences` with an array.
input_sequences = [token_list[:end] for end in range(2, len(token_list) + 1)]

In [42]:
# Length of the longest prefix; every sequence is padded up to this length.
max_sequence_len = max(map(len, input_sequences))

# Left-pad the shorter prefixes with zeros so all rows have equal length and
# can be stacked into a single 2-D array.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding="pre",
)

In [43]:
# Display the padded sequence matrix (zeros on the left are padding).
input_sequences

array([[  0,   0,   0, ...,   0,   4,  61],
       [  0,   0,   0, ...,   4,  61, 102],
       [  0,   0,   0, ...,  61, 102,  13],
       ...,
       [  0,   0,   4, ...,   5,  45,   6],
       [  0,   4,  61, ...,  45,   6, 386],
       [  4,  61, 102, ...,   6, 386,  64]], dtype=int32)

# Splitting Data into Input and Output

In [45]:
# Each row is a padded prefix: everything except the final id is the model
# input, and the final id is the word to predict.
X = input_sequences[:, :-1]

# One-hot encode the target ids over the `total_word` classes.
y = to_categorical(input_sequences[:, -1], num_classes=total_word)

In [46]:
# Show the input matrix: the padded prefixes without their final id.
print(X)

[[  0   0   0 ...   0   0   4]
 [  0   0   0 ...   0   4  61]
 [  0   0   0 ...   4  61 102]
 ...
 [  0   0   4 ...  17   5  45]
 [  0   4  61 ...   5  45   6]
 [  4  61 102 ...  45   6 386]]


In [48]:
# Show the target matrix produced by to_categorical.
print(y) #one hot encoded

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]]


# Model Building

In [50]:
# Next-word model: embedding -> simple recurrent layer -> softmax over the vocabulary.
model = Sequential()

# Declare the input shape explicitly: one row of `max_sequence_len - 1` word ids.
# Embedding's `input_length` argument is deprecated in Keras 3 (it only triggers
# a warning), so an Input layer is used instead.
model.add(keras.Input(shape=(max_sequence_len - 1,)))

# Map each word id to a 1000-dimensional vector.
model.add(Embedding(total_word, 1000))

# Recurrent layer with 200 units.
model.add(SimpleRNN(200))

# One softmax probability per class (`total_word` classes in total).
model.add(Dense(total_word, activation="softmax"))



In [51]:
# Categorical cross-entropy matches the one-hot targets; accuracy is tracked
# as an additional metric during training.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [54]:
# Train for 15 epochs in mini-batches of 64, printing a progress bar per epoch.
# The returned History object is the cell's displayed output.
model.fit(
    X,
    y,
    epochs=15,
    batch_size=64,
    verbose=1,
)

Epoch 1/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 119ms/step - accuracy: 0.7109 - loss: 3.4958
Epoch 2/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 115ms/step - accuracy: 0.8243 - loss: 2.9936
Epoch 3/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - accuracy: 0.8675 - loss: 2.5753
Epoch 4/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.9265 - loss: 2.1728
Epoch 5/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 117ms/step - accuracy: 0.9617 - loss: 1.7128
Epoch 6/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 117ms/step - accuracy: 0.9866 - loss: 1.3525
Epoch 7/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - accuracy: 0.9972 - loss: 1.0214
Epoch 8/15
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.9997 - loss: 0.7813
Epoch 9/15
[1m12/12[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7dd7e5a5ca10>

In [55]:
def predict_next_word(model,tokenizer,text,len):
  """Return the word the model scores highest as the continuation of `text`.

  Args:
    model: Trained model; `model.predict` is called on a single padded row of
      word ids and is expected to return one score per word id.
    tokenizer: Fitted tokenizer providing `texts_to_sequences` and the
      id -> word mapping `index_word`.
    text: Seed text whose next word should be predicted.
    len: Length of the padded training sequences; the seed is padded (or
      truncated) to `len - 1` ids. The name shadows the built-in `len` and is
      kept only so existing callers keep working.

  Returns:
    The predicted word, or None when `text` contains no word known to the
    tokenizer or when the highest-scoring id has no word attached (e.g. the
    padding id 0).
  """
  # Alias the parameter so the code below does not read like a call to len().
  max_sequence_len = len

  token_ids = tokenizer.texts_to_sequences([text])[0]
  if not token_ids:
    # Nothing recognisable to condition on: an all-padding input would only
    # yield an arbitrary guess, so report "no prediction" instead.
    return None

  padded = pad_sequences([token_ids], maxlen=max_sequence_len - 1, padding='pre')
  scores = model.predict(padded, verbose=0)

  # `scores` holds one row per input; there is exactly one input row here.
  predicted_word_index = int(np.argmax(scores[0]))

  # Direct id -> word lookup instead of scanning every (word, id) pair.
  return tokenizer.index_word.get(predicted_word_index)

In [56]:
# Seed text: the opening words of the training text shown above.
text="That hath deprived me of your"

In [57]:
# Predict the word following the seed; the seed is padded to the training input length.
predict_next_word(model,tokenizer,text,max_sequence_len)

'grace'

In [60]:
# Second seed. It ends in "for", whereas the training text reads
# "... such a tempest forth," so this context does not occur verbatim in it.
text2="Disgorges such a tempest for"

In [62]:
# Predict the word following the second seed.
predict_next_word(model,tokenizer,text2,max_sequence_len)

'that'

In [63]:
# A very simple model; honestly, a plain feed-forward ANN could probably do this too. Nothing special, but let's see.
# In further projects we could add: pre-processing layers, a dropout layer, callbacks (early stopping, checkpoints), normalization / batch normalization.