In [24]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense , Flatten ,Embedding,Input,LSTM, BatchNormalization, Dropout, InputLayer, ReLU
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping

In [25]:
# Read the whole corpus into memory. The context manager closes the handle
# even if the read raises, and the explicit encoding keeps decoding
# independent of the platform's default locale.
with open("/kaggle/input/data-for-text-generation-nlp/GOT_TextData.txt", 'r', encoding='utf-8') as file:
    data = file.read()
# Preview the first 1000 characters of the raw text.
data[0:1000]

'A Game Of Thrones \nBook One of A Song of Ice and Fire \nBy George R. R. Martin \nPROLOGUE \n"We should start back," Gared urged as the woods began to grow dark around them. "The wildlings are \ndead." \n"Do the dead frighten you?" Ser Waymar Royce asked with just the hint of a smile. \nGared did not rise to the bait. He was an old man, past fifty, and he had seen the lordlings come and go. \n"Dead is dead," he said. "We have no business with the dead." \n"Are they dead?" Royce asked softly. "What proof have we?" \n"Will saw them," Gared said. "If he says they are dead, that\'s proof enough for me." \nWill had known they would drag him into the quarrel sooner or later. He wished it had been later rather \nthan sooner. "My mother told me that dead men sing no songs," he put in. \n"My wet nurse said the same thing, Will," Royce replied. "Never believe anything you hear at a woman\'s \ntit. There are things to be learned even from the dead." His voice echoed, too loud in the twilit fores

In [26]:
def text_cleaning(text):
    """Normalise raw text into lowercase, space-separated alphabetic words.

    Every character listed in ``string.punctuation`` is deleted, the result is
    split on whitespace, tokens that are not purely alphabetic are dropped,
    and the surviving tokens are lowercased and joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (empty if no token survives).
    """
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)
    without_punctuation = re.sub(punctuation_pattern, '', text)

    kept_words = []
    for token in without_punctuation.split():
        # Tokens containing digits or other non-letters are discarded whole.
        if token.isalpha():
            kept_words.append(token.lower())
    return " ".join(kept_words)

# Normalise the corpus, then keep only its first 750000 characters
# (the cap is a character count, not a word count).
data = text_cleaning(data)[:750000]
# Preview the first 1000 characters of the cleaned text.
data[:1000]

'a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a smile gared did not rise to the bait he was an old man past fifty and he had seen the lordlings come and go dead is dead he said we have no business with the dead are they dead royce asked softly what proof have we will saw them gared said if he says they are dead thats proof enough for me will had known they would drag him into the quarrel sooner or later he wished it had been later rather than sooner my mother told me that dead men sing no songs he put in my wet nurse said the same thing will royce replied never believe anything you hear at a womans tit there are things to be learned even from the dead his voice echoed too loud in the twilit forest page we have a long ride before us gared pointed out eight days maybe nine and night 

In [27]:
# Corpus statistics: total word count versus number of distinct words.
corpus_words = data.split()
no_of_words = len(corpus_words)
no_of_vocab_words = len(set(corpus_words))

print("Total number of words in the text = ", no_of_words)
print("Total number of words in the vocabulary of text = ", no_of_vocab_words)
print("Percentage of vocabulary words = ", (100*no_of_vocab_words)/no_of_words,"%")

Total number of words in the text =  145099
Total number of words in the vocabulary of text =  8779
Percentage of vocabulary words =  6.050351828751404 %


In [28]:
# Slide a window of seq_length words over the corpus, advancing one word at a
# time. Each window is stored as a single space-joined string.
seq_length = 51
text_list = data.split()

window_count = len(text_list) - seq_length + 1
text_sequences = [
    " ".join(text_list[start:start + seq_length])
    for start in range(window_count)
]

# Sanity check: number of words in the final window.
len(text_sequences[-1].split())

51

In [29]:
# Inspect the first five windows; consecutive windows are offset by one word.
text_sequences[0:5]

['a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint',
 'game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of',
 'of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a',
 'thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hi

In [30]:
# Learn the word -> integer id mapping from the windows, then encode every
# window as a list of integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

# One more than the number of distinct words in word_index.
# NOTE(review): presumably the extra slot covers index 0, which the Keras
# Tokenizer does not assign to any word -- confirm against the Keras docs.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

8780

In [31]:
import pickle

# Persist the fitted tokenizer so it can be reloaded alongside the saved model.
# The context manager closes the file, which guarantees the pickle is fully
# flushed to disk before the cell finishes.
with open('tokenizer_text_generation.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [32]:
# Split each encoded window into model inputs (every id except the last) and
# the prediction target (the last id), one-hot encoded over the vocabulary.
sequences = np.array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

# From here on seq_length holds the model input length (column count of X).
seq_length = X.shape[1]
X.shape, y.shape

((145049, 50), (145049, 8780))

In [33]:
# Next-word prediction network: embedding -> two stacked LSTMs -> dense ReLU
# layer -> softmax over the whole vocabulary.
model = Sequential()
# The embedding vector size is set to seq_length, i.e. it reuses the input
# length as the embedding dimension.
model.add(Embedding(vocab_size, seq_length, input_length=seq_length ))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

# The keyword must be spelled `monitor`. With the earlier `moniter` spelling
# the intended metric was not applied, so save_best_only would have tracked the
# callback's default metric rather than the training loss.
checkpoint = tensorflow.keras.callbacks.ModelCheckpoint("text_generation_model.h5",monitor='loss',save_best_only=True,mode='min')
# NOTE(review): factor=0.001 shrinks the learning rate by three orders of
# magnitude after two epochs without a 0.01 loss improvement -- confirm this
# is intentional.
RLR = tensorflow.keras.callbacks.ReduceLROnPlateau(monitor='loss',patience=2,factor=0.001,min_delta=0.01)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            439000    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 8780)              886780    
Total params: 1,476,680
Trainable params: 1,476,680
Non-trainable params: 0
_________________________________________________________________


In [77]:
# Train on the full set of windows (no validation split is passed), with the
# checkpoint and learning-rate callbacks attached.
history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=100,
    callbacks=[checkpoint, RLR],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Save the model in its current (end-of-training) state.
# NOTE(review): this is the same path the ModelCheckpoint callback is
# configured with, so this call overwrites any file that callback wrote.
model.save('text_generation_model.h5')

In [86]:
def produce_output_text(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words by repeatedly predicting the next word.

    At every step the running text (seed plus everything generated so far) is
    encoded with ``tokenizer``, padded/truncated to ``seq_length`` ids, and
    fed to ``model``; the highest-scoring index is mapped back to a word and
    appended to the running text.

    Args:
        model: Object exposing ``predict(encoded, verbose=0)`` that returns one
            row of scores per input row.
        tokenizer: Object exposing ``texts_to_sequences`` and a ``word_index``
            mapping of word -> integer index.
        seq_length: Number of ids fed to the model at each step.
        seed_text: Text that the generation starts from.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not
        included). An index with no entry in ``word_index`` contributes an
        empty string.
    """
    # Build the index -> word table once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    final_words = []
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncating='pre' keeps the most recent seq_length ids.
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        scores = model.predict(encoded, verbose=0)
        # A single row is predicted, so take its argmax as a plain int.
        predicted_index = int(np.argmax(scores, axis=1)[0])
        out_word = index_to_word.get(predicted_index, '')

        in_text = in_text + ' ' + out_word
        final_words.append(out_word)

    return " ".join(final_words)

In [87]:
# Use a randomly chosen training window as the prompt, echo it, then generate
# 50 further words from it.
seed_index = np.random.randint(0, len(text_sequences))
seed_text = text_sequences[seed_index]
print( seed_text + '\n' )
produce_output_text(model, tokenizer, seq_length, seed_text, n_words=50)

a feeble shake of his head too scared even to talk a burst of laughter filled the hall jon heard pyp squeaking in a high voice he stood lets go outside page the round fat face looked up at him suspicious why what will we do outside talk jon said have



'you seen a real hand and a candle to kings landing to the realm and without the dragon the first man remained to ben my lord the baby dreams of the lannisters he told you all i command whisperers i have to go quiet as a second to put it'

In [89]:
# Same experiment with a fresh random prompt and a longer continuation
# (75 words).
seed_index = np.random.randint(0, len(text_sequences))
seed_text = text_sequences[seed_index]
print( seed_text + '\n' )
produce_output_text(model, tokenizer, seq_length, seed_text, n_words=75)

the imp will no doubt swear the blade was lost or stolen while he was at winterfell and with his hireling dead who is there to give him the lie he tossed the knife lightly to ned my counsel is to drop that in the river and forget that it was



'not a man who suits to believe that you are fresher he did not ask me theres a lie ned said hotly you had the king and i did not ask the beast raged or eunuchs it is wrong to ask that you were going to listen the old man replied isnt why i shall be a man grown she said with a sudden affirmation of his head grenn sit at the door toward the'

***

'a game of thrones book one of a song of ice and fire by george r r martin prologue we should start back gared urged as the woods began to grow dark around them the wildlings are dead do the dead frighten you ser waymar royce asked with just the hint of a smile gared did not rise to the bait he was an old man past fifty and he had seen the lordlings come and go dead is dead he said we have no business with the dead are they dead royce asked softly what proof have we will saw them gared said if he says they are dead thats proof enough for me will had known they would drag him into the quarrel sooner or later he wished it had been later rather than sooner my mother told me that dead men sing no songs he put in my wet nurse said the same thing will royce replied never believe anything you hear at a womans tit there are things to be learned even from the dead his voice echoed too loud in the twilit forest page we have a long ride before us gared pointed out eight days maybe nine and night 

In [None]:
from tensorflow.keras.layers import TextVectorization


https://stackabuse.com/gpt-style-text-generation-in-python-with-tensorflowkeras/

***

# Using GPT-2

In [12]:
# Hugging Face transformers: TensorFlow GPT-2 language model and its tokenizer.
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Both the tokenizer and the model come from the same pretrained checkpoint.
gpt2_checkpoint = "gpt2-large"

# NOTE: this rebinds `tokenizer`, which earlier cells use for the Keras
# Tokenizer; cells that need the Keras one must run before this cell.
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
# The end-of-sequence token id is passed as the padding token id.
GPT2 = TFGPT2LMHeadModel.from_pretrained(gpt2_checkpoint, pad_token_id=tokenizer.eos_token_id)

# Show the layer / parameter summary.
GPT2.summary()

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgp_t2lm_head_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFGPT2MainLayer multiple                  774030080 
Total params: 774,030,080
Trainable params: 774,030,080
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Prompt for GPT-2: the same text as the first random seed window printed in
# the LSTM generation section, so the two models' continuations can be compared.
input_text = "a feeble shake of his head too scared even to talk a burst of laughter filled the hall jon heard pyp squeaking in a high voice he stood lets go outside page the round fat face looked up at him suspicious why what will we do outside talk jon said have"
input_text

'a feeble shake of his head too scared even to talk a burst of laughter filled the hall jon heard pyp squeaking in a high voice he stood lets go outside page the round fat face looked up at him suspicious why what will we do outside talk jon said have'

In [23]:
# Encode the prompt that generation is conditioned on; return_tensors='tf'
# requests TensorFlow tensors.
input_ids = tokenizer.encode(input_text, return_tensors='tf')

# Disabled alternative (greedy decoding): generate text until the output
# length (which includes the context length) reaches 100.
# greedy_output = GPT2.generate(input_ids, max_length = 100)

# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(greedy_output[0], skip_special_tokens = True))

In [22]:
# Top-K and Top-P sampling combined: five sampled continuations of the prompt.
generation_options = dict(
    do_sample = True,
    max_length = 2*100,          # to test how long we can generate and it be coherent
    # temperature = .7,
    top_k = 50,
    top_p = 0.85,
    num_return_sequences = 5,
)
sample_outputs = GPT2.generate(input_ids, **generation_options)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    # Decode each sampled id sequence back to text, dropping special tokens.
    decoded_text = tokenizer.decode(sample_output, skip_special_tokens = True)
    print("{}: {}...".format(i, decoded_text))
    print('')

Output:
----------------------------------------------------------------------------------------------------
0: a feeble shake of his head too scared even to talk a burst of laughter filled the hall jon heard pyp squeaking in a high voice he stood lets go outside page the round fat face looked up at him suspicious why what will we do outside talk jon said have you seen this guy? he stood still with his arms crossed he saw pyp jon said pyp. he has a big fat face and jon had never seen such a pretty face before. jon started to get worried pyp was coming in. the two stared at each other. pyp's eyes flashed with a dark purple light and he was in front of them. the fat face looked at him with big teeth jon was afraid to touch pyp or to see his face pyp was smiling and laughing as jon was getting ready to run out jon grabbed the door hanger and began to push it open. he felt like he was gonna explode in a million pieces. pyp was in front...

1: a feeble shake of his head too scared even to t