<a href="https://colab.research.google.com/github/ryzhokhina/Tensorflow-Dev-Exam-Preparation/blob/main/text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow.keras.utils as ku
import numpy as np
import tensorflow as tf

In [2]:
!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/irish-lyrics-eof.txt \
    -O irish-lyrics-eof.txt

--2023-10-05 17:00:16--  https://storage.googleapis.com/learning-datasets/irish-lyrics-eof.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.207, 142.250.101.207, 2607:f8b0:4023:c06::cf, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68970 (67K) [text/plain]
Saving to: ‘irish-lyrics-eof.txt’


2023-10-05 17:00:16 (122 MB/s) - ‘irish-lyrics-eof.txt’ saved [68970/68970]



In [3]:
# Read the whole lyrics file. The context manager closes the handle as soon as
# the read finishes instead of leaving it open until garbage collection, and
# the explicit encoding keeps the result independent of the platform default.
with open(r"irish-lyrics-eof.txt", encoding="utf-8") as lyrics_file:
    data = lyrics_file.read()
# One lower-cased corpus entry per line of the file.
corpus = data.lower().split("\n")


In [4]:
# Display the second corpus entry (index 1) as a quick sanity check.
corpus[1]

'and you that are blooming in your prime'

In [5]:
# Number of entries produced by splitting the file on newlines.
len(corpus)

1693

In [6]:
# Build the word -> integer id vocabulary from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Vocabulary size used for the embedding/output layers: one more than the
# number of distinct words, because Keras word ids start at 1 and id 0 is
# reserved (it is the value used for padding).
total_words = len(tokenizer.word_index) + 1

# Display the vocabulary size.
total_words

2690

In [7]:
# texts_to_sequences expects a list of texts; a bare string is iterated
# character by character and yields one (mostly empty) sequence per letter.
# Wrap the line in a list and unwrap the single result to get its word ids.
token_list = tokenizer.texts_to_sequences([corpus[1]])[0]

In [8]:
# Encode one lyric line: the line is wrapped in a one-element list and the
# first (only) result is taken, giving the word ids of that line.
tokenizer.texts_to_sequences([corpus[1]])[0]

[2, 11, 15, 31, 361, 8, 24, 1218]

In [9]:
# Expand every corpus line into its n-gram prefixes: for a line encoded as
# [t1, t2, ..., tn] this collects [t1, t2], [t1, t2, t3], ..., [t1, ..., tn].
# Lines with fewer than two known tokens contribute nothing.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)
    )

In [10]:
# Preview the first ten n-gram prefixes.
input_sequences[:10]

[[51, 12],
 [51, 12, 96],
 [51, 12, 96, 1217],
 [51, 12, 96, 1217, 48],
 [51, 12, 96, 1217, 48, 2],
 [51, 12, 96, 1217, 48, 2, 69],
 [2, 11],
 [2, 11, 15],
 [2, 11, 15, 31],
 [2, 11, 15, 31, 361]]

In [11]:
# Length of the longest n-gram prefix; every sequence is padded to this width.
# (The name is misspelled -- "lenght" -- but other cells reference it as-is.)
max_lenght = max(map(len, input_sequences))
max_lenght


16

In [12]:
# Pad every n-gram on the left ('pre') up to max_lenght so all rows share one
# width and the final token of each n-gram sits in the last column.
input_data = pad_sequences(input_sequences, maxlen=max_lenght, padding='pre')


In [13]:
# Number of padded rows (one per n-gram prefix).
len(input_data)

12038

In [14]:
# Features are every column except the last; the label is the last column,
# i.e. the word that follows the preceding context.
seq = input_data[:, :-1]
label = input_data[:, -1]
seq.shape, label.shape


((12038, 15), (12038,))

In [15]:
# Wrap the (context, next-word) pairs in a tf.data pipeline.
train_ds = tf.data.Dataset.from_tensor_slices((seq, label))
# The examples are stored line by line, so neighbouring rows are prefixes of
# the same lyric. A shuffle buffer smaller than the dataset only mixes rows
# locally; sizing it to the full dataset gives a uniform shuffle each epoch.
train_ds = train_ds.shuffle(len(seq)).batch(32).prefetch(tf.data.AUTOTUNE)



In [16]:
# Stacked architecture: embedding -> bidirectional LSTM -> dropout -> LSTM ->
# L2-regularized dense layer -> softmax over the vocabulary.
# NOTE(review): in file order `model` is rebound by the later
# get_lstm_model(...) call before fit() runs, so this network is only
# summarized here -- confirm whether it is still needed.
model = Sequential()
# Declare the input shape with an explicit Input layer (the same approach
# get_lstm_model uses) rather than Embedding's `input_length` argument.
model.add(tf.keras.Input(shape=(max_lenght-1,)))
model.add(Embedding(total_words, 100))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# A layer width must be an integer: floor division keeps it an int, whereas
# `/` always produces a float.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss=SparseCategoricalCrossentropy() , optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" to the output.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 100)           269000    
                                                                 
 bidirectional (Bidirection  (None, 15, 300)           301200    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 15, 300)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1345)              135845    
                                                                 
 dense_1 (Dense)             (None, 2690)              3620740   
                                                        

In [24]:
# create a model
def get_lstm_model(vocab_size, input_size = max_lenght-1, embedding_dim=128, rnn_units=256):
    """Build and compile a single-layer LSTM next-word classifier.

    Args:
        vocab_size: Number of token ids; used as both the embedding input
            dimension and the width of the softmax output layer.
        input_size: Number of token ids in each input sequence. The default
            (``max_lenght-1``) is evaluated once, when this function is
            defined, not on each call.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the LSTM layer.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    net = tf.keras.Sequential()
    # Integer token ids in, one probability per vocabulary entry out.
    net.add(tf.keras.Input(shape=(input_size,), dtype=tf.int64))
    net.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
    net.add(tf.keras.layers.LSTM(rnn_units))
    net.add(tf.keras.layers.Dropout(0.2))
    net.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

    # Labels are integer class ids, hence the sparse cross-entropy loss.
    sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy()
    net.compile(optimizer='adam', loss=sparse_ce, metrics=['accuracy'])
    return net

In [25]:
# Build the single-LSTM model sized to the vocabulary. This rebinds `model`,
# replacing any model previously assigned to that name.
model = get_lstm_model(total_words)


In [27]:
# Train on the batched dataset for 50 epochs and keep the object returned by fit().
history = model.fit(train_ds, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [57]:
# Greedy text generation: repeatedly predict the most likely next word and
# slide it into the fixed-width context window.
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 10

prediction_words = []

# Encode the seed and pad it on the left to the context width the model was
# trained on (max_lenght-1 token ids).
x = tokenizer.texts_to_sequences([seed_text])[0]
x = pad_sequences([x], maxlen=max_lenght-1, padding='pre')

for _ in range(next_words):
  # predict() returns one row of vocabulary probabilities per input row.
  probs = model.predict(x, verbose = 0)[0]
  # Id 0 is the padding value and has no entry in tokenizer.index_word, so it
  # is excluded from the argmax; the +1 maps the sliced position back to the
  # real word id. Without this, a padding prediction would raise KeyError.
  next_index = int(np.argmax(probs[1:])) + 1
  prediction_words.append(tokenizer.index_word[next_index])
  # Slide the window: drop the oldest token and append the new one, keeping
  # the (1, context_width) shape the model expects.
  x = np.append(x[:, 1:], [[next_index]], axis=1)


print(' '.join(prediction_words))







for erin go bragh to darlin jenny stand and see
