# Building a lyrics GPT 
 

In [1]:
# Check for GPU
!nvidia-smi -L

GPU 0: Quadro M4000 (UUID: GPU-a6fd9309-8a85-fb51-3725-dfa45cb9c348)


In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
#!pip install transformers

## Get data


In [8]:
# we will use all lyrics from Romanian hip-hop band B.U.G. Mafia, Poems by Eminescu and Bacovia
!wget https://raw.githubusercontent.com/psilly-billy/Lyrics_GPT/main/lyrics_dataset.txt

--2023-03-27 11:48:55--  https://raw.githubusercontent.com/psilly-billy/Lyrics_GPT/main/lyrics_dataset.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 915130 (894K) [text/plain]
Saving to: ‘lyrics_dataset.txt’


2023-03-27 11:48:55 (25.3 MB/s) - ‘lyrics_dataset.txt’ saved [915130/915130]



## Inspect data

In [9]:
with open ('lyrics_dataset.txt', 'r', encoding = 'utf-8') as f:
  text = f.read()

  #read data and inspect, 'r' - for reading, create a dataset 'text' from the file 

In [10]:
print ("Character Length of dataset: ", len(text))

Character Length of dataset:  878926


In [11]:
# look at first 1000 characters
print(text[:1000])

 B. U. G. Mafia - Şi cui ii pasă

Caddillac:
Ai un Mercedes de moare lumea, dar n-ai clasă ca şi el,
Dacă te-ar vedea patronii, ar renega acest model..
Vorbeşti la Vertu, cu vreo somitate, la pertú,
Tare, să vadă lumea cine eşti tu..
Dar cine eşti tu? N-ai nimic de arătat,
Doar ambalaj strălucitor s-ascundă mult căcat..
Adevărat, e tare p***a ta,
Vreo trei ar da la ea, fiindcă restu' au dat deja..
Şi, deşi n-are nimic în cap, se vrea a fi vedetă,
Tu bagi banii ca-n depozit că vrea etichetă,
E plină de silicon, din subsol până-n balcon,
Ai grijă, nu umbla cu acu' pe lângă balon..
Şi nu uita să cotizezi la băieţi, în cluburi,
Că s-ar putea ca alţi băieţi să te scoată-n şuturi..
Acum, n-o lua personal, am dat doar un exemplu,
Oricum, sunt destui lingăi care să-ţi facă templu, nu?.

Tataee:
Eşti o prinţesă.. Şi cui îi pasă?
Te dai în presă.. Şi cui îi pasă?
Că eşti la modă.. Şi cui îi pasă?
Hai, dă-te dracu', că nu ne pasă..
Băiatu' tatii.. Şi cui îi pasă?
Arunci cu banii.. Şi cui îi pasă?

In [12]:
# How many unique characters we have in this dataset?
chars = sorted(list(set(text))) # call a 'set' of all the characters that are in this dataset, make a 'list' out of it and after sort that

vocab_char = len(chars) #how many unique characters the model can see and use

print(chars)
print("".join(chars))
print (vocab_char)

['\t', '\n', ' ', '!', '"', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '¬', '³', 'Â', 'Î', 'Ñ', 'à', 'â', 'í', 'î', 'ï', 'ö', 'ú', 'Ă', 'ă', 'Ŕ', 'Ş', 'ş', 'Ţ', 'ţ', 'Ș', 'ș', 'ț', '—', '’', '“', '”', '„', '‟', '…']
	
 !"%&'()*,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz~¬³ÂÎÑàâíîïöúĂăŔŞşŢţȘșț—’“”„‟…
115


### Removing some characters and replacing some words 
The data has some words that are censored and some other characters thata we want to take out 

In [13]:

# Specify the words you want to replace
words_to_replace = {"Ñ":"N","í":"i","ï":"i","ú":"u","p***a":"pizda",
                    "p**a":"pula", "p*z*a":"pizda",
                    "ga*aza":"gaoaza", "m**e":"muie",
                    "c***t":"cacat", "p***":"pula", "f*ă":"futa",
                    "sl***z":"sloboz", "f*teti":"futeti", "c***t":"cacat", "cuprins":" ",
                    "George":" ", "Bacovia":" ", "Plumb":" "
                    }

for old_word,new_word in words_to_replace.items():
    text= text.replace(old_word,new_word)

In [14]:
# Let's clean aout dataset of unwanted characters
import re
chars_to_remove = "[%&=³‟*\\^|~£§©¬—■_@+/$`()“”„;...,?‘’…]"  # Specify the characters we want to remove


# Create a translation table
trans_table = text.maketrans("", "", chars_to_remove)


# Use the translate method to remove the characters
cleaned_text = text.translate(trans_table)

This code uses the `string.maketrans()` method to create a translation table that maps each character in the characters_to_remove string to None.
Then, it uses the `str.translate()` method to remove the unwanted characters from the original string by applying the translation table.

In [15]:
chars = sorted(list(set(cleaned_text))) # call a 'set' of all the characters that are in this dataset, make a 'list' out of it and after sort that

vocab_char = len(chars) #how many unique characters the model can see and use

print(chars)
print("".join(chars))
print (vocab_char)

['\t', '\n', ' ', '!', '"', "'", '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Â', 'Î', 'à', 'â', 'î', 'ö', 'Ă', 'ă', 'Ŕ', 'Ş', 'ş', 'Ţ', 'ţ', 'Ș', 'ș', 'ț']
	
 !"'-0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÂÎàâîöĂăŔŞşŢţȘșț
86


## Creating a new file with cleaned text

In [16]:
file_name = "cleaned_text.txt"

# Open the file in write mode
with open(file_name, "w") as f:
    # Write the string to the file
    f.write(cleaned_text)

In [17]:
print(cleaned_text[:1000])

 B U G Mafia - Şi cui ii pasă

Caddillac:
Ai un Mercedes de moare lumea dar n-ai clasă ca şi el
Dacă te-ar vedea patronii ar renega acest model
Vorbeşti la Vertu cu vreo somitate la pertu
Tare să vadă lumea cine eşti tu
Dar cine eşti tu N-ai nimic de arătat
Doar ambalaj strălucitor s-ascundă mult căcat
Adevărat e tare pizda ta
Vreo trei ar da la ea fiindcă restu' au dat deja
Şi deşi n-are nimic în cap se vrea a fi vedetă
Tu bagi banii ca-n depozit că vrea etichetă
E plină de silicon din subsol până-n balcon
Ai grijă nu umbla cu acu' pe lângă balon
Şi nu uita să cotizezi la băieţi în cluburi
Că s-ar putea ca alţi băieţi să te scoată-n şuturi
Acum n-o lua personal am dat doar un exemplu
Oricum sunt destui lingăi care să-ţi facă templu nu

Tataee:
Eşti o prinţesă Şi cui îi pasă
Te dai în presă Şi cui îi pasă
Că eşti la modă Şi cui îi pasă
Hai dă-te dracu' că nu ne pasă
Băiatu' tatii Şi cui îi pasă
Arunci cu banii Şi cui îi pasă
Eşti un spectacol Şi cui îi pasă
Hai dă-te dracu' că nu ne pa

# Define Transformer Architecture

In [11]:
import tensorflow as tf

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % num_heads == 0

        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)

        return output

    @staticmethod
    def scaled_dot_product_attention(q, k, v, mask):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)

        return output, attention_weights


class FeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, d_model, dff):
        super(FeedForwardNetwork, self).__init__()

        self.fc1 = tf.keras.layers.Dense(dff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)

        return x


class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerBlock, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        attn_output = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

# Usage example
d_model = 128
num_heads = 8
dff = 512
rate = 0.1

sample_transformer_block = TransformerBlock(d_model, num_heads, dff, rate)

input_shape = (None, d_model)
sample_input = tf.keras.Input(shape=input_shape)
sample_output = sample_transformer_block(sample_input, training=False, mask=None)

sample_transformer_model = tf.keras.Model(inputs=sample_input, outputs=sample_output)
print(sample_transformer_model.summary())



Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None, 128)]       0         
                                                                 
 transformer_block_1 (Transf  (None, None, 128)        198272    
 ormerBlock)                                                     
                                                                 
Total params: 198,272
Trainable params: 198,272
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load and preprocess your dataset
def load_and_preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().splitlines()
    return text

file_path = "cleaned_text.txt"
text_data = load_and_preprocess_data(file_path)

# Tokenize your dataset
vocab_size = 20077
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create input sequences and labels
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='post')

input_data = input_sequences[:, :-1]
labels = input_sequences[:, 1:]
#labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Build the model using the custom TransformerBlock
d_model = 512
num_heads = 16
dff = 512
rate = 0.1

inputs = tf.keras.layers.Input(shape=(max_sequence_len - 1,))
embedding = tf.keras.layers.Embedding(total_words, d_model)(inputs)
transformer_block = TransformerBlock(d_model, num_heads, dff, rate)(embedding)
dropout = tf.keras.layers.Dropout(rate)(transformer_block)
outputs = tf.keras.layers.Dense(total_words, activation='softmax')(dropout)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

# Custom loss function
def masked_categorical_crossentropy(y_true, y_pred):
    mask = tf.cast(y_true[:, :, -1], tf.bool)
    y_true = y_true[:, :, :-1]

    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=False)
    mask = tf.cast(mask, loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

# Compile and train the model
model.compile(loss=masked_categorical_crossentropy, optimizer='adam', metrics=['accuracy'])





def data_generator(input_data, labels, batch_size, total_words):
    data_len = len(input_data)
    num_batches = data_len // batch_size

    while True:
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            x_batch = input_data[start:end]
            y_batch = labels[start:end]
            y_batch = tf.keras.utils.to_categorical(y_batch, num_classes=total_words)

            # Create a mask for padding
            mask = (y_batch != 0).any(axis=-1).astype(float)

            # Append the mask to y_batch as the last element
            y_batch = np.concatenate((y_batch, np.expand_dims(mask, -1)), axis=-1)

            yield (x_batch, y_batch)

from sklearn.model_selection import train_test_split

input_data_train, input_data_val, labels_train, labels_val = train_test_split(
    input_data, labels, test_size=0.1, random_state=42
)


# Create a generator with a batch size
batch_size = 128
train_generator = data_generator(input_data_train, labels_train, batch_size, total_words)
val_generator = data_generator(input_data_val, labels_val, batch_size, total_words)

# Compute the number of steps per epoch
steps_per_epoch = len(input_data) // batch_size
val_steps_per_epoch = len(input_data_val) // batch_size

# Train the model using the generator
#history = model.fit(train_generator, epochs=2, steps_per_epoch=steps_per_epoch, verbose=1)

history = model.fit(
    train_generator,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
    validation_data=val_generator,
    validation_steps=val_steps_per_epoch,
)





Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 22)]              0         
                                                                 
 embedding (Embedding)       (None, 22, 512)           10276352  
                                                                 
 transformer_block_2 (Transf  (None, 22, 512)          1577984   
 ormerBlock)                                                     
                                                                 
 dropout_6 (Dropout)         (None, 22, 512)           0         
                                                                 
 dense_18 (Dense)            (None, 22, 20071)         10296423  
                                                                 
Total params: 22,150,759
Trainable params: 22,150,759
Non-trainable params: 0
_______________________________________________

In [13]:
# save the model
tf.saved_model.save(model, 'saved_model_5')



INFO:tensorflow:Assets written to: saved_model_5/assets


INFO:tensorflow:Assets written to: saved_model_5/assets


In [14]:
# create a checkpoint object and save the model
checkpoint = tf.train.Checkpoint(my_model=model)
checkpoint.save('v5_model.ckpt')

'v5_model.ckpt-1'

In [17]:
def generate_lyrics(seed_text, next_words, model, max_sequence_len, temperature=0.1):
    output_text = seed_text
    
    for i in range(next_words):
        # Split the output text into words
        words = output_text.split()
        
        # Use the last 5 words as the seed text
        if len(words) >= 5:
            seed_text = " ".join(words[-5:])
        else:
            seed_text = " ".join(words)
        
        # Tokenize and pad the input sequence
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Generate predictions
        predicted_probs = model.predict(token_list)[0][-1]

        # Apply temperature to the predicted probabilities
        exp_preds = np.exp(np.log(predicted_probs) / temperature)
        predicted_probs = exp_preds / np.sum(exp_preds)

        # Normalize the probabilities using softmax
        predicted_probs = K.softmax(predicted_probs).numpy()

        # Select a word index based on the probability distribution
        #predicted = np.random.choice(range(total_words), p=predicted_probs)
        predicted = np.argmax(predicted_probs)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        # Append the generated word to the output text
        output_text += " " + output_word
        
        # Add a line break after every 6th word
        if (i+1) % 6 == 0:
            output_text += "\n"

    return output_text


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 20077
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

loaded_model = tf.saved_model.load('saved_model_4')

In [7]:

from tensorflow.python.saved_model import tag_constants

# Load the saved model
loaded_model = tf.saved_model.load('my_saved_model', tags=[tag_constants.SERVING])

# Get the input and output signatures
input_signature = list(loaded_model.signatures.keys())[0]
output_signature = loaded_model.signatures[input_signature].output_shapes

# Print the signatures
print(f"Input signature: {input_signature}")
print(f"Output signature: {output_signature}")



Input signature: serving_default
Output signature: {'dense_12': TensorShape([None, 126, 19038])}


In [8]:
import numpy as np
from tensorflow.keras import backend as K

def generate_lyrics(seed_text, next_words, model, max_sequence_len, temperature=5):
    output_text = seed_text
    
    for i in range(next_words):
        # Split the output text into words
        words = output_text.split()
        
        # Use the last 5 words as the seed text
        if len(words) >= 5:
            seed_text = " ".join(words[-5:])
        else:
            seed_text = " ".join(words)
        
        # Tokenize and pad the input sequence
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='post')
        token_list = tf.cast(token_list, dtype=tf.float32)
        
        #print("Token List:",token_list)

        # Generate predictions
        predicted_probs = model(token_list)[0][-1]
        
        #print("First predictions:", predicted_probs)
    
        # Apply temperature to the predicted probabilities
        exp_preds = np.exp(np.log(predicted_probs) / temperature)
        predicted_probs = exp_preds / np.sum(exp_preds)
        
        #print("Probabilities with temperature applied:", predicted_probs)
        
        # Normalize the probabilities using softmax
        predicted_probs = K.softmax(predicted_probs).numpy()
        
        #print("Normalized Predictions:", predicted_probs)
        
        # Select a word index based on the probability distribution
        predicted = np.random.choice(range(total_words), p=predicted_probs)
        
        

        
        #print("Final Prediction:", predicted)
        
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        # Append the generated word to the output text
        output_text += " " + output_word
        
        # Add a line break after every 6th word
        if (i+1) % 6 == 0:
            output_text += "\n"

    return output_text





In [10]:
file_path = "cleaned_text.txt"
# Load and preprocess your dataset
def load_and_preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().splitlines()
    return text
text_data = load_and_preprocess_data(file_path)

# Tokenize your dataset
vocab_size = 19038
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

seed_text = "de dimineata"
next_words = 5
max_sequence_len = 127
new_lyrics = generate_lyrics(seed_text, next_words, loaded_model, max_sequence_len)
print(new_lyrics)



ValueError: 'a' and 'p' must have same size

In [33]:
from tensorflow.keras import backend as K
def generate_lyrics(seed_text, next_words, model, max_sequence_len, temperature=0.5):
    output_text = seed_text
    
    for _ in range(next_words):
        # Tokenize and pad the input sequence
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Generate predictions
        predicted_probs = model.predict(token_list)[0][-1]
        #print(predicted_probs.shape)
        #print(predicted_probs)


        # Apply temperature to the predicted probabilities
        exp_preds = np.exp(np.log(predicted_probs) / temperature)
        predicted_probs = exp_preds / np.sum(exp_preds)

        #print(predicted_probs)

        # Normalize the probabilities using softmax
        predicted_probs = K.softmax(predicted_probs).numpy()

        # Select a word index based on the probability distribution
        predicted = np.random.choice(range(total_words), p=predicted_probs)
       # print(range(vocab_size))
        #print(predicted)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        # Append the generated word to the seed text
        seed_text += " " + output_word
        output_text += " " + output_word

    return output_text


seed_text = "de dimineata as merge sa fa ceva"
next_words = 15
#vocab_size = 19038
temperature = 10
new_lyrics = generate_lyrics(seed_text, next_words, model, max_sequence_len, temperature)
print(new_lyrics)



de dimineata as merge sa fa ceva mântuitor chinul tipatul stranse nespus botosani focurile sii olace singur pitic ciudatele argintul stupul roma


In [None]:
tokenizer.word_index.items()

In [29]:
print(total_words)

20071


Continue training from checkpoint

In [30]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load and preprocess your dataset
def load_and_preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().splitlines()
    return text

file_path = "cleaned_text.txt"
text_data = load_and_preprocess_data(file_path)

# Tokenize your dataset
vocab_size = 20071
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create input sequences and labels
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='post')

input_data = input_sequences[:, :-1]
labels = input_sequences[:, 1:]
#labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Build the model using the custom TransformerBlock
d_model = 256
num_heads = 16
dff = 512
rate = 0.1

inputs = tf.keras.layers.Input(shape=(max_sequence_len - 1,))
embedding = tf.keras.layers.Embedding(total_words, d_model)(inputs)
transformer_block = TransformerBlock(d_model, num_heads, dff, rate)(embedding)
dropout = tf.keras.layers.Dropout(rate)(transformer_block)
outputs = tf.keras.layers.Dense(total_words, activation='softmax')(dropout)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

# Custom loss function
def masked_categorical_crossentropy(y_true, y_pred):
    mask = tf.cast(y_true[:, :, -1], tf.bool)
    y_true = y_true[:, :, :-1]

    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=False)
    mask = tf.cast(mask, loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

# Compile and train the model
model.compile(loss=masked_categorical_crossentropy, optimizer='adam', metrics=['accuracy'])





def data_generator(input_data, labels, batch_size, total_words):
    data_len = len(input_data)
    num_batches = data_len // batch_size

    while True:
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            x_batch = input_data[start:end]
            y_batch = labels[start:end]
            y_batch = tf.keras.utils.to_categorical(y_batch, num_classes=total_words)

            # Create a mask for padding
            mask = (y_batch != 0).any(axis=-1).astype(float)

            # Append the mask to y_batch as the last element
            y_batch = np.concatenate((y_batch, np.expand_dims(mask, -1)), axis=-1)

            yield (x_batch, y_batch)

from sklearn.model_selection import train_test_split

input_data_train, input_data_val, labels_train, labels_val = train_test_split(
    input_data, labels, test_size=0.1, random_state=42
)


# Create a generator with a batch size
batch_size = 128
train_generator = data_generator(input_data_train, labels_train, batch_size, total_words)
val_generator = data_generator(input_data_val, labels_val, batch_size, total_words)

# Compute the number of steps per epoch
steps_per_epoch = len(input_data) // batch_size
val_steps_per_epoch = len(input_data_val) // batch_size






Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 22)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 22, 256)           5138176   
                                                                 
 transformer_block_8 (Transf  (None, 22, 256)          527104    
 ormerBlock)                                                     
                                                                 
 dropout_21 (Dropout)        (None, 22, 256)           0         
                                                                 
 dense_57 (Dense)            (None, 22, 20071)         5158247   
                                                                 
Total params: 10,823,527
Trainable params: 10,823,527
Non-trainable params: 0
_______________________________________________

In [34]:
# Create a checkpoint object and optimizer
checkpoint = tf.train.Checkpoint(my_model=model, optimizer=tf.keras.optimizers.Adam())

# Restore the checkpoint using the prefix
checkpoint_prefix = 'v5_model.ckpt-1'
checkpoint.restore(checkpoint_prefix)

# Train the model for another 5 epochs
history2 = model.fit(
    train_generator,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
    validation_data=val_generator,
    validation_steps=val_steps_per_epoch,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# save the model
tf.saved_model.save(model, 'saved_model_6')



INFO:tensorflow:Assets written to: saved_model_6/assets


INFO:tensorflow:Assets written to: saved_model_6/assets


In [36]:
# create a checkpoint object and save the model
checkpoint = tf.train.Checkpoint(my_model=model)
checkpoint.save('v4_model.ckpt')

'v4_model.ckpt-1'

In [45]:
from tensorflow.keras import backend as K
def generate_lyrics(seed_text, next_words, model, max_sequence_len, temperature=0.5):
    output_text = seed_text
    
    for _ in range(next_words):
        # Tokenize and pad the input sequence
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Generate predictions
        predicted_probs = model.predict(token_list)[0][-1]
        #print(predicted_probs.shape)
        #print(predicted_probs)


        # Apply temperature to the predicted probabilities
        exp_preds = np.exp(np.log(predicted_probs) / temperature)
        predicted_probs = exp_preds / np.sum(exp_preds)

        #print(predicted_probs)

        # Normalize the probabilities using softmax
        predicted_probs = K.softmax(predicted_probs).numpy()

        # Select a word index based on the probability distribution
        predicted = np.random.choice(range(total_words), p=predicted_probs)
       # print(range(vocab_size))
        #print(predicted)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        # Append the generated word to the seed text
        seed_text += " " + output_word
        output_text += " " + output_word

    return output_text


seed_text = "tu cand te uitai la mine "
next_words = 10
#vocab_size = 19038
temperature = 1.2
new_lyrics = generate_lyrics(seed_text, next_words, model, max_sequence_len, temperature)
print(new_lyrics)


tu cand te uitai la mine  şchiopete catafalc distind minunato cameii crepusculare pact s'ajungi sunatzi răpit


In [58]:
# Create a checkpoint object and optimizer
checkpoint = tf.train.Checkpoint(my_model=model, optimizer=tf.keras.optimizers.Adam())

# Restore the checkpoint using the prefix
checkpoint_prefix = 'v3_model.ckpt-1'
checkpoint.restore(checkpoint_prefix)

# Train the model for another 5 epochs
history2 = model.fit(
    train_generator,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
    validation_data=val_generator,
    validation_steps=val_steps_per_epoch,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [60]:
# save the model
tf.saved_model.save(model, 'saved_model_4')



INFO:tensorflow:Assets written to: saved_model_4/assets


INFO:tensorflow:Assets written to: saved_model_4/assets
