### Import TensorFlow and other libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

path_to_data = "/content/drive/MyDrive/DATA228/code/data/TextData"

Mounted at /content/drive


In [2]:
import tensorflow as tf

import numpy as np
import os
import time

In [3]:
# Access and read the file specifying the encoding explicitly
singer = str(input('Type in user name')) # getting singer name from user
file_name = singer+".txt"
path = path_to_data + "/" + file_name

try:
    with open(path, 'r', encoding='utf-8') as file:
        content = file.read()
        print(content)

except FileNotFoundError:
    print(f"The file {path} was not found.")

except Exception as e:
    print(f"An error occurred: {str(e)}")


Type in user namecoldplay
['just because im losing doesnt mean im lost doesnt mean ill stop doesnt mean im across just because im hurting doesnt mean im hurt doesnt mean i didnt get what i deserved no better and no worse   i just got lost every river that i tried to cross every door i ever tried was locked oh and im just waiting til the shine wears off   you might be a big fish in a little pond doesnt mean youve won cause along may come a bigger one  and youll be lost every river that you tried to cross every gun you ever held went off oh and im just waiting til the firing stopped oh and im just waiting til the shine wears off geah just waiting til the yeah uhhuh i gotchu uh yeah   with the same sword they knight you they gon goodnight you with shit thats only half if they like you that aint even the half what they might do dont believe me ask michael see martin see malcolm see biggie see pac see success and its outcome see jesus see judas see caesar see brutus see success is like suic

#### Read the data
#### First, look in the text

In [4]:
# Read and decode for py2 compat.
text = open(path, 'rb').read().decode(encoding='utf-8')

# Number of characters in the text
print(f'Length of text: {len(text)} characters')

Length of text: 316436 characters


In [5]:
# Printing out the first 250 characthers present in the lyrical text
print(text[:250])

['just because im losing doesnt mean im lost doesnt mean ill stop doesnt mean im across just because im hurting doesnt mean im hurt doesnt mean i didnt get what i deserved no better and no worse   i just got lost every river that i tried to cross eve


In [6]:
# counting the unique chars
uniq_vocab = sorted(set(text))
print(f'{len(uniq_vocab)} unique characters')

42 unique characters


In [7]:
import string

def find_special_characters(path):
    try:
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
            special_characters = set(content) - set(string.printable)
            if special_characters:
                print("Special characters found:")
                for char in special_characters:
                    print(f"{char} (Unicode: {ord(char)})")
            else:
                print("No special characters found.")
    except FileNotFoundError:
        print(f"The file {path} was not found.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

find_special_characters(path)

No special characters found.


Prepare the text for training by transforming it into a numerical format. To achieve this, utilize the tf.keras.layers.StringLookup layer, which is capable of assigning a numeric ID to each character. Prior to applying this layer, it is essential to break down the text into individual tokens.

Explanation:
Before initiating the training process, it is crucial to convert the textual data into a format suitable for numerical analysis. This transformation is facilitated by the tf.keras.layers.StringLookup layer, which can assign a unique numeric identifier to each character in the text. However, it is important to note that before applying this layer, the text needs to be segmented into tokens. Tokenization involves breaking down the text into smaller units, such as individual words or characters, to facilitate more effective numerical representation. Once tokenized, the StringLookup layer can be employed to convert these tokens into numeric IDs, enabling the subsequent training of the model on a numerical dataset.

This layer is frequently employed in natural language processing applications, particularly when there is a requirement to transform text data into a format compatible with neural networks. It achieves this conversion by representing the textual information using integers instead of raw text.

In [8]:
char_id = tf.keras.layers.StringLookup(
    vocabulary=list(uniq_vocab), mask_token=None)

In [9]:
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=char_id.get_vocabulary(), invert=True, mask_token=None)

In [10]:
# NOW ORIGINAL TEXT
all_id = char_id(tf.strings.unicode_split(text, 'UTF-8'))
all_id

<tf.Tensor: shape=(316436,), dtype=int64, numpy=array([14,  3, 26, ..., 36,  3, 16])>

In [11]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_id)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

[
'
j
u
s
t
 
b
e
c


In [12]:
#  It converts each batch of ID sequences into characters and prints the result for the first batch.
# The use of batches is common in machine learning, where models are often trained on sequences of data rather than individual data points.
seq_length = 50
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'[' b"'" b'j' b'u' b's' b't' b' ' b'b' b'e' b'c' b'a' b'u' b's' b'e'
 b' ' b'i' b'm' b' ' b'l' b'o' b's' b'i' b'n' b'g' b' ' b'd' b'o' b'e'
 b's' b'n' b't' b' ' b'm' b'e' b'a' b'n' b' ' b'i' b'm' b' ' b'l' b'o'
 b's' b't' b' ' b'd' b'o' b'e' b's' b'n' b't'], shape=(51,), dtype=string)


In [13]:
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b"['just because im losing doesnt mean im lost doesnt"
b' mean ill stop doesnt mean im across just because i'
b'm hurting doesnt mean im hurt doesnt mean i didnt g'
b'et what i deserved no better and no worse   i just '
b'got lost every river that i tried to cross every do'


In [14]:
# This is a common pattern when working with sequential data, like in training a model
# to predict the next element in a sequence.
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [15]:
dataset = sequences.map(split_input_target)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b"['just because im losing doesnt mean im lost doesn"
Target: b"'just because im losing doesnt mean im lost doesnt"



Generate training batches to optimize performance by shuffling, batching, and prefetching the dataset for training.

In [16]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 50), dtype=tf.int64, name=None), TensorSpec(shape=(64, 50), dtype=tf.int64, name=None))>

In [17]:
# Length of the vocabulary in StringLookup Layer
size_of_vocabulary = len(char_id.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

This model comprises three layers:

1. **tf.keras.layers.Embedding:** Serving as the input layer, it involves a trainable lookup table. This table maps each character-ID to a vector with embedding_dim dimensions.

2. **tf.keras.layers.GRU:** This layer represents a type of Recurrent Neural Network (RNN) with a size of units=rnn_units. Alternatively, an LSTM layer can be employed in this position.

3. **tf.keras.layers.Dense:** Functioning as the output layer, it consists of vocab_size outputs. Each output corresponds to one logit for a character in the vocabulary, representing the log-likelihood of each character as predicted by the model.

This architectural design is frequently applied in natural language processing endeavors such as text generation or language modeling. The process involves employing an embedding layer to transform input words into vectors, utilizing a GRU layer to capture sequential patterns, and incorporating a dense layer to generate the ultimate output by producing logits that predict the log-likelihood of the succeeding character.

In [18]:
class MyModel(tf.keras.Model):
    def __init__(self, size_of_vocabulary, embedding_dim, rnn_units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(size_of_vocabulary, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
        self.dense = tf.keras.layers.Dense(size_of_vocabulary)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [19]:
model = MyModel(
    size_of_vocabulary=size_of_vocabulary,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [20]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, size_of_vocabulary)")

(64, 50, 43) # (batch_size, sequence_length, size_of_vocabulary)


In [21]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  11008     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  44075     
                                                                 
Total params: 3993387 (15.23 MB)
Trainable params: 3993387 (15.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# sampling random indices for each prediction in the first sequence of your batch and
# then converting the result to a NumPy array for easier manipulation.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [23]:
sampled_indices

array([ 6, 13,  2, 15, 24,  7, 20, 22, 42, 35, 23, 16, 40, 13, 37, 20,  1,
        1, 31, 13, 21, 35, 23, 17, 17, 22,  5, 16, 13,  7, 25,  2, 38, 39,
       10, 22, 20, 41, 34, 13, 14, 34, 11,  3, 27, 17, 30, 24, 21,  7])

In [24]:
# Decode these to see the text predicted by this untrained model:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'a shh oh oh de corazones whoa whoa whoa yeah   oka'

Next Char Predictions:
 b"29 \\h3dfzsg]x9ud\n\no9esgaaf1]93i vw6fdyr9[r7'kanhe3"


Train the model

The loss indicates how well the model's predictions match the actual labels, and minimizing this loss is the objective during the training of a machine learning model.

Adam is an optimization algorithm used to minimize the loss function during training. It adjusts the model's weights to improve its performance.

In [25]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [26]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, size_of_vocabulary)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 50, 43)  # (batch_size, sequence_length, size_of_vocabulary)
Mean loss:         tf.Tensor(3.7601483, shape=(), dtype=float32)


In [27]:
tf.exp(example_batch_mean_loss).numpy()

42.954796

In [28]:
model.compile(optimizer='adam', loss=loss)

This piece of code compiles a neural network model, establishes a directory for storing checkpoints, and sets up a callback mechanism to save the model's weights after each training epoch. The utilization of checkpoints is valuable for various purposes, such as resuming training from a specific point, assessing the model's performance at different stages, or deploying the model for inference tasks. Checkpoints essentially serve as snapshots of the model's parameters, providing flexibility and functionality for training, evaluation, and deployment scenarios.

In [29]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [30]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [31]:
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, char_id, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.char_id = char_id

    # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.char_id(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
        # Match the shape to the vocabulary
            dense_shape=[len(char_id.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.char_id(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
        return predicted_chars, states

In [32]:
one_step_model = OneStep(model, chars_from_ids, char_id)

In [33]:
start = time.time()
states = None
next_char = tf.constant([singer])
result = [next_char]

spaces = 0

for n in range(500):
  if spaces == 8:
    spaces = 0
    result.append("\n")
  else:
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    if next_char == ' ':
      spaces += 1
    result.append(next_char)


result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

coldplay coldplay 'lovigh the water that where i 
were abive with it starts there and not 
my lover shes just a girl who claix 
frmmmm hmmhmm hmmoh hmmhmm hmmhmm hmmhmm hmmhmm hmmoh 
hmmhmm hmmhmm hmmhmm hmmhmm hmmhmmhmmhmm'
 'i think i 
i think i love her  the reach 
inside we fire and yeaulife i toint love 
be with you always may gods love be 
with you      and 
ive got to get gull your shine  
 but i love this life'
 'just been 
wont be neme be counting up my demons 
yeah hoping everythings not lost  

________________________________________________________________________________

Run time: 2.1457390785217285


In [34]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from difflib import SequenceMatcher
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [35]:
result_array = result.numpy()[0]
result_string = result_array.decode('utf-8')
result_string.replace("\n", "")

# 1. BLEU Score (for evaluating novelty)
chencherry = SmoothingFunction()
bleu_score = sentence_bleu(content, result_string, smoothing_function=chencherry.method1)

print(f"BLEU Score: {float(bleu_score)}")

BLEU Score: 0.0008003179348548924


In [36]:
# 2. Sentiment Analysis
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
sentiment_score = sia.polarity_scores(result_string)['compound']
print(f"Sentiment Score: {sentiment_score}")

Sentiment Score: 0.9631


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [37]:
# 3. Rhyme and Rhythm (using SequenceMatcher)
rhyme_similarity = SequenceMatcher(None, result_string, content).ratio()
print(f"Rhyme and Rhythm Similarity: {format(float(rhyme_similarity), 'f')}")

Rhyme and Rhythm Similarity: 0.000322


In [38]:
# 4. TextBlob for Subjectivity
blob = TextBlob(result_string)
subjectivity_score = blob.sentiment.subjectivity
print(f"Subjectivity Score: {subjectivity_score}")

Subjectivity Score: 0.6


In [39]:
nltk.download('punkt')

# 5. Evaluation Against Existing Lyrics (using NLTK's word_tokenize)
reference_tokens = nltk.word_tokenize(content.lower())
generated_tokens = nltk.word_tokenize(result_string.lower())
overlap_score = len(set(reference_tokens) & set(generated_tokens)) / len(set(reference_tokens))
print(f"Overlap Score: {overlap_score}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Overlap Score: 0.014678409394182012
