In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

# Fix the global random seed once, before any layer is created, so that weight
# initialisation, shuffling and dropout are repeatable from run to run
# (per the Keras docs this seeds Python's `random`, NumPy and the backend).
keras.utils.set_random_seed(42)

In [2]:
# Mount Google Drive so the CSV splits stored there are visible to this Colab runtime.
drive.mount('/content/drive')


# Read the three data splits from Google Drive.
# Note:
# The three .csv files must be present in the Drive folder named by DATA_DIR;
# change DATA_DIR if your copy lives somewhere else.
DATA_DIR = '/content/drive/MyDrive/MIT/Python/NLP'

# index_col=0 uses the first CSV column as the row index; astype(str) forces every
# remaining column to plain strings (a missing value becomes the literal text 'nan').
train_df = pd.read_csv(f'{DATA_DIR}/lyric_genre_train.csv', index_col=0).astype(str)
test_df = pd.read_csv(f'{DATA_DIR}/lyric_genre_test.csv', index_col=0).astype(str)
val_df = pd.read_csv(f'{DATA_DIR}/lyric_genre_val.csv', index_col=0).astype(str)

print(f"""
Train samples: {train_df.shape[0]}
Validation samples: {val_df.shape[0]}
Test samples: {test_df.shape[0]}
""")

Mounted at /content/drive

Train samples: 48991
Validation samples: 16331
Test samples: 21774



In [3]:
# Peek at the first rows: each record is one song, with its lyric text and genre label.
train_df.head()

Unnamed: 0,Lyric,Genre
0,"Oh, girl. I can't get ready (Can't get ready f...",Pop
1,We met on a rainy evening in the summertime. D...,Pop
2,We carried you in our arms. On Independence Da...,Rock
3,I know he loved you. A long time ago. I ain't ...,Pop
4,Paralysis through analysis. Yellow moral uncle...,Rock


In [4]:
# Class balance of the training split: fraction of songs carrying each genre label.

train_df['Genre'].value_counts() / len(train_df)

Genre
Rock       0.549448
Pop        0.295136
Hip Hop    0.155416
Name: count, dtype: float64

In [5]:
# Let's turn the target into a dummy (one-hot) vector.
#
# All three splits are encoded against ONE shared, sorted list of genres taken from
# the training split. Encoding each split independently would silently produce
# fewer / shifted columns whenever a split happens to miss a genre, and the label
# columns would then no longer line up with the model's output units.
genre_labels = sorted(train_df['Genre'].unique())
genre_dtype = pd.CategoricalDtype(categories=genre_labels)

# Column order follows genre_labels (alphabetical), which is the same order
# pd.get_dummies produces on its own when every genre is present.
y_train = pd.get_dummies(train_df['Genre'].astype(genre_dtype)).to_numpy()
y_val = pd.get_dummies(val_df['Genre'].astype(genre_dtype)).to_numpy()
y_test = pd.get_dummies(test_df['Genre'].astype(genre_dtype)).to_numpy()

In [6]:
# Bag-of-words vectorizer: every lyric becomes one multi-hot vector whose length is
# the vocabulary size, with a 1 wherever the corresponding token occurs in the text.

max_tokens = 5000  # cap on the vocabulary size
text_vectorization = keras.layers.TextVectorization(
    output_mode="multi_hot",
    max_tokens=max_tokens,
)

In [7]:
# The vocabulary that will be indexed is given by the text corpus on our train dataset.
# Only the training lyrics are used, so no validation/test text shapes the vocabulary.
text_vectorization.adapt(train_df['Lyric'])

In [8]:
# Let's look at the 20 least common words in our vectorization.
# The layer's vocabulary is ordered from most to least frequent, so the tail of the
# list holds the rarest tokens that still made it under the max_tokens cap.

text_vectorization.get_vocabulary()[-20:]

['eden',
 'dagger',
 'curve',
 'cheddar',
 'brew',
 'appears',
 'vacant',
 'universal',
 'unholy',
 'terrified',
 'stickin',
 'rumble',
 'rug',
 'pam',
 'os',
 'ooohh',
 'motto',
 'marshall',
 'loyalty',
 'legacy']

In [9]:
# Run every split through the adapted vectorizer to obtain the model inputs.
# NOTE: these multi-hot arrays are replaced by integer sequences in a later cell,
# before any model is trained on them.

X_train, X_val, X_test = (
    text_vectorization(split['Lyric'])
    for split in (train_df, val_df, test_df)
)

In [12]:
#downloading the GloVe word embedding that contains 100-dimensional embedding vectors of 400,000 words. GloVe was precomputed on the 2014 English Wikipedia dataset
#
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-07-07 14:42:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-07-07 14:42:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-07-07 14:42:42--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [13]:
embedding_dim = 100
path_to_glove_file = f"glove.6B.{embedding_dim}d.txt"

# Build a {token: vector} lookup from the GloVe text file. Each line has the form
# "<token> <v1> <v2> ... <vN>", so the token is split off once and the remainder
# is parsed as a row of float32 numbers.
embeddings_index = {}
# The file contains non-ASCII tokens: decode it explicitly as UTF-8 instead of
# depending on whatever the platform's default encoding happens to be.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [14]:
# Re-create the vectorizer for the embedding model: instead of one multi-hot vector,
# each lyric now becomes a fixed-length sequence of integer token ids that an
# Embedding layer can look up.

max_tokens = 5000  # vocabulary size cap (same value as for the multi-hot vectorizer)
max_length = 300  # 90% of songs have less than 300 words

text_vectorization = keras.layers.TextVectorization(
    output_mode="int",
    output_sequence_length=max_length,
    max_tokens=max_tokens,
)

In [15]:
# Learn the token vocabulary for the integer-sequence vectorizer from the training lyrics only.
text_vectorization.adapt(train_df['Lyric'])

In [16]:
# Convert each split's lyrics into integer token-id sequences; these replace the
# earlier multi-hot X_train / X_val / X_test arrays.
X_train, X_val, X_test = (
    text_vectorization(split['Lyric'])
    for split in (train_df, val_df, test_df)
)

In [17]:
# Since we want to use the downloaded GloVe embeddings, we first build a matrix of
# shape (max_tokens, embedding_dim) that will initialise the Embedding layer.
# Row i holds the GloVe vector of the vocabulary token with index i.

vocabulary = text_vectorization.get_vocabulary()
word_index = {word: index for index, word in enumerate(vocabulary)}

counter = 0  # number of vocabulary tokens that have no GloVe vector
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    # Ignore any index that does not fit in the matrix. The lookup and the write
    # both live under this guard, so an out-of-range token can neither reuse the
    # previous word's vector nor index past the end of the matrix.
    if i >= max_tokens:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Tokens missing from GloVe keep their all-zero row.
        counter += 1
    else:
        embedding_matrix[i] = embedding_vector

In [19]:
# Embedding layer whose weights start out as the GloVe matrix built above.
# trainable=True (the current setting) lets training fine-tune the pretrained vectors
# on our lyrics. Switch to trainable=False to freeze the layer and keep the GloVe
# embeddings untouched, then compare the resulting accuracy.
embedding_layer = keras.layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

In [20]:
# Build a neural network with an embedding layer right after its input layer.
# The embedding layer (initialised from GloVe) converts each token id of a lyric into
# a dense vector, so every text is first represented as a matrix; that matrix is then
# averaged over the sequence axis and classified by a small dense head.

inputs = keras.Input(shape=(max_length,))
embedded = embedding_layer(inputs) # one embedding_dim-sized vector per position: 300 x 100
# NOTE(review): the Embedding layer is created without mask_zero, so padded positions
# are included in this average — confirm that is intended.
embedded = keras.layers.GlobalAveragePooling1D()(embedded) # mean over the 300 positions -> 100-element vector
# NOTE(review): Dense(8) has no activation, so it is a purely linear projection —
# confirm whether a non-linearity (e.g. relu) was intended here.
x = keras.layers.Dense(8)(embedded)
x = keras.layers.Dropout(0.5)(x)
# One softmax output unit per genre.
outputs = keras.layers.Dense(3, activation="softmax")(x)

model = keras.Model(inputs, outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 300, 100)          500000    
                                                                 
 global_average_pooling1d (  (None, 100)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 8)                 808       
                                                                 
 dropout (Dropout)           (None, 8)                 0         
                                                                 
 dense_1 (Dense)             (None, 3)                 27        
                                                             

In [21]:
# categorical_crossentropy matches the one-hot encoded targets (y_train / y_val / y_test);
# accuracy is tracked as the reported metric.
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])

In [22]:
# Fit model: 10 passes over the training split in mini-batches of 32, with loss and
# accuracy on the validation split reported after every epoch.
# NOTE(review): the returned History object is not stored, so the per-epoch curves
# cannot be plotted afterwards — assign it to a variable if they are needed.
model.fit(x=X_train, y=y_train,
          validation_data=(X_val, y_val),
          epochs=10,
          batch_size=32,)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ea88f637220>

In [23]:
# Score the trained model on the held-out test split; the result is [loss, accuracy],
# following the loss and metrics passed to model.compile.
model.evaluate(x=X_test,y=y_test)



[0.6407549381256104, 0.7353265285491943]

In [24]:
def lyric_predict(phrase):
    """Print the model's genre probabilities for a single lyric.

    The lyric is wrapped into a one-document batch, echoed to stdout, run through
    the notebook-level ``text_vectorization`` layer and ``model``, and the three
    class probabilities are printed as percentages.

    Args:
        phrase: The lyric text to classify.

    Returns:
        None. The results are only printed.
    """
    # A batch holding exactly one document: shape (1, 1).
    batch = tf.convert_to_tensor([[phrase]])
    print(batch)

    probabilities = model.predict(text_vectorization(batch))

    # Label order is assumed to follow the columns of the one-hot targets
    # (alphabetical genre order) — verify if the target encoding changes.
    for column, genre in enumerate(("Hip-Hop", "Pop", "Rock")):
        print(f"{float(probabilities[0, column] * 100):.2f} % {genre}")

In [26]:
# Sanity check on a lyric typed in by hand: the text is passed as-is (line breaks
# included) to lyric_predict, which prints the predicted genre percentages.
phrase = '''Roots, bloody roots
Roots, bloody roots
Roots, bloody roots
Roots, bloody roots

I believe in our fate, we don't need to fake
It's all we wanna be, watch me freak!

I say we're growing every day, getting stronger in every way
I'll take you to a place where we shall find our...

Roots, bloody roots
Roots, bloody roots
Roots, bloody roots
Roots, bloody roots

Rain, bring me the strength to get to another day
And all I want to see, set us free

Why can't you see? Can't you feel?
This is real!

I pray, we don't need to change our ways to be saved
That's all we wanna be, watch us freak!'''

lyric_predict(phrase)

tf.Tensor([[b"Roots, bloody roots\nRoots, bloody roots\nRoots, bloody roots\nRoots, bloody roots\n\nI believe in our fate, we don't need to fake\nIt's all we wanna be, watch me freak!\n\nI say we're growing every day, getting stronger in every way\nI'll take you to a place where we shall find our...\n\nRoots, bloody roots\nRoots, bloody roots\nRoots, bloody roots\nRoots, bloody roots\n\nRain, bring me the strength to get to another day\nAnd all I want to see, set us free\n\nWhy can't you see? Can't you feel?\nThis is real!\n\nI pray, we don't need to change our ways to be saved\nThat's all we wanna be, watch us freak!"]], shape=(1, 1), dtype=string)
5.67 % Hip-Hop
9.73 % Pop
84.60 % Rock
