In [1]:
!pip install librosa


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
# import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Dense, Concatenate

from sklearn.preprocessing import StandardScaler

### Image Preprocessing

In [18]:
import tensorflow as tf
from efficientnet_pytorch import EfficientNet
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.efficientnet import preprocess_input
import numpy as np
from tensorflow.keras.layers import GlobalAveragePooling2D


In [36]:
from transformers import BertModel, BertTokenizer


#### Album 

In [6]:
def preprocess_image(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    # Performs standardization and normalization
    img_array = preprocess_input(img_array)
    return img_array

X_image_album_cover = []
images_path = 'images'
for filename in os.listdir(images_path):  
    if filename.endswith('.jpg') or filename.endswith('.jpeg'):
        file_path = os.path.join(images_path, filename)
        X_image_album_cover.append(preprocess_image(file_path))
    
X_image_album_cover = np.array(X_image_album_cover)
print(X_image_album_cover.shape)


(10, 224, 224, 3)


#### Spec

In [7]:
X_image_spectogram = []
images_path = 'spec'
for filename in os.listdir(images_path):  
    if filename.endswith('.jpg') or filename.endswith('.jpeg'):
        file_path = os.path.join(images_path, filename)
        X_image_spectogram.append(preprocess_image(file_path))
    
X_image_spectogram = np.array(X_image_spectogram)
print(X_image_spectogram.shape)

(10, 224, 224, 3)


## Sample Text


In [42]:
lyrics_df = pd.read_csv('lyrics_cleaned.csv', nrows=10)
lyrics_df = np.array(lyrics_df.lyrics)
lyrics_df


array(["3 ContributorsJealous Kind of Fella Lyrics[Intro]\nJealous kind of fella… hey, hey, hey…\n\n[Verse 1]\nWhat a day, I think I'll call my baby today\n(Hello?) Hello baby, please don't be too mad at me\nBecause I punched that guy last night\nBut let me explain, before you say anything\nI know I was wrong, just like you said\nI'm let that jealousy go straight to my head\nAnd I apologize, so please don't cry\nI want you to know this is one sign that I love you\nLove you, I'm just a jealous kind of fella\n\n[Chorus]\nJealous kind of fella… hey, hey, hey…\n\n[Verse 2]\nYou see, here's another thing\nI know I embarrassed you, baby\nMm, and I'm so ashamed, yes I am, mama\nI realize ever since we've been together\nYou've given me more happiness than any man can claim\nYes you did, now\nIf you're still mad or upset\nThose things I can't accept\n'Cause it hurts me, don't desert me\nYou've got me out of my head\n'Cause I love you, love you\nI'm just a jealous kind of fella\nYou might also l

### Sound

In [14]:
folder_path = 'songs'
features = []

total_files = len([filename for filename in os.listdir(folder_path) if filename.endswith('.wav')])

with tqdm(total=total_files) as pbar:
    for filename in os.listdir(folder_path):
        if filename.endswith('.wav'):
            file_path = os.path.join(folder_path, filename)
            y, sr = librosa.load(file_path)

            # Mel-frequency Cepstral coefficients
            mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
            # Chroma features
            chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
            # Spectral centroid
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            # Spectral bandwith
            spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            # Spectral contrast
            spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
            # Spectral rolloff
            spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            # Tempo
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            # Zero crossing rate
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))

            features.append(np.hstack([mfccs, chroma, spectral_centroid, spectral_bandwidth, 
                                  spectral_contrast, spectral_rolloff, tempo, zero_crossing_rate]))
            
            pbar.update(1)
            
mfcc_headers = [f'mfcc_{i}' for i in range(13)]
chroma_headers = [f'chroma_{i}' for i in range(12)]

columns = []
columns.extend(mfcc_headers)
columns.extend(chroma_headers)
columns.extend(['spectral_centroid', 'spectral_bandwidth', 'spectral_contrast', 'spectral_rolloff',
              'tempo', 'zero_crossing_rate'])

feature_df = pd.DataFrame(features, columns=columns)

feature_df.to_csv('sound_features.csv', index=False)


100%|███████████████████████████████████████████| 10/10 [00:28<00:00,  2.80s/it]


In [8]:
sound_features_df = pd.read_csv('sound_features.csv')


In [9]:
display(sound_features_df)

Unnamed: 0,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,...,chroma_8,chroma_9,chroma_10,chroma_11,spectral_centroid,spectral_bandwidth,spectral_contrast,spectral_rolloff,tempo,zero_crossing_rate
0,-119.883514,78.195465,-5.079293,16.115835,8.802312,8.788719,5.017749,8.297569,2.037281,5.438093,...,0.426102,0.442033,0.362053,0.400779,2379.809563,2533.935542,22.005264,5261.087265,129.199219,0.091814
1,-165.491364,89.887207,9.622631,28.824188,9.862348,0.715233,7.598889,1.015978,5.371005,-0.741413,...,0.247188,0.379182,0.52939,0.309625,2328.919113,2609.601348,23.821849,5068.501473,99.384014,0.092295
2,-263.305817,86.074142,11.548472,23.479252,3.635492,5.045878,4.839182,0.582274,7.817739,0.376333,...,0.331482,0.491414,0.322831,0.37197,2561.512959,2673.636869,23.458048,5472.573941,117.453835,0.119242
3,-49.181473,69.625359,-11.549077,13.836099,3.922445,7.283914,5.275508,4.021071,3.152122,5.375659,...,0.389022,0.45206,0.409418,0.471878,2618.028685,2511.308062,20.885303,5376.408294,161.499023,0.126535
4,-135.907242,101.794563,25.62307,23.123831,8.992074,4.891822,9.471533,-0.671894,0.736529,10.573339,...,0.385096,0.272601,0.253039,0.363305,2107.592914,2431.577165,23.659678,4736.722383,103.359375,0.082603
5,-124.01442,103.057861,-1.579663,8.649143,-8.500657,-6.627197,-20.605783,-9.192368,-21.916517,-8.039808,...,0.300686,0.167178,0.274421,0.170304,1946.668092,2203.037515,26.128935,4069.92749,123.046875,0.081782
6,-137.621063,84.09211,-8.891517,28.446003,3.140309,9.002275,5.252365,12.670362,3.9796,8.366396,...,0.39625,0.481415,0.600927,0.42678,2362.140776,2440.904377,21.228489,4973.706413,89.102909,0.099254
7,-195.491211,130.852936,19.90468,22.960194,7.628928,0.067079,7.393571,-0.221847,-2.634048,1.128535,...,0.376539,0.340813,0.275992,0.403166,1446.082976,1916.769463,23.550778,2843.806541,92.285156,0.05202
8,-38.100956,70.222244,-2.88067,17.448526,1.117762,12.198992,-0.531046,6.038986,0.556414,6.931718,...,0.421182,0.36076,0.467224,0.392254,2729.14616,2631.424067,22.012557,5741.808757,107.666016,0.126865
9,-107.875107,83.763847,-4.584987,15.001758,-4.840187,-4.34569,-0.965797,-4.735468,-2.55129,-1.484672,...,0.215571,0.407293,0.398789,0.235425,2354.484402,2448.346537,24.313879,4916.218943,117.453835,0.101199


In [10]:
# Normalize sound features
scaler = StandardScaler()
X_sound = scaler.fit_transform(sound_features_df)

In [11]:
display(pd.DataFrame(X_sound))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.221374,-0.666057,-0.689739,-0.585998,0.937192,0.877774,0.332603,1.087511,0.304205,0.500752,...,1.115767,0.655013,-0.248954,0.532259,0.271923,0.43585,-0.73249,0.521112,0.752904,-0.2534
1,-0.510053,0.007526,0.533089,1.441714,1.120276,-0.51543,0.645565,-0.12757,0.729885,-0.668856,...,-1.470396,-0.003071,1.273919,-0.517209,0.128329,0.787133,0.47584,0.27929,-0.728401,-0.231444
2,-2.078734,-0.212152,0.69327,0.58889,0.044804,0.231889,0.310952,-0.199943,1.042305,-0.457297,...,-0.251938,1.172055,-0.605893,0.200572,0.784623,1.084421,0.233852,0.786667,0.169359,0.999624
3,1.355245,-1.159797,-1.227862,-0.949747,0.094365,0.618097,0.363856,0.373891,0.446558,0.488935,...,0.57978,0.760002,0.182102,1.350834,0.94409,0.330801,-1.47745,0.665916,2.357651,1.332814
4,-0.035603,0.693531,1.86392,0.53218,0.969967,0.205304,0.872622,-0.409226,0.138114,1.472711,...,0.523037,-1.119018,-1.24105,0.100819,-0.496171,-0.039355,0.36797,-0.137311,-0.530894,-0.674199
5,0.155126,0.766312,-0.398659,-1.777365,-2.051293,-1.782479,-2.774236,-1.831041,-2.754426,-2.050238,...,-0.697097,-2.222852,-1.046459,-2.121251,-0.950242,-1.100363,2.010435,-0.974577,0.447238,-0.7117
6,-0.063088,-0.32634,-1.00682,1.381372,-0.040722,0.914626,0.36105,1.817201,0.552218,1.054998,...,0.684269,1.06736,1.924956,0.831614,0.222069,0.003947,-1.249174,0.16026,-1.239196,0.086477
7,-0.99117,2.367638,1.388295,0.50607,0.734531,-0.627279,0.62067,-0.334127,-0.292271,-0.314927,...,0.399337,-0.404806,-1.032159,0.559743,-2.362708,-2.429378,0.295533,-2.514166,-1.081093,-2.071366
8,1.532947,-1.125409,-0.506869,-0.373357,-0.390046,1.466269,-0.340185,0.710621,0.115115,0.783454,...,1.044646,-0.195954,0.708172,0.434114,1.257622,0.888446,-0.727639,1.124734,-0.316928,1.347867
9,0.413957,-0.345252,-0.648625,-0.763758,-1.419074,-1.38877,-0.392898,-1.087316,-0.281704,-0.809534,...,-1.927406,0.29127,0.085366,-1.371495,0.200465,0.038498,0.803122,0.088075,0.169359,0.175327


In [12]:
num_columns_X_sound = X_sound.shape[1]

In [13]:
num_genres = 3

In [14]:
# Dummy output values
y_genre = np.array([0, 1, 0, 1, 2, 2, 0, 0, 1, 0])
y_popularity = np.array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0])
y_danceability = np.array([0.23, 0.9, 0.4, 0.67, 0.5, 0.5, 0.3, 0.2, 0.12, 0.4])
y_energy = np.array([0.35, 0.56, 0.45, 0.75, 0.22, 0.9, 0.75, 0.3, 0.43, 0.5])

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim

# TODO: Add text and image modality models
# SoundModalityModel creates the sound embeddings based on the SLP features
class SoundModalityModel(nn.Module):
    def __init__(self, num_columns_X_sound):
        super(SoundModalityModel, self).__init__()
        self.sound_embedding = nn.Sequential(
            nn.Linear(num_columns_X_sound, 64),
            nn.ReLU(),
            nn.Linear(64, 8),
            nn.ReLU()
        )

    def forward(self, input_sound):
        sound_embedding = self.sound_embedding(input_sound)
        return sound_embedding
    
    
class ImageModalityModel(nn.Module):
    def __init__(self, embedding_size=8):
        super(ImageModalityModel, self).__init__()
        self.efficientnet = EfficientNet.from_pretrained('efficientnet-b0')
        
        # Freeze all parameters of EfficientNet
        for param in self.efficientnet.parameters():
            param.requires_grad = False
        
        # Replace the final fully connected layer with a new one
        self.efficientnet._fc = nn.Linear(self.efficientnet._fc.in_features, embedding_size)

    def forward(self, input_image):
        # Forward pass through EfficientNet
        output = self.efficientnet(input_image)
        return output

# TODO: Fix BERT Model {?}

class TextModalityModel(nn.Module):
    def __init__(self, embedding_size=8, freeze_bert=True):
        super(TextModalityModel, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # Freeze the BERT weights if freeze_bert is True
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
        # Maps to size 8 embedding
        self.fc = nn.Linear(self.bert.config.hidden_size, embedding_size)

    def forward(self, input_text):
        # Tokenize input_text
        inputs = self.tokenizer(input_text.tolist(), return_tensors="pt", padding=True, truncation=True)
        
        # Forward pass through BERT
        outputs = self.bert(**inputs)
        
        # Extract the last hidden states
        last_hidden_states = outputs.last_hidden_state
        
        # Take the mean of the last hidden states (reduce it to 1 dimension)
        mean_last_hidden_state = torch.mean(last_hidden_states, dim=1)
        
        # Pass the mean last hidden states through the fully connected layer (maps to size 8 embedding)
        output = self.fc(mean_last_hidden_state)
        
        return output

    
# Overall model that combines the embeddings from the three modalities and produces 4 outputs
# Separate fully connected layers for genre, popularity, danceability, and energy
class OverallModel(nn.Module):
    def __init__(self, num_modalities, num_genres):
        super(OverallModel, self).__init__()
        self.concatenated_output_size = 8 * num_modalities
        self.fc1 = nn.Linear(self.concatenated_output_size, 64)
        self.fc2_genre = nn.Linear(64, num_genres)
        self.fc3_popularity = nn.Linear(64, 1)
        self.fc4_danceability = nn.Linear(64, 1)
        self.fc5_energy = nn.Linear(64, 1)

    def forward(self, sound_embedding, album_embedding, spec_embedding, text_embedding):
        # TODO: Concatenate with text and image embeddings
        concatenated_embedding = torch.cat((sound_embedding, album_embedding, spec_embedding, text_embedding), dim=1)
        x = torch.relu(self.fc1(concatenated_embedding))
        genre_output = torch.softmax(self.fc2_genre(x), dim=-1)
        popularity_output = torch.sigmoid(self.fc3_popularity(x))
        danceability_output = torch.sigmoid(self.fc4_danceability(x))
        energy_output = torch.sigmoid(self.fc5_energy(x))
        return genre_output, popularity_output, danceability_output, energy_output

num_modalities = 4 # TODO: Change to 3 once we have text and image modalities
num_genres = 3 

#TODO: Instantiate the remaining modality models
sound_model = SoundModalityModel(num_columns_X_sound)
album_model = ImageModalityModel()
spec_model = ImageModalityModel()
text_model = TextModalityModel()

overall_model = OverallModel(num_modalities, num_genres)

# TODO: Create optimizers for text and image modalities
# Create optimizers for each of our models (modality models as well as genre, popularity, danceability, energy)
optimizer_sound = optim.Adam(sound_model.parameters())
# optimizer_album = optim.Adam(album_model.parameters())
# optimizer_spec = optim.Adam(spec_model.parameters())

optimizer_genre = optim.Adam(overall_model.fc2_genre.parameters())
optimizer_popularity = optim.Adam(overall_model.fc3_popularity.parameters())
optimizer_danceability = optim.Adam(overall_model.fc4_danceability.parameters())
optimizer_energy = optim.Adam(overall_model.fc5_energy.parameters())

# Define the criteria
criterion_genre = nn.CrossEntropyLoss()
criterion_popularity = nn.MSELoss()
criterion_danceability = nn.MSELoss()
criterion_energy = nn.MSELoss()

# TODO: Form input data for text and image features
# Compile our datasets
X_sound = torch.tensor(X_sound, dtype=torch.float32).clone().detach()
X_album = torch.tensor(X_image_album_cover, dtype=torch.float32).clone().detach()
X_spec = torch.tensor(X_image_spectogram, dtype=torch.float32).clone().detach()

# lyrics_list = lyrics_df.tolist()
# print(lyrics_list)

# X_lyrics = torch.tensor(lyrics_list)


y_genre = torch.randint(0, num_genres, (10,))
y_popularity = torch.randn(10, 1) 
y_danceability = torch.randn(10, 1)
y_energy = torch.randn(10, 1)

# Define basic parameters before training
num_epochs = 10
batch_size = 4
num_samples = 10
num_batches = num_samples // batch_size

# Training loop
for epoch in range(num_epochs):
    for batch in range(num_batches):
        start = batch * batch_size
        end = min(start + batch_size, num_samples)
        
        # Get data for this batch
        batch_X_sound = X_sound[start:end]
        batch_X_album = X_album[start:end]
        batch_X_album = batch_X_album.permute(0, 3, 1, 2)
        batch_X_spec = X_spec[start:end]
        batch_X_spec = batch_X_spec.permute(0, 3, 1, 2)
        
        batch_X_lyric = lyrics_df[start:end]

        batch_y_genre = y_genre[start:end]
        batch_y_popularity = y_popularity[start:end]
        batch_y_danceability = y_danceability[start:end]
        batch_y_energy = y_energy[start:end]

        # Zero the gradients
        optimizer_sound.zero_grad()
#         optimizer_album.zero_grad()
#         optimizer_spec.zero_grad()


        optimizer_genre.zero_grad()
        optimizer_popularity.zero_grad()
        optimizer_danceability.zero_grad()
        optimizer_energy.zero_grad()
        
        # TODO: Create embeddings for text and image
        # Retrieve the four outputs given our embeddings
        sound_embedding = sound_model(batch_X_sound)
        album_embedding = album_model(batch_X_album)
        spec_embedding = spec_model(batch_X_spec)
        text_embedding = text_model(batch_X_lyric)

        
        genre_output, popularity_output, danceability_output, energy_output = overall_model(sound_embedding, 
                                                                                            album_embedding, 
                                                                                            spec_embedding, 
                                                                                           text_embedding)
        
        # Calculate individual losses
        loss_genre = criterion_genre(genre_output, batch_y_genre)
        loss_popularity = criterion_popularity(popularity_output, batch_y_popularity)
        loss_danceability = criterion_danceability(danceability_output, batch_y_danceability)
        loss_energy = criterion_energy(energy_output, batch_y_energy)
        
        # Backpropagate through the genre, popularity, danceability, and energy layers using their respective losses
        # We retain graph because we are using the computation graph for multiple backprops
        loss_genre.backward(retain_graph=True)
        optimizer_genre.step()

        loss_popularity.backward(retain_graph=True)
        optimizer_popularity.step()
        
        loss_danceability.backward(retain_graph=True)
        optimizer_danceability.step()
        
        loss_energy.backward(retain_graph=True)
        optimizer_energy.step()
        
        # TODO: Update parameters for the text and image models as well
        # Backpropagate through the sound modality model using the total loss
        optimizer_sound.step()
#         optimizer_album.step()
#         optimizer_spec.step()

    print("Epoch " + str(epoch+1))
    print("Genre loss: " + str(loss_genre.item()))
    print("Popularity loss: " + str(loss_popularity.item()))
    print("Danceability loss: " + str(loss_danceability.item()))
    print("Energy loss: " + str(loss_energy.item()))
    print("")

Loaded pretrained weights for efficientnet-b0
Loaded pretrained weights for efficientnet-b0


  X_sound = torch.tensor(X_sound, dtype=torch.float32).clone().detach()


Epoch 1
Genre loss: 1.0888416767120361
Popularity loss: 0.7466576099395752
Danceability loss: 1.9350122213363647
Energy loss: 1.639693260192871

Epoch 2
Genre loss: 1.08932363986969
Popularity loss: 0.7531051635742188
Danceability loss: 1.9367187023162842
Energy loss: 1.6261415481567383

Epoch 3
Genre loss: 1.088869571685791
Popularity loss: 0.7381604909896851
Danceability loss: 1.9214649200439453
Energy loss: 1.6283681392669678

Epoch 4
Genre loss: 1.0805368423461914
Popularity loss: 0.7420570850372314
Danceability loss: 1.9064456224441528
Energy loss: 1.6250003576278687

Epoch 5
Genre loss: 1.0839903354644775
Popularity loss: 0.7485537528991699
Danceability loss: 1.8995906114578247
Energy loss: 1.6358678340911865

Epoch 6
Genre loss: 1.0823864936828613
Popularity loss: 0.733913779258728
Danceability loss: 1.8849679231643677
Energy loss: 1.6148147583007812

Epoch 7
Genre loss: 1.078936219215393
Popularity loss: 0.748701274394989
Danceability loss: 1.8716822862625122
Energy loss: 1.618

In [48]:
# Create embeddings for text and image
sound_embedding_test = sound_model(X_sound)
X_album = X_album.permute(0, 3, 1, 2)
X_spec = X_spec.permute(0, 3, 1, 2)

album_embedding_test = album_model(X_album)
spec_embedding_test = spec_model(X_spec)
text_embedding_test = text_model(lyrics_df)

# Perform forward pass through overall model
genre_output_test, popularity_output_test, danceability_output_test, energy_output_test = overall_model(sound_embedding_test, 
                                                                                                       album_embedding_test, 
                                                                                                       spec_embedding_test, 
                                                                                                       text_embedding_test)

# Convert outputs to probabilities or appropriate format if needed
# For example, if genre_output_test is one-hot encoded, you might want to convert it to actual genre labels
genre_predictions = torch.argmax(genre_output_test, dim=1).tolist()
popularity_predictions = popularity_output_test.tolist()
danceability_predictions = danceability_output_test.tolist()
energy_predictions = energy_output_test.tolist()

# Print or use the predictions as needed
print("Genre predictions:", genre_predictions)
print("Popularity predictions:", popularity_predictions)
print("Danceability predictions:", danceability_predictions)
print("Energy predictions:", energy_predictions)

Genre predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Popularity predictions: [[0.49896806478500366], [0.5022054314613342], [0.49705970287323], [0.5221583843231201], [0.5058009624481201], [0.5406425595283508], [0.503486692905426], [0.510227620601654], [0.5116953253746033], [0.5268958210945129]]
Danceability predictions: [[0.4966847598552704], [0.4805973172187805], [0.49706459045410156], [0.48412084579467773], [0.49152249097824097], [0.4405095875263214], [0.4969162344932556], [0.4909771680831909], [0.5012988448143005], [0.47026383876800537]]
Energy predictions: [[0.49042433500289917], [0.48432356119155884], [0.49772149324417114], [0.4847020208835602], [0.4943593740463257], [0.4746120274066925], [0.47858351469039917], [0.48022615909576416], [0.4780304431915283], [0.4948614239692688]]
