In [1]:
!pip install librosa


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [40]:
import os
# import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Dense, Concatenate

from sklearn.preprocessing import StandardScaler

### Image Preprocessing

In [41]:
import tensorflow as tf
from efficientnet.tfkeras import EfficientNetB0
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.efficientnet import preprocess_input
import numpy as np
from tensorflow.keras.layers import GlobalAveragePooling2D


#### Album 

In [85]:
def preprocess_image(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    # Performs standardization and normalization
    img_array = preprocess_input(img_array)
    return img_array

X_image_album_cover = []
images_path = 'images'
for filename in os.listdir(images_path):  
    if filename.endswith('.jpg') or filename.endswith('.jpeg'):
        file_path = os.path.join(images_path, filename)
        X_image_album_cover.append(preprocess_image(file_path))
    
X_image_album_cover = np.array(X_image_album_cover)
print(X_image_album_cover.shape)


(10, 224, 224, 3)


#### Spec

In [86]:
X_image_spectogram = []
images_path = 'spec'
for filename in os.listdir(images_path):  
    if filename.endswith('.jpg') or filename.endswith('.jpeg'):
        file_path = os.path.join(images_path, filename)
        X_image_spectogram.append(preprocess_image(file_path))
    
X_image_spectogram = np.array(X_image_spectogram)
print(X_image_spectogram.shape)

(10, 224, 224, 3)


### Sound

In [3]:
folder_path = 'songs'
features = []

total_files = len([filename for filename in os.listdir(folder_path) if filename.endswith('.wav')])

with tqdm(total=total_files) as pbar:
    for filename in os.listdir(folder_path):
        if filename.endswith('.wav'):
            file_path = os.path.join(folder_path, filename)
            y, sr = librosa.load(file_path)

            # Mel-frequency Cepstral coefficients
            mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
            # Chroma features
            chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
            # Spectral centroid
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            # Spectral bandwith
            spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            # Spectral contrast
            spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
            # Spectral rolloff
            spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            # Tempo
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            # Zero crossing rate
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))

            features.append(np.hstack([mfccs, chroma, spectral_centroid, spectral_bandwidth, 
                                  spectral_contrast, spectral_rolloff, tempo, zero_crossing_rate]))
            
            pbar.update(1)
            
mfcc_headers = [f'mfcc_{i}' for i in range(13)]
chroma_headers = [f'chroma_{i}' for i in range(12)]

columns = []
columns.extend(mfcc_headers)
columns.extend(chroma_headers)
columns.extend(['spectral_centroid', 'spectral_bandwidth', 'spectral_contrast', 'spectral_rolloff',
              'tempo', 'zero_crossing_rate'])

feature_df = pd.DataFrame(features, columns=columns)

feature_df.to_csv('sound_features.csv', index=False)


100%|███████████████████████████████████████████| 10/10 [00:26<00:00,  2.70s/it]


In [16]:
sound_features_df = pd.read_csv('sound_features.csv')


In [17]:
display(sound_features_df)

Unnamed: 0,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,...,chroma_8,chroma_9,chroma_10,chroma_11,spectral_centroid,spectral_bandwidth,spectral_contrast,spectral_rolloff,tempo,zero_crossing_rate
0,-119.883514,78.195465,-5.079293,16.115835,8.802312,8.788719,5.017749,8.297569,2.037281,5.438093,...,0.426102,0.442033,0.362053,0.400779,2379.809563,2533.935542,22.005264,5261.087265,129.199219,0.091814
1,-165.491364,89.887207,9.622631,28.824188,9.862348,0.715233,7.598889,1.015978,5.371005,-0.741413,...,0.247188,0.379182,0.52939,0.309625,2328.919113,2609.601348,23.821849,5068.501473,99.384014,0.092295
2,-263.305817,86.074142,11.548472,23.479252,3.635492,5.045878,4.839182,0.582274,7.817739,0.376333,...,0.331482,0.491414,0.322831,0.37197,2561.512959,2673.636869,23.458048,5472.573941,117.453835,0.119242
3,-49.181473,69.625359,-11.549077,13.836099,3.922445,7.283914,5.275508,4.021071,3.152122,5.375659,...,0.389022,0.45206,0.409418,0.471878,2618.028685,2511.308062,20.885303,5376.408294,161.499023,0.126535
4,-135.907242,101.794563,25.62307,23.123831,8.992074,4.891822,9.471533,-0.671894,0.736529,10.573339,...,0.385096,0.272601,0.253039,0.363305,2107.592914,2431.577165,23.659678,4736.722383,103.359375,0.082603
5,-124.01442,103.057861,-1.579663,8.649143,-8.500657,-6.627197,-20.605783,-9.192368,-21.916517,-8.039808,...,0.300686,0.167178,0.274421,0.170304,1946.668092,2203.037515,26.128935,4069.92749,123.046875,0.081782
6,-137.621063,84.09211,-8.891517,28.446003,3.140309,9.002275,5.252365,12.670362,3.9796,8.366396,...,0.39625,0.481415,0.600927,0.42678,2362.140776,2440.904377,21.228489,4973.706413,89.102909,0.099254
7,-195.491211,130.852936,19.90468,22.960194,7.628928,0.067079,7.393571,-0.221847,-2.634048,1.128535,...,0.376539,0.340813,0.275992,0.403166,1446.082976,1916.769463,23.550778,2843.806541,92.285156,0.05202
8,-38.100956,70.222244,-2.88067,17.448526,1.117762,12.198992,-0.531046,6.038986,0.556414,6.931718,...,0.421182,0.36076,0.467224,0.392254,2729.14616,2631.424067,22.012557,5741.808757,107.666016,0.126865
9,-107.875107,83.763847,-4.584987,15.001758,-4.840187,-4.34569,-0.965797,-4.735468,-2.55129,-1.484672,...,0.215571,0.407293,0.398789,0.235425,2354.484402,2448.346537,24.313879,4916.218943,117.453835,0.101199


In [18]:
# Normalize sound features
scaler = StandardScaler()
X_sound = scaler.fit_transform(sound_features_df)

In [19]:
display(pd.DataFrame(X_sound))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.221374,-0.666057,-0.689739,-0.585998,0.937192,0.877774,0.332603,1.087511,0.304205,0.500752,...,1.115767,0.655013,-0.248954,0.532259,0.271923,0.43585,-0.73249,0.521112,0.752904,-0.2534
1,-0.510053,0.007526,0.533089,1.441714,1.120276,-0.51543,0.645565,-0.12757,0.729885,-0.668856,...,-1.470396,-0.003071,1.273919,-0.517209,0.128329,0.787133,0.47584,0.27929,-0.728401,-0.231444
2,-2.078734,-0.212152,0.69327,0.58889,0.044804,0.231889,0.310952,-0.199943,1.042305,-0.457297,...,-0.251938,1.172055,-0.605893,0.200572,0.784623,1.084421,0.233852,0.786667,0.169359,0.999624
3,1.355245,-1.159797,-1.227862,-0.949747,0.094365,0.618097,0.363856,0.373891,0.446558,0.488935,...,0.57978,0.760002,0.182102,1.350834,0.94409,0.330801,-1.47745,0.665916,2.357651,1.332814
4,-0.035603,0.693531,1.86392,0.53218,0.969967,0.205304,0.872622,-0.409226,0.138114,1.472711,...,0.523037,-1.119018,-1.24105,0.100819,-0.496171,-0.039355,0.36797,-0.137311,-0.530894,-0.674199
5,0.155126,0.766312,-0.398659,-1.777365,-2.051293,-1.782479,-2.774236,-1.831041,-2.754426,-2.050238,...,-0.697097,-2.222852,-1.046459,-2.121251,-0.950242,-1.100363,2.010435,-0.974577,0.447238,-0.7117
6,-0.063088,-0.32634,-1.00682,1.381372,-0.040722,0.914626,0.36105,1.817201,0.552218,1.054998,...,0.684269,1.06736,1.924956,0.831614,0.222069,0.003947,-1.249174,0.16026,-1.239196,0.086477
7,-0.99117,2.367638,1.388295,0.50607,0.734531,-0.627279,0.62067,-0.334127,-0.292271,-0.314927,...,0.399337,-0.404806,-1.032159,0.559743,-2.362708,-2.429378,0.295533,-2.514166,-1.081093,-2.071366
8,1.532947,-1.125409,-0.506869,-0.373357,-0.390046,1.466269,-0.340185,0.710621,0.115115,0.783454,...,1.044646,-0.195954,0.708172,0.434114,1.257622,0.888446,-0.727639,1.124734,-0.316928,1.347867
9,0.413957,-0.345252,-0.648625,-0.763758,-1.419074,-1.38877,-0.392898,-1.087316,-0.281704,-0.809534,...,-1.927406,0.29127,0.085366,-1.371495,0.200465,0.038498,0.803122,0.088075,0.169359,0.175327


In [20]:
num_columns_X_sound = X_sound.shape[1]

In [21]:
num_genres = 3

In [22]:
# Dummy output values
y_genre = np.array([0, 1, 0, 1, 2, 2, 0, 0, 1, 0])
y_popularity = np.array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0])
y_danceability = np.array([0.23, 0.9, 0.4, 0.67, 0.5, 0.5, 0.3, 0.2, 0.12, 0.4])
y_energy = np.array([0.35, 0.56, 0.45, 0.75, 0.22, 0.9, 0.75, 0.3, 0.43, 0.5])

In [88]:
# Sound modality model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from efficientnet.tfkeras import EfficientNetB0


input_sound = Input(shape=(num_columns_X_sound,), name='input_sound')
sound_layer1 = Dense(64, activation='relu', name='sound_embedding')(input_sound)
sound_embedding = Dense(8, activation='relu', name='sound_output')(sound_layer1)

# TODO: Text modality model
# input_text = Input(shape=(num_columns_X_text,), name='input_text')
# text_layer1 = Dense(64, activation='relu', name='text_embedding')(input_text)
# text_embedding = Dense(8, activation='relu', name='text_output')(text_layer1)

# Define input tensor for images album
input_album = Input(shape=(224, 224, 3), name='input_album')
# Load pre-trained EfficientNetB0 model without the top classification layer
efficientnet_model_album = EfficientNetB0(include_top=False, weights='imagenet')
# Freeze all layers in the EfficientNet model
for layer in efficientnet_model.layers:
    layer.trainable = False
# Pass the input tensor through the EfficientNet model to get the output
efficientnet_output_album = efficientnet_model_album(input_album)
# Apply Global Average Pooling to reduce spatial dimensions
pooling_output = GlobalAveragePooling2D()(efficientnet_output_album)
# Add a dense layer for further feature extraction
dense_layer_image = Dense(64, activation='relu', name='image_embedding_album')(pooling_output)
# Add another dense layer for the final output
image_embedding_album = Dense(8, activation='relu', name='image_output_album')(dense_layer_image)

# Define input tensor for images spectograms
input_spectogram = Input(shape=(224, 224, 3), name='input_spectogram')
# Load pre-trained EfficientNetB0 model without the top classification layer
efficientnet_model_spec = EfficientNetB0(include_top=False, weights='imagenet')
efficientnet_model_spectrogram_wrapper = tf.keras.Model(inputs=efficientnet_model_spec.input, 
                                                        outputs=efficientnet_model_spec.output, 
                                                        name='efficientnet_spectrogram')
# Freeze all layers in the EfficientNet model
for layer in efficientnet_model.layers:
    layer.trainable = False
# Pass the input tensor through the EfficientNet model to get the output
efficientnet_output_spec = efficientnet_model_spectrogram_wrapper(input_spectogram)
# Apply Global Average Pooling to reduce spatial dimensions
pooling_output = GlobalAveragePooling2D()(efficientnet_output_spec)
# Add a dense layer for further feature extraction
dense_layer_image = Dense(64, activation='relu', name='image_embedding_spec')(pooling_output)
# Add another dense layer for the final output
image_embedding_spec = Dense(8, activation='relu', name='image_output_spec')(dense_layer_image)

# Combine the embeddings
# concatenated_output = Concatenate()([sound_embedding, text_embedding, image_embeddng])
concatenated_output = Concatenate()([sound_embedding, image_embedding_album, image_embedding_spec])

# Four output layers
genre_output_layer = Dense(num_genres, activation='softmax', name='genre_output')(concatenated_output)
popularity_output_layer = Dense(1, activation='sigmoid', name='popularity_output')(concatenated_output)
danceability_output_layer = Dense(1, activation='sigmoid', name='danceability_output')(concatenated_output)
energy_output_layer = Dense(1, activation='sigmoid', name='energy_output')(concatenated_output)

In [89]:
# model = Model(inputs=[input_sound, input_text, input_image], outputs=[output_layer1, output_layer2, output_layer3, output_layer4])
model = Model(inputs=[input_sound, input_album, input_spectogram], outputs=[genre_output_layer, popularity_output_layer, danceability_output_layer, energy_output_layer])

model.compile(optimizer='adam', loss={'genre_output': 'sparse_categorical_crossentropy', 'popularity_output': 'mean_squared_error', 'danceability_output': 'mean_squared_error', 'energy_output': 'mean_squared_error'}, 
              metrics={'genre_output': 'accuracy', 'popularity_output': 'mean_squared_error', 'danceability_output': 'mean_squared_error', 'energy_output': 'mean_squared_error'})

# Train model
model.fit([X_sound, X_image_album_cover, X_image_spectogram], [y_genre, y_popularity, y_danceability, y_energy], epochs=10, batch_size=32, validation_split=0.2)

print(model.predict([X_sound, X_image_album_cover, X_image_spectogram]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[array([[1.1805683e-01, 8.8194317e-01, 6.8927680e-12],
       [7.0975549e-03, 9.9290252e-01, 2.5363576e-13],
       [5.7740878e-02, 9.4225919e-01, 1.9123999e-12],
       [7.7157893e-04, 9.9922848e-01, 3.5753375e-12],
       [2.7907804e-01, 7.2092199e-01, 6.1934013e-12],
       [6.8845763e-03, 9.9311548e-01, 1.8582200e-12],
       [3.3386368e-01, 6.6613632e-01, 2.1170730e-12],
       [4.4800611e-03, 9.9551988e-01, 1.0379741e-12],
       [1.7437108e-01, 8.2562894e-01, 3.9607692e-12],
       [2.0043531e-02, 9.7995651e-01, 6.9007369e-12]], dtype=float32), array([[2.5925408e-08],
       [1.2123601e-07],
       [3.4628894e-08],
       [9.3759624e-08],
       [2.9199299e-08],
       [8.3379618e-08],
       [2.5702958e-08],
       [9.9284087e-08],
       [2.9138935e-08],
       [1.8587057e-08]], dtype=float32), array([[0.9999999 ],
       [1.        ],
       [0.99999994],
       [1. 