CRNN Model for Music Embeddings

In [8]:
# STEP 1: Imports
import os
import sqlite3

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, Dropout, GRU, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2025-07-08 20:57:06.246561: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-08 20:57:07.523838: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-08 20:57:16.516659: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-08 20:57:23.856973: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752001050.531500  339459 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752001052.24

In [9]:
# STEP 2: Connect to the PostgreSQL database
# The connection URL is read from the MUSICDB_URL environment variable so that
# no password is stored in the notebook. The fallback URL deliberately carries
# no password; when it is used, the credentials have to come from the driver's
# own mechanisms (for libpq-based drivers: the PGPASSWORD variable or ~/.pgpass).
DATABASE_URL = os.environ.get(
    "MUSICDB_URL",
    "postgresql://postgres@127.0.0.1:5432/musicdb",
)
engine = create_engine(DATABASE_URL, echo=False)

# Open a short-lived connection. It is closed as soon as the `with` block
# exits, so `conn` must not be reused by later cells; open a new one from
# `engine` instead.
with engine.connect() as conn:
    # Preview the first 100 rows of the track table as a DataFrame
    df = pd.read_sql("SELECT * FROM track LIMIT 100", conn)

df.head()

Unnamed: 0,id,track_id,title,artist_id,genre_id,duration,year,artist_familiarity,tempo,key,mode,time_signature,search_vector
0,20,TRAABXG128F9318EBD,Synthetic Dream,1486,2,165.69424,,0.320314,138.331,7,1,4,"'dream':2,5 'lextric':3 'lextrical':6 'synthet..."
1,21,TRAABYN12903CFD305,Broken-Down Merry-Go-Round,1960,9,151.84934,,0.394139,86.186,1,1,3,"'broken':2,11 'broken-down':1,10 'down':12 'go..."
2,22,TRAABYW128F4244559,Kassie Jones,726,8,220.78649,,0.489816,110.382,11,1,1,'alic':3 'alice':7 'jone':2 'jones':6 'kassi':...
3,23,TRAACCG128F92E8A55,Setanta matins,1040,2,269.58322,,0.67742,111.874,2,1,4,"'elena':3,6 'matin':2 'matins':5 'setanta':1,4"
4,24,TRAACER128F4290F96,Setting Fire to Sleeping Giants,2706,5,207.77751,2004.0,0.839963,166.862,7,1,4,'dilling':7 'dillinger':16 'escap':8 'escape':...


In [None]:
# STEP 3: Load segment data (with genre for labels)
# The 12 timbre and 12 pitch columns are generated explicitly: an ellipsis
# placeholder inside the SELECT list is not valid SQL.
timbre_cols = [f"s.timbre_{i}" for i in range(12)]
pitch_cols = [f"s.pitch_{i}" for i in range(12)]

# Feature columns keep the order timbre_*, pitch_*, loudness_max, confidence.
# genre_id comes from the joined track row and serves as the class label.
select_list = ",\n  ".join(
    ["s.track_id AS track_pk", "s.segment_index"]
    + timbre_cols
    + pitch_cols
    + ["s.loudness_max", "s.confidence", "t.genre_id AS genre"]
)

query = f"""
SELECT
  {select_list}
FROM segment AS s
  JOIN track AS t
    ON s.track_id = t.id
ORDER BY
  s.track_id,
  s.segment_index;
"""

# The connection opened in STEP 2 was closed when its `with` block ended,
# so open a fresh one from the engine for this query.
with engine.connect() as conn:
    df = pd.read_sql_query(query, conn)

# The grouping cell keys on "trackID"; expose the track key under that name
# as well, keeping the original "track_pk" column intact.
df["trackID"] = df["track_pk"]


KeyboardInterrupt



In [None]:
# STEP 4: Preprocess features
# Take every timbre_*/pitch_* column in DataFrame order, then append the two
# scalar per-segment columns.
FEATURE_COLUMNS = [name for name in df.columns if name.startswith(("timbre_", "pitch_"))]
FEATURE_COLUMNS += ["loudness_max", "confidence"]

# Width of one segment's feature vector (expected to be 26)
feature_dim = len(FEATURE_COLUMNS)

In [None]:
# STEP 5: Group by trackID and build fixed-length sequences
num_segments = 500
grouped = df.groupby("trackID")


def _fit_to_length(segments, length):
    """Return `segments` with exactly `length` rows.

    Shorter inputs are zero-padded at the end along the first axis;
    longer inputs are cut off after `length` rows.
    """
    missing = length - segments.shape[0]
    if missing > 0:
        return np.pad(segments, ((0, missing), (0, 0)), mode='constant')
    return segments[:length]


X = []
y = []
for _track_key, track_rows in grouped:
    segment_matrix = track_rows[FEATURE_COLUMNS].to_numpy()
    # The label is read from the first row of the track's group
    track_label = track_rows["genre"].iloc[0]

    X.append(_fit_to_length(segment_matrix, num_segments))
    y.append(track_label)

# Stack the per-track matrices: (num_tracks, num_segments, len(FEATURE_COLUMNS))
X = np.array(X)

In [None]:
# STEP 6: Encode labels
# Map the raw labels to integer class ids, then expand the ids to one-hot rows.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Width of the one-hot matrix = number of classes the model must predict
num_classes = y_categorical.shape[-1]

In [None]:
# STEP 7: Build the CRNN model
inputs = Input(shape=(num_segments, feature_dim))

# Convolutional front end: three Conv1D -> BatchNormalization -> Dropout
# stages that differ only in their number of filters.
x = inputs
for n_filters in (64, 128, 128):
    x = Conv1D(n_filters, kernel_size=3, padding='same', activation='elu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)

# Recurrent back end: the first GRU emits the full sequence, the second
# reduces it to a single 64-dimensional vector per track.
x = GRU(64, return_sequences=True)(x)
x = GRU(64)(x)

# Classification head over the encoded classes
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# STEP 8: Train the model
# validation_split=0.2 holds out 20% of the samples for validation.
history = model.fit(
    x=X,
    y=y_categorical,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)