Data source: The McGill Billboard Project (Chord Analysis Dataset) — https://ddmal.music.mcgill.ca/research/The_McGill_Billboard_Project_(Chord_Analysis_Dataset)/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from pathlib import Path
import re
from collections import Counter
import pickle
from tqdm import tqdm
import random

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Conv1D, MaxPooling1D, Flatten, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Notebook-wide configuration constants.
# NOTE(review): the cells that consume these values are not in this section;
# the meanings below are inferred from the names only — confirm against usage.

SEQUENCE_LENGTH = 5   # presumably the number of steps per input window
BATCH_SIZE = 256      # presumably the training mini-batch size
SAMPLE_SIZE = 890     # presumably the number of songs to use
EPOCHS = 50           # presumably the maximum number of training epochs

In [21]:
# Locations of the McGill Billboard dataset files, relative to the notebook's
# working directory.
data_dir = Path('data/McGill-Billboard')
chordino_dir = data_dir / 'chordino'        # per-song folders holding bothchroma.csv / tuning.csv
lab_dir = data_dir / 'lab'                  # per-song folders holding full.lab
annotations_dir = data_dir / 'annotations'
index_path = data_dir / 'index.csv'         # dataset index, read with pd.read_csv


In [22]:
# Read the dataset index and report how many of its rows carry a title
index_df = pd.read_csv(index_path)

n_index_rows = len(index_df)
n_titled_rows = index_df['title'].notna().sum()

print(f"Total entries in index: {n_index_rows}")
print(f"Entries with complete data: {n_titled_rows}")


Total entries in index: 1300
Entries with complete data: 890


In [23]:
# Keep only the rows that have a title; rows without one are treated as
# incomplete/unavailable entries.
index_df = index_df.dropna(subset=['title'])

# Confirm how many rows survived the filter
n_rows_kept = len(index_df)
print(f'Total Entries after cleaning: {n_rows_kept}')

Total Entries after cleaning: 890


In [24]:
# Preview the first five rows of the cleaned index
index_df.head(n=5)

Unnamed: 0,id,chart_date,target_rank,actual_rank,title,artist,peak_rank,weeks_on_chart
2,3,1961-07-03,56,57.0,I Don't Mind,James Brown,47.0,8.0
3,4,1971-08-07,32,31.0,You've Got A Friend,"Roberta Flack,Donny Hathaway",29.0,12.0
5,6,1980-08-02,15,16.0,The Rose,Bette Midler,3.0,25.0
9,10,1984-03-24,49,51.0,An Innocent Man,Billy Joel,10.0,18.0
11,12,1980-11-15,65,63.0,Lookin' For Love,Johnny Lee,5.0,21.0


In [25]:
# Count distinct titles and distinct artist strings in the cleaned index
n_distinct_titles = index_df['title'].nunique()
n_distinct_artists = index_df['artist'].nunique()

print(f"Number of unique songs: {n_distinct_titles}")
print(f"Number of unique artists: {n_distinct_artists}")

Number of unique songs: 732
Number of unique artists: 420


In [26]:
# Parse chart_date as datetime, then derive the calendar year and the decade
# (the year floored to a multiple of 10) as additional columns. Later keyword
# arguments of assign() see the columns produced by earlier ones.
index_df = index_df.assign(
    chart_date=lambda frame: pd.to_datetime(frame['chart_date']),
    year=lambda frame: frame['chart_date'].dt.year,
    decade=lambda frame: (frame['year'] // 10) * 10,
)

In [27]:
# Number of index rows per decade, ordered chronologically by decade
decade_counts = (
    index_df['decade']
    .value_counts()
    .sort_index()
)

print(decade_counts)

decade
1950     27
1960    252
1970    316
1980    252
1990     43
Name: count, dtype: int64


In [28]:
def _subdir_names(root):
    """Return the names of the immediate sub-directories of `root`."""
    return [entry.name for entry in root.iterdir() if entry.is_dir()]


# Each song is a sub-directory; its folder name is used as the song ID
song_ids = _subdir_names(chordino_dir)
print(f"Number of songs with chroma features: {len(song_ids)}")

# Same listing for the chord-label folders
lab_ids = _subdir_names(lab_dir)
print(f"Number of songs with lab files: {len(lab_ids)}")

# IDs present in both listings
common_ids = set(song_ids) & set(lab_ids)
print(f"Number of songs with both features and labels: {len(common_ids)}")

Number of songs with chroma features: 890
Number of songs with lab files: 890
Number of songs with both features and labels: 890


In [29]:
# Explore one example song.
# random.randint(a, b) includes b, so indexing a list with
# randint(0, len(...)) can land one past the end and raise IndexError.
# random.choice always returns a valid element. Sorting the set first gives a
# stable candidate order, so the pick is repeatable when `random` is seeded.
example_id = random.choice(sorted(common_ids))

# Load chroma features for the example song
chroma_path = chordino_dir / example_id / 'bothchroma.csv'
tuning_path = chordino_dir / example_id / 'tuning.csv'

if chroma_path.exists() and tuning_path.exists():
    # Both files are read without a header row, so columns are numbered 0..N-1
    chroma_data = pd.read_csv(chroma_path, header=None)
    tuning_data = pd.read_csv(tuning_path, header=None)

    print(f"\nChroma shape for song {example_id}: {chroma_data.shape}")
    print(f"Tuning shape for song {example_id}: {tuning_data.shape}")

    # Display a sample of the chroma data
    print("\nSample of chroma data:")
    print(chroma_data.head())
else:
    # chroma_data / tuning_data stay undefined in this branch
    print(f"Chroma or tuning data not found for song {example_id}")

# Load chord labels for the example song
lab_path = lab_dir / example_id / "full.lab"

if lab_path.exists():
    # Read the lab file (tab-separated with no header): one row per chord
    # segment, given as start time, end time and chord label
    lab_data = pd.read_csv(lab_path, sep='\t', header=None, names=['start_time', 'end_time', 'chord'])

    print(f"\nNumber of chord segments for song {example_id}: {len(lab_data)}")
    print("\nSample of chord labels:")
    print(lab_data.head())

    # Count the unique chords in this song
    print(f"\nNumber of unique chords in song {example_id}: {lab_data['chord'].nunique()}")
    print("\nMost common chords:")
    print(lab_data['chord'].value_counts().head(10))
else:
    print(f"Lab file not found for song {example_id}")


Chroma shape for song 0468: (3322, 26)
Tuning shape for song 0468: (1, 5)

Sample of chroma data:
               0        1    2    3    4    5    6    7    8    9   ...   16  \
0  /tmp/audio.wav  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
1             NaN  0.04644  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
2             NaN  0.09288  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
3             NaN  0.13932  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
4             NaN  0.18576  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   

    17   18   19   20   21   22   23   24   25  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 26 columns]

Number of chord segments for song 0468: 129

Sample of chord labels:
   start_time  end_time chord
0  

In [30]:
# Show the example song's chroma table again: its shape, then the first rows
chroma_shape = chroma_data.shape
print(f"Chroma shape for song {example_id}: {chroma_shape}")

print("\nSample of chroma data:")
chroma_preview = chroma_data.head()
print(chroma_preview)

Chroma shape for song 0468: (3322, 26)

Sample of chroma data:
               0        1    2    3    4    5    6    7    8    9   ...   16  \
0  /tmp/audio.wav  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
1             NaN  0.04644  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
2             NaN  0.09288  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
3             NaN  0.13932  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
4             NaN  0.18576  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   

    17   18   19   20   21   22   23   24   25  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 26 columns]


In [31]:
# Load chord labels
lab_data = pd.read_csv(lab_dir / example_id / "full.lab", sep='\t', names=['start', 'end', 'chord'])

print(f"Number of chord segments for song {example_id}: {len(lab_data)}")
print("\nSample of chord segments:")
print(lab_data.head())

Number of chord segments for song 0468: 129

Sample of chord segments:
      start       end chord
0  0.000000  0.464399     N
1  0.464399  1.202543     N
2  1.202543  1.448592     N
3  1.448592  3.416977     N
4  3.416977  4.401169     N


In [32]:
# Number of distinct chord labels in the example song (missing values excluded)
chord_column = lab_data['chord']
unique_chords = chord_column.nunique(dropna=True)
print(f"Number of unique chords in song {example_id}: {unique_chords}")

Number of unique chords in song 0468: 4


In [33]:
# Ten most frequent chord labels; value_counts() already sorts by descending
# frequency, so the first ten rows are the top ten
chord_frequencies = lab_data['chord'].value_counts()
most_common_chords = chord_frequencies.iloc[:10]

print("Most common chords:")
print(most_common_chords)

Most common chords:
chord
C:maj    60
F:maj    47
N        15
G:maj     7
Name: count, dtype: int64
