### Data Processing Script

> Note: This code should not be needed; it is only here in case you want to download and reprocess
> the original data from Kaggle. If you do, please visit [https://www.kaggle.com/datasets/jorgeruizdev/ludwig-music-dataset-moods-and-subgenres/](https://www.kaggle.com/datasets/jorgeruizdev/ludwig-music-dataset-moods-and-subgenres/).

In [None]:
# import the needed modules
from IPython.display import Audio as player
from datasets import load_dataset, Audio
from panns_inference import AudioTagging
from qdrant_client import QdrantClient
from qdrant_client.http import models
from os.path import join
from glob import glob
import pandas as pd
import numpy as np
import librosa
import openl3
import torch
import os

In [None]:
# retrieve all the mp3s for all the genres
# layout walked below: <data_root>/mp3/mp3/<genre>/<file>
data_root = "./data"
mp3_data = []
mp3_root = join(data_root, "mp3", "mp3")
# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# file list (and anything derived from its order) reproducible between runs
for genre in sorted(os.listdir(mp3_root)):
    gdir = join(mp3_root, genre)
    # skip stray files (e.g. .DS_Store) that sit next to the genre folders;
    # calling os.listdir on them would raise NotADirectoryError
    if not os.path.isdir(gdir):
        continue
    mp3_data.extend(join(gdir, mp3) for mp3 in sorted(os.listdir(gdir)))

# quick sanity check: number of files found and a small sample of the paths
print(len(mp3_data))
print(mp3_data[:5])

In [None]:
# NOTE: this will run for quite some time (5-20 minutes)
# load the audio files listed in mp3_data from the filesystem
music_data = load_dataset("audiofolder", data_files=mp3_data, split="train", drop_labels=True)
print("loaded dataset....")

# use the file names (without ".mp3") as ids
# os.path.basename is used instead of splitting on "/" so the id is still just
# the file name when the stored path uses the platform's own separator
# NOTE(review): indexing music_data[i] presumably decodes every audio file just
# to read its path, which may account for much of the runtime -- confirm
ids = [
    os.path.basename(music_data[i]['audio']['path']).replace(".mp3", '')
    for i in range(len(music_data))
]

# positional index: one consecutive integer per sample
index = list(range(len(music_data)))
music_data = music_data.add_column("index", index)
music_data = music_data.add_column("ids", ids)
# notebook cell output: show the last sample to verify the new columns
music_data[-1]

In [None]:
# write the dataset (audio plus the "index" and "ids" columns) to disk as Apache Arrow
arrow_dir = "./data/complete_music_data_set.arrow"
music_data.save_to_disk(arrow_dir)

In [None]:
# metadata stage: start from the raw label file that sits directly under data_root
label_path = os.path.join(data_root, "labels.json")
labels = pd.read_json(path_or_buf=label_path)

# create a helper function to explode/split out the data to get it into a prettier dataframe
def get_metadata(x):
    """Flatten one raw track record into a Series of plain metadata values.

    For each of the columns 'artist', 'genre', 'name' and 'subgenres' the
    first value of the nested dict ``x[col]`` is taken. Any column that is
    missing, empty, or not dict-like is filled with the string "Unknown".

    Parameters
    ----------
    x : mapping
        One entry of the raw ``tracks`` column. Presumably each field is a
        single-entry dict wrapping the real value -- TODO confirm against
        labels.json.

    Returns
    -------
    pd.Series
        Indexed by ['artist', 'genre', 'name', 'subgenres'].
    """
    cols = ['artist', 'genre', 'name', 'subgenres']
    list_of_cols = []
    for col in cols:
        try:
            mdata = list(x[col].values())[0]
        # only the failures that mean "value not available": missing key,
        # non-subscriptable record (e.g. NaN), non-dict field, empty dict.
        # A bare except would also swallow KeyboardInterrupt/SystemExit.
        except (KeyError, TypeError, AttributeError, IndexError):
            mdata = "Unknown"
        list_of_cols.append(mdata)

    return pd.Series(list_of_cols, index=cols)
# run get_metadata over every entry of the 'tracks' column, then move the
# resulting frame's index into a regular column via reset_index()
track_frame = labels['tracks'].apply(get_metadata)
clean_labels = track_frame.reset_index()

# create a helper to explode the sub genres
def get_vals(genres):
    """Collect every value of every dict found in ``genres`` into a flat list.

    String entries are skipped. This also covers the "Unknown" placeholder:
    iterating that string yields single-character strings, so the result is
    an empty list.

    Parameters
    ----------
    genres : iterable
        Typically a list of dicts, or a plain string placeholder.

    Returns
    -------
    list
        The dict values in encounter order; empty if there are none.
    """
    # isinstance (rather than an exact type comparison) also skips str
    # subclasses, which have no .items() and would otherwise raise
    return [
        val
        for entry in genres
        if not isinstance(entry, str)
        for val in entry.values()
    ]
# apply it: replace each row's raw subgenre entry with a flat list of values
clean_labels['subgenres'] = clean_labels.subgenres.apply(get_vals)

# build a list of the file locations, keyed by file name without ".mp3"
# (os.path.basename instead of split('/') so the id is still only the file
# name when the paths use the platform's own separator)
ids = [os.path.basename(path).replace(".mp3", '') for path in mp3_data]
music_paths = pd.DataFrame(zip(ids, mp3_data), columns=["ids", 'urls'])

# now join the music_data, with the labels and the paths all into one data frame to use for the next stage
# both sides of the first merge carry an 'index' column, so pandas suffixes
# them: index_x (from music_data) is kept as 'index', index_y (from the
# labels, equal to the join key) is dropped
metadata = (music_data.select_columns(['index', 'ids'])
                     .to_pandas()
                     .merge(right=clean_labels, how="left", left_on='ids', right_on='index')
                     .merge(right=music_paths, how="left", on='ids')
                     .drop("index_y", axis=1)
                     .rename({"index_x": "index"}, axis=1)
           )
# save the complete frame (including 'index' and 'ids') as json
# NOTE(review): this cell used to also compute
# metadata.drop(['index', 'ids'], axis=1).to_dict(orient="records") and throw
# the result away; it never affected the saved file, so it was removed.
# NOTE(review): "metatdata" in the file name looks like a typo, but it is kept
# because other code may read this exact path -- confirm before renaming.
metadata.to_json("./data/metatdata_complete_music_data_set.json")