In [34]:
import os
from dotenv import load_dotenv
import musicbrainzngs
import pprint
import requests
import lyricsgenius
import pandas as pd
import pprint
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# --- Credentials: each one is None when its environment variable is not set ---
GENIUS_API_TOKEN = os.environ.get("GENIUS_API_TOKEN")
LASTFM_API_KEY = os.environ.get("LAST_FM_API_KEY")
SPOTIPY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
SPOTIPY_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")

# --- MusicBrainz (metadata): register the user agent sent with every request ---
musicbrainzngs.set_useragent("music-study", "0.1", "email@email.com")

# --- Genius (lyrics) ---
genius = lyricsgenius.Genius(
    GENIUS_API_TOKEN,
    skip_non_songs=True,
    remove_section_headers=True,
)

# --- Spotify (client-credentials auth) ---
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_SECRET,
    )
)

Building the records for each data point. Each row will contain info about the song, such as the name, artist, playcount, listeners, and other features. We are looking to have a few columns that are the target (y) and a lot of columns that lead to the decisions (features).


OK, so there are going to be songs in MusicBrainz that are not in AcousticBrainz. This will likely introduce null values. In the automation process, when we zip data together to create our dataset, I guess it's inevitable that null values will show up. To be dealt with later (during cleaning).

There is some temporal bias involved here: an older song has had more time to accumulate playcount and listeners, which could distort our notion of popularity. The playcount and listener numbers should be normalized or averaged per year (even that's not enough, considering how the mediums for listening to songs have changed as well -> this is a question for another day, but let's keep in mind that we are dealing with temporal bias here).

OK, going in a different direction -> using the Million Song Dataset (not sure about this yet).

In [3]:
# File name of the CSV that holds the parsed AcousticBrainz high-level features.
csv_name = "acousticbrainz_sample_1.csv"

In [6]:
import os
import json
import pandas as pd

# Folder with the extracted JSONs
# DATA_DIR = "/Users/ruchipatil/acousticbrainz_dumps/acousticbrainz-highlevel-json-20220623"  # ← update this if your path is different

def parse_highlevel_features(json_path):
    """Extract a flat feature record from one AcousticBrainz high-level JSON file.

    Parameters
    ----------
    json_path : str or os.PathLike
        Path to a ``<id>.json`` file. Its top-level ``highlevel`` object is
        expected to map classifier names to result objects that carry a
        ``value`` field.

    Returns
    -------
    dict
        ``mbid`` (the file name without its trailing ``.json`` extension) plus
        one entry per classifier listed below. A classifier that is absent, or
        whose entry is not an object, yields ``None``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON is UTF-8; do not depend on the platform's locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # `or {}` also covers an explicit `"highlevel": null`.
    hl = data.get('highlevel') or {}

    def value_of(classifier):
        # Tolerate a missing or null classifier entry instead of crashing on .get().
        entry = hl.get(classifier)
        return entry.get('value') if isinstance(entry, dict) else None

    # Strip only the trailing extension; str.replace() would also remove a
    # ".json" that happens to occur in the middle of the file name.
    file_name = os.path.basename(json_path)
    if file_name.endswith('.json'):
        file_name = file_name[:-len('.json')]

    return {
        "mbid": file_name,
        "danceability": value_of('danceability'),
        "mood": value_of('mood_acoustic'),
        "genre": value_of('genre_dortmund'),
        "voice_instrumental": value_of('voice_instrumental'),
        "gender": value_of('gender'),
        "timbre": value_of('timbre'),
        "tonality": value_of('tonal_atonal'),
    }

# NOTE: DATA_DIR and csv_name are defined in other cells; run those first.

# Upper bound on how many JSON files are parsed in one run.
MAX_FILES = 999999

# os.walk() silently yields nothing for a missing directory, which would end in
# an empty CSV; fail loudly instead.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(f"DATA_DIR is not an existing directory: {DATA_DIR}")

# Collect every .json file under DATA_DIR, including those in nested sub-folders.
all_files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(DATA_DIR)
    for name in names
    if name.endswith(".json")
]

records = []
for path in all_files[:MAX_FILES]:
    try:
        records.append(parse_highlevel_features(path))
    except Exception as e:
        # Best effort: report the unreadable file and keep going.
        print(f"Error with parsing record at path {path}: {e}")

df = pd.DataFrame(records)

# Save it for future use
df.to_csv(csv_name, index=False)
print(f"Parsed {len(df)} tracks and saved to {csv_name}")


✅ Parsed 999999 tracks and saved to acousticbrainz_sample.csv


In [4]:
# Folder that is searched for the extracted AcousticBrainz high-level JSON files.
# Set ACOUSTICBRAINZ_DATA_DIR (in the environment or in .env) to point at a
# different location; the machine-specific path below is only the default.
DATA_DIR = os.getenv(
    "ACOUSTICBRAINZ_DATA_DIR",
    "/Users/ruchipatil/acousticbrainz_dumps/acousticbrainz-highlevel-json-20220623/highlevel",
)

In [3]:
# Reload the saved feature table, then print its shape, its first two rows and
# its column index, one after the other.
df = pd.read_csv("acousticbrainz_sample_1.csv")
print(df.shape, df.head(2), df.columns, sep="\n")

(999999, 8)
                                     mbid   danceability          mood  \
0  619f63d9-9303-431b-b413-1681b49ae1f7-0  not_danceable  not_acoustic   
1  619f60fa-b680-4735-a635-fc0f03715227-0      danceable  not_acoustic   

        genre voice_instrumental  gender  timbre tonality  
0  electronic              voice    male  bright   atonal  
1  electronic       instrumental  female  bright   atonal  
Index(['mbid', 'danceability', 'mood', 'genre', 'voice_instrumental', 'gender',
       'timbre', 'tonality'],
      dtype='object')


OK, so that was the starting point: our dataset is defined by what is available in AcousticBrainz. Now we stitch it together with Last.fm and Genius data.

So now:
1. Last.fm -> playcount, listeners, tags
2. Genius -> lyrics text (to be honest, this could be a whole other project of its own)

In [35]:
# Load the feature table from the CSV named by csv_name (csv_name must already be defined).
acoustic_df = pd.read_csv(csv_name)

In [42]:
def connect_lastfm(row):
    """Look up Last.fm popularity data for one track row.

    Parameters
    ----------
    row : pandas.Series or dict
        Must provide ``mbid`` in the form ``<recording-mbid>-<n>`` (the trailing
        ``-<n>`` part is dropped before the lookup). ``title`` and ``artist``
        are optional and only used in the progress message.

    Returns
    -------
    pandas.Series
        ``playcount`` (int or None), ``listeners`` (int or None) and ``tags``
        (list of tag names, or None). All three are None whenever the lookup
        cannot be completed: missing or blank mbid, non-200 response, no
        ``track`` object in the payload, or any exception during the request.
    """
    def build(playcount=None, listeners=None, tags=None):
        # A fresh Series per call, so no caller can mutate a shared default.
        return pd.Series({"playcount": playcount, "listeners": listeners, "tags": tags})

    def to_int(value):
        # Counters are expected as numeric strings (see the Last.fm track.getInfo
        # docs); tolerate absent or malformed values.
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    raw_mbid = row.get("mbid")
    # Check BEFORE any string handling: a missing mbid arrives as NaN (a float)
    # or None and has no .split().
    if not isinstance(raw_mbid, str):
        return build()

    # Drop the trailing "-<n>" part to recover the bare MBID.
    mbid = "-".join(raw_mbid.split("-")[:-1])
    title = row.get("title")
    artist = row.get("artist")
    print(f"acoustic {mbid} --------  {title} by {artist}")
    if mbid == "":
        return build()

    try:
        # https keeps the API key off the wire in clear text; `params` lets
        # requests do the URL encoding; the timeout stops one stalled
        # connection from hanging a whole DataFrame.apply run.
        response = requests.get(
            "https://ws.audioscrobbler.com/2.0/",
            params={
                "method": "track.getInfo",
                "mbid": mbid,
                "api_key": LASTFM_API_KEY,
                "format": "json",
            },
            timeout=10,
        )
        if response.status_code != 200:
            return build()
        track = response.json().get("track") or {}
    except Exception as e:
        print(f"error retreiving LAST FM info on mbid {mbid} with error: {e}")
        return build()

    if not isinstance(track, dict) or not track:
        # Payload without a usable track object (e.g. an API error body).
        return build()

    # Defensive parsing: accept "toptags" being absent or not an object, and
    # "tag" being either a single object or a list of objects.
    toptags = track.get("toptags")
    tag_entries = toptags.get("tag", []) if isinstance(toptags, dict) else []
    if isinstance(tag_entries, dict):
        tag_entries = [tag_entries]
    tags = [t["name"] for t in tag_entries if isinstance(t, dict) and t.get("name")]

    return build(to_int(track.get("playcount")), to_int(track.get("listeners")), tags)

def get_metadata(mbid):
    """Fetch the title and first credited artist of a recording from MusicBrainz.

    Parameters
    ----------
    mbid : str
        Id in the form ``<recording-mbid>-<n>``; the trailing ``-<n>`` part is
        dropped before the lookup.

    Returns
    -------
    pandas.Series
        ``title`` and ``artist``. Both are None when ``mbid`` is missing or
        blank, or when the MusicBrainz lookup fails for any reason.
    """
    # A missing id comes through as NaN (a float) or None and has no .split();
    # handle it here instead of crashing outside the try block.
    if not isinstance(mbid, str):
        return pd.Series({"title": None, "artist": None})

    mbid = "-".join(mbid.split("-")[:-1])
    if not mbid:
        # Nothing left to look up, so skip the network call.
        return pd.Series({"title": None, "artist": None})

    try:
        result = musicbrainzngs.get_recording_by_id(mbid, includes=["artists"])
        recording = result["recording"]
        return pd.Series({
            "title": recording["title"],
            # Only the first artist credit is kept.
            "artist": recording["artist-credit"][0]["artist"]["name"],
        })
    except Exception as e:
        # Best effort: report the failure and return an empty record.
        print(f"Could not retrieve musicBrainz result for {mbid} : {e}")
        return pd.Series({"title": None, "artist": None})
            
              
              

In [44]:
# Spot-check the MusicBrainz lookup on one arbitrarily chosen row (position 56576)
# of the feature table, and display the result as the cell output.
sukhi = get_metadata(acoustic_df.iloc[56576]["mbid"])
sukhi

title            Sukhi Evvaro
artist    Sanjay Subrahmanyan
dtype: object

In [36]:
# Demonstrate the id clean-up: cut at the last "-" so the trailing "-0" is dropped.
mbid = "9ba2ceb8-c6a0-4320-a0be-ed4e6783e810-0"
clean_mbid = mbid.rsplit("-", 1)[0]
print(clean_mbid)


9ba2ceb8-c6a0-4320-a0be-ed4e6783e810


In [12]:

# Set the user agent that musicbrainzngs sends with its requests.
# NOTE(review): the setup cell also calls set_useragent with different values;
# when the notebook runs top to bottom, this later call is the one in effect.
musicbrainzngs.set_useragent("music-trend-project", "0.1", "your_email@example.com")

# Search for a track by free-text query, asking for at most one result.
result = musicbrainzngs.search_recordings(query="Euphoria", limit=1)
pprint.pprint(result)
# Take the first hit; this raises IndexError if the search returned nothing.
# `track` is reused by the AcousticBrainz request cell further down.
track = result['recording-list'][0]

print("🎵 Title:", track['title'])
print("🧑‍🎤 Artist:", track['artist-credit'][0]['name'])
print("🔑 MBID:", track['id'])


{'recording-count': 10408,
 'recording-list': [{'artist-credit': [{'artist': {'alias-list': [{'alias': 'Scott '
                                                                            'Anthony '
                                                                            'Arceneaux '
                                                                            'Jr.',
                                                                   'sort-name': 'Arceneaux, '
                                                                                'Scott '
                                                                                'Anthony '
                                                                                'Jr.',
                                                                   'type': 'Legal '
                                                                           'name'},
                                                                  {'alias': 'Scrim',
               

In [3]:
import requests

mbid = track['id']  # from previous step
url = f"https://acousticbrainz.org/api/v1/{mbid}/high-level"

# Always bound the wait: without a timeout, requests can block indefinitely
# when the server stops responding.
response = requests.get(url, timeout=10)
if response.status_code == 200:
    data = response.json()
    print("🎛️ Acoustic Features (high-level):")
    print(data["highlevel"])
else:
    print(f"❌ Could not fetch features for {mbid}")


🎛️ Acoustic Features (high-level):
{'danceability': {'all': {'danceable': 0.00907512474805, 'not_danceable': 0.99092489481}, 'probability': 0.99092489481, 'value': 'not_danceable', 'version': {'essentia': '2.1-beta4', 'essentia_build_sha': 'b0b9016bb43cc2dafcda53132c1610db4853c6a1', 'essentia_git_sha': 'v2.1_beta4', 'extractor': 'music 1.0', 'gaia': '2.4.5', 'gaia_git_sha': 'v2.4.4-44-g95f4851', 'models_essentia_git_sha': 'v2.1_beta1'}}, 'gender': {'all': {'female': 0.20377586782, 'male': 0.796224117279}, 'probability': 0.796224117279, 'value': 'male', 'version': {'essentia': '2.1-beta4', 'essentia_build_sha': 'b0b9016bb43cc2dafcda53132c1610db4853c6a1', 'essentia_git_sha': 'v2.1_beta4', 'extractor': 'music 1.0', 'gaia': '2.4.5', 'gaia_git_sha': 'v2.4.4-44-g95f4851', 'models_essentia_git_sha': 'v2.1_beta1'}}, 'genre_dortmund': {'all': {'alternative': 3.45486239617e-09, 'blues': 4.17774437267e-09, 'electronic': 0.999997079372, 'folkcountry': 1.06058018901e-06, 'funksoulrnb': 6.4876736871

MusicBrainz seems to be a metadata database: it has information about the songs, artists, and releases, but not about the audio of the song itself?
MBID -> MusicBrainz ID
A UUID (long string of characters) that uniquely identifies a track, an artist, an album, a release group, etc.

This MBID will be important, as it is used as the foreign key for AcousticBrainz, which is another database (with other info?).



OK, so AcousticBrainz, I guess, goes further into categorizing the acoustics of the song... based on probabilities.
What is gaia?

Are they using human categorizing too? OK, anyway, this is pretty cool.

hopefully they have variety in nationalities and genres

# data set for exploratory analysis