<a href="https://colab.research.google.com/github/ruchithelamp/music/blob/main/Milestone_2_Recession_Music.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is designed to import pre-downloaded music API data, clean and concatenate it and limit to just English songs.

###First, this connects to our team Google Drive folder where our data downloads are all stored, combine all of that data into separate dataframes for each data source, then combine the two data sets using fuzzy matching for where the title of the album and artist match.

### Next, the Genius API was used in a local terminal (restrictions for the API's use exist under Colab) to match lyrics to the songs in the discogs/musicbrainz files, and these results were filtered to only include English lyrics.



In [13]:
#mounting Google Drive to this Google Colab
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

#imports and such
import os
import json
import pandas as pd
from itertools import chain

#Connecting to the shared drive folder (NOTE: You must have a copy of this in your My Drive for this to work!!)
team_folder = '/content/drive/My Drive/music_anthro_M2'


##This whole next chunk of code is for collecting and aggregating the data files for each year for musicbrainz and discogs
  #Still need to add genuis and acousticbrainz collections in here once these are added to the music_data_unzipped file!!
data_path = os.path.join(team_folder, 'music_data_unzipped')
years = [f for f in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, f))]


Mounted at /content/drive


These next two code blocks are designed to aggregate the discogs and musicbrainz data sets into two separate large data frames

In [5]:
#Collecting all Discogs data into one list
all_discogs = []

for year in years:
    discogs_path = os.path.join(data_path, year, 'discogs')
    if os.path.exists(discogs_path):
        for file in os.listdir(discogs_path):
            if file.endswith('.json'):
                with open(os.path.join(discogs_path, file), 'r') as f:
                    data = json.load(f)
                    results = data.get('results', [])
                    for item in results:
                        flat_item = {
                            'discogs_id': item.get('id'),
                            'track_title': item.get('title'),
                            'country': item.get('country'),
                            'release_year': str(item.get('year')),
                            'format': ', '.join(item['format']) if isinstance(item.get('format'), list) else item.get('format'),
                            'label': ', '.join(item['label']) if isinstance(item.get('label'), list) else item.get('label'),
                            'genre': ', '.join(item['genre']) if isinstance(item.get('genre'), list) else item.get('genre'),
                            'style': ', '.join(item['style']) if isinstance(item.get('style'), list) else item.get('style'),
                            'artist': item.get('artist_joined', ''),
                        }
                        all_discogs.append(flat_item)

discogs_df = pd.DataFrame(all_discogs)

#Cleaning strings
discogs_df.dropna(subset=['track_title', 'artist', 'release_year'], inplace=True)
discogs_df['track_title'] = discogs_df['track_title'].str.lower().str.strip()
discogs_df['artist'] = discogs_df['artist'].str.lower().str.strip()
discogs_df['release_year'] = discogs_df['release_year'].astype(str).str.strip()

In [6]:
#Collecting all MusicBrainz data into one list
##note: this takes ~5 minutes to run!
all_mb = []

for year in years:
    mb_path = os.path.join(data_path, year, 'musicbrainz')
    if os.path.exists(mb_path):
        for file in os.listdir(mb_path):
            if file.endswith('.json'):
                with open(os.path.join(mb_path, file), 'r') as f:
                    data = json.load(f)
                    releases = data.get('releases', [])
                    for item in releases:
                        artist_credit = item.get('artist-credit', [])
                        artist_name = artist_credit[0]['name'] if artist_credit else ''
                        flat_item = {
                            'musicbrainz_id': item.get('id'),
                            'track_title': item.get('title'),
                            'artist': artist_name,
                            'release_year': str(item.get('date')),
                            'country': item.get('country'),
                        }
                        all_mb.append(flat_item)

musicbrainz_df = pd.DataFrame(all_mb)

#Cleaning strings
musicbrainz_df.dropna(subset=['track_title', 'artist', 'release_year'], inplace=True)
musicbrainz_df['track_title'] = musicbrainz_df['track_title'].str.lower().str.strip()
musicbrainz_df['artist'] = musicbrainz_df['artist'].str.lower().str.strip()
musicbrainz_df['release_year'] = musicbrainz_df['release_year'].astype(str).str.strip()

In [2]:
##DO NOT NEED TO RUN THIS IF THE "MERGED_FUZZY_MATCHES.CSV" FILE ALREADY EXISTS IN THE GOOGLE DRIVE!
#This takes HOURS to run, so I'm saving the output to google Drive
!pip install rapidfuzz
from rapidfuzz import process, fuzz
from tqdm import tqdm

#Making a combo column to match on: artist + track title + release year
discogs_df['match_key'] = discogs_df['artist'] + " - " + discogs_df['track_title'] + " - " + discogs_df['release_year']
musicbrainz_df['match_key'] = musicbrainz_df['artist'] + " - " + musicbrainz_df['track_title'] + " - " + musicbrainz_df['release_year']

#Dropping duplicate match keys from MusicBrainz so I don't get index errors when I turn it into a dictionary
musicbrainz_df = musicbrainz_df.drop_duplicates(subset=['match_key'])

#Turning MusicBrainz into a lookup dictionary so it's faster to match
mb_lookup = musicbrainz_df.set_index('match_key').to_dict('index')

#Looping through all the Discogs match keys and trying to find a close match in MusicBrainz only keeping matches with at least a 90 similarity
matches = []
threshold = 90

print("Running fuzzy matching on artist + track title + release year...")
for key in tqdm(discogs_df['match_key']):
    match, score, _ = process.extractOne(key, mb_lookup.keys(), scorer=fuzz.token_sort_ratio)
    if score >= threshold:
        matches.append({
            'discogs_match_key': key,
            'musicbrainz_match_key': match,
            'similarity_score': score
        })

#Turning the matches into a DataFrame and merging back in the original data from both sources
matches_df = pd.DataFrame(matches)
print(f"\nFound {len(matches_df)} fuzzy matches with score >= {threshold}")

#Merging in the original data for each match key from both Discogs and MusicBrainz
merged_fuzzy = matches_df.merge(discogs_df, left_on='discogs_match_key', right_on='match_key', suffixes=('', '_discogs'))
merged_fuzzy = merged_fuzzy.merge(musicbrainz_df, left_on='musicbrainz_match_key', right_on='match_key', suffixes=('_discogs', '_musicbrainz'))

#Saving to CSV so I don’t lose the results after this long-ass fuzzy match run time hahaha
merged_fuzzy.to_csv("/content/drive/My Drive/music_anthro_M2/merged_fuzzy_matches.csv", index=False)
print("Saved fuzzy matched results to merged_fuzzy_matches.csv")

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

ModuleNotFoundError: No module named 'rapidfuzz'

In [9]:
#investigating the newly saved fuzzy_matches csv
fuzzy_matches = pd.read_csv('/content/drive/My Drive/music_anthro_M2/merged_fuzzy_matches.csv')
print(f"Shape: {fuzzy_matches.shape}")
print("\nColumns:")
print(fuzzy_matches.columns.tolist())
fuzzy_matches.head()

#Droping rows with missing critical info
cleaned = fuzzy_matches.dropna(subset=['artist_discogs', 'artist_musicbrainz', 'track_title_discogs', 'track_title_musicbrainz'])
cleaned.head()


Shape: (9002, 19)

Columns:
['discogs_match_key', 'musicbrainz_match_key', 'similarity_score', 'discogs_id', 'track_title_discogs', 'country_discogs', 'release_year_discogs', 'format', 'label', 'genre', 'style', 'artist_discogs', 'match_key_discogs', 'musicbrainz_id', 'track_title_musicbrainz', 'artist_musicbrainz', 'release_year_musicbrainz', 'country_musicbrainz', 'match_key_musicbrainz']


Unnamed: 0,discogs_match_key,musicbrainz_match_key,similarity_score,discogs_id,track_title_discogs,country_discogs,release_year_discogs,format,label,genre,style,artist_discogs,match_key_discogs,musicbrainz_id,track_title_musicbrainz,artist_musicbrainz,release_year_musicbrainz,country_musicbrainz,match_key_musicbrainz


In [14]:
csv_path = "/content/drive/My Drive/music_anthro_M2/merged_fuzzy_matches.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

(9002, 19)


Unnamed: 0,discogs_match_key,musicbrainz_match_key,similarity_score,discogs_id,track_title_discogs,country_discogs,release_year_discogs,format,label,genre,style,artist_discogs,match_key_discogs,musicbrainz_id,track_title_musicbrainz,artist_musicbrainz,release_year_musicbrainz,country_musicbrainz,match_key_musicbrainz
0,- death cigarettes - bleed you dry - 2007,death cigarettes - bleed you dry - 2007,97.5,2949876,death cigarettes - bleed you dry,UK,2007,"CDr, Single","20Nothing, Copyright Control",Electronic,Electro,,- death cigarettes - bleed you dry - 2007,e7a8c508-8a87-4bf5-84f4-d9d322d47d99,bleed you dry,death cigarettes,2007,GB,death cigarettes - bleed you dry - 2007
1,- messiah j & the expert - place your bets - ...,messiah j & the expert - place your bets - 2007,97.916667,2096679,messiah j & the expert - place your bets,USA & Canada,2007,"File, AAC, Single",Inaudible Records,Hip Hop,,,- messiah j & the expert - place your bets - ...,14039f3a-eac3-44de-8756-d8456a075e93,place your bets,messiah j & the expert,2007,US,messiah j & the expert - place your bets - 2007
2,- arbon - il pleut au paradis - 2007,arbon - il pleut au paradis - 2007,97.142857,1937992,arbon - il pleut au paradis,France,2007,"CD, Album",P&PP,Pop,Chanson,,- arbon - il pleut au paradis - 2007,160af93b-37f5-493a-a25e-50665f589609,il pleut au paradis,arbon,2007,FR,arbon - il pleut au paradis - 2007
3,- mr.gabber - krutovláda nad ostrovem bez pom...,mr.gabber - krutovláda nad ostrovem bez pomoci...,92.982456,1147132,mr.gabber - krutovláda nad ostrovem bez pomoci,Czech Republic,2007,"File, MP3",Krabator-Bass-Systems Records,Electronic,"Rhythmic Noise, Speedcore",,- mr.gabber - krutovláda nad ostrovem bez pom...,043d400c-8153-483e-98fd-99b7b4984c20,krutovláda nad ostrovem bez pomoci,mr.gabber,2007-11-24,CZ,mr.gabber - krutovláda nad ostrovem bez pomoci...
4,- sieghai - kamikatze - 2007,sieghai - kamikatze - 2007,96.296296,1082955,sieghai - kamikatze,Netherlands,2007,"File, MP3",Dadaist Audio,Electronic,"Breakbeat, Techno, Acid, Electro",,- sieghai - kamikatze - 2007,7ee37d3a-b1a7-44a4-8ba3-1252e455279e,kamikatze,sieghai,2007,DE,sieghai - kamikatze - 2007


In [15]:
##Ran the Genius lyrics API locally on my computer next because it was blocked on Colab
#Loading in the updated Discogs and MusicBrianz data with lyrics file and filtering to include only english lyrics


!pip install langdetect
import pandas as pd

#Load file in from Google Drive
file_path = "/content/drive/My Drive/music_anthro_M2/merged_fuzzy_matches_with_lyrics.csv"
df = pd.read_csv(file_path)
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

DetectorFactory.seed = 0

#Creating a column for language
langs = []
for text in tqdm(df['lyrics'].fillna('')):
    try:
        langs.append(detect(text))
    except LangDetectException:
        langs.append("unknown")

df['detected_language'] = langs
#Keeping only English lyrics
english_df = df[df['detected_language'] == 'en'].copy()
#Checking results
number_of_english_lyrics = len(english_df)
print(f"Number of English lyrics: {number_of_english_lyrics}")

english_df.to_csv("/content/drive/My Drive/music_anthro_M2/Discogs_and_musicbrainz_english_lyrics_only.csv", index=False)





100%|██████████| 9002/9002 [00:11<00:00, 786.90it/s]


Number of English lyrics: 2024


###Feature extraction/setiment analysis using Vader

In [20]:
!pip install vadersentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Initializing the lyric sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

#Applying sentiment scoring
sentiment_scores = english_df['lyrics'].fillna('').apply(analyzer.polarity_scores)

#Adding compound scores to a new column
english_df['vader_compound'] = sentiment_scores.apply(lambda x: x['compound'])
english_df['vader_pos'] = sentiment_scores.apply(lambda x: x['pos'])
english_df['vader_neu'] = sentiment_scores.apply(lambda x: x['neu'])
english_df['vader_neg'] = sentiment_scores.apply(lambda x: x['neg'])

#Save with sentiment scores added
english_df.to_csv("/content/drive/My Drive/music_anthro_M2/english_lyrics_with_sentiment.csv", index=False)


Sentiment scores added and saved to: english_lyrics_with_sentiment.csv


Feature Extraction/Topic Modeling using LDA

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np

#Vectorizing the lyrics column for LDA
vectorizer = CountVectorizer(
    max_df=0.85,#Ignore words that appear in more than 85% of songs
    min_df=1,#Ignore words that appear in fewer than 1 song
    stop_words='english'
)
X = vectorizer.fit_transform(english_df['lyrics'].fillna(''))

#Fitting the LDA model
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_matrix = lda.fit_transform(X)

#Assigning each song to the most dominant topic
english_df['dominant_topic'] = np.argmax(lda_matrix, axis=1)

# Saving the topic distribution per document for multi-topic analysis
topic_df = pd.DataFrame(lda_matrix, columns=[f"topic_{i}" for i in range(n_topics)])
english_df = pd.concat([english_df.reset_index(drop=True), topic_df.reset_index(drop=True)], axis=1)

#Saving the output to Google Drive
english_df.to_csv("/content/drive/My Drive/music_anthro_M2/Discogs_musicbrainz_with_topics.csv", index=False)


Topic modeling complete. Added dominant topic and topic proportions to dataset.
