In [1]:
import os
import time

import pandas as pd
import requests
from sklearn.preprocessing import LabelEncoder

# Load datasets
moody_lyrics_path = 'datasets/MoodyLyrics4Q.csv'
allmusic_lyrics_path = 'datasets/Dataset-AllMusic-771Lyrics.csv'

moody_lyrics = pd.read_csv(moody_lyrics_path)
allmusic_lyrics = pd.read_csv(allmusic_lyrics_path, encoding='latin1')

# Remove index columns. errors='ignore' turns the drop into a no-op when the
# column is absent, which is what the explicit membership test used to guard.
moody_lyrics = moody_lyrics.drop(columns=['index'], errors='ignore')
allmusic_lyrics = allmusic_lyrics.drop(columns=['Name'], errors='ignore')
print("Index columns removed from datasets.")

# Replace "_" with " " in Dataset-AllMusic-771, then collapse every run of
# spaces to a single one. A plain '  ' -> ' ' pass only halves a run, so three
# or more adjacent underscores/spaces would still leave a double space behind.
allmusic_lyrics = allmusic_lyrics.replace('_', ' ', regex=True)
allmusic_lyrics = allmusic_lyrics.replace(' {2,}', ' ', regex=True)

# Map the quadrant codes to mood names. The result is assigned back to the
# column: calling replace(..., inplace=True) on a column selection is
# deprecated in pandas 2.x and no longer updates the frame under copy-on-write.
classification_replacements = {'Q1': 'happy', 'Q2': 'angry', 'Q3': 'sad', 'Q4': 'relaxed'}
allmusic_lyrics['Classification'] = allmusic_lyrics['Classification'].replace(classification_replacements)
print("Classification replacements done: Q1 = happy, Q2 = angry, Q3 = sad, Q4 = relaxed.")

# Standardize column names. Only the AllMusic frame is renamed; moody_lyrics is
# assumed to already use title/artist/mood (the rename previously applied to it
# mapped each of those names to itself, i.e. did nothing).
allmusic_lyrics = allmusic_lyrics.rename(columns={'Title': 'title', 'Artist': 'artist', 'Classification': 'mood'})
print("Column names were standardized.")

# Merge datasets vertically; ignore_index gives the result a fresh 0..n-1 index
merged_dataset = pd.concat([moody_lyrics, allmusic_lyrics], ignore_index=True)
print("Datasets merged into a unique dataset.")

# Drop every row that has a NaN in any column and report how many were lost
initial_row_count = len(merged_dataset)
merged_dataset.dropna(inplace=True)
nan_removed_count = initial_row_count - len(merged_dataset)
print(f"Removed {nan_removed_count} rows containing NaN values.")

# Drop rows that are exact duplicates across all columns and report the count
initial_row_count = len(merged_dataset)
merged_dataset.drop_duplicates(inplace=True)
duplicates_removed_count = initial_row_count - len(merged_dataset)
print(f"Removed {duplicates_removed_count} duplicate rows.")

# Preserve the textual mood in 'original_mood', then overwrite 'mood' with the
# integer codes produced by the fitted encoder (kept in `le` for later decoding)
le = LabelEncoder()
merged_dataset['original_mood'] = merged_dataset['mood']
le.fit(merged_dataset['mood'])
merged_dataset['mood'] = le.transform(merged_dataset['mood'])
print("Categorical data converted to numerical.")

# Function to get lyrics from Musixmatch
def _musixmatch_get(url, params, timeout):
    """Issue one Musixmatch GET and return the response ``body`` dict, or None on any failure."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
        message = response.json()['message']
        # The API status is carried inside the JSON payload, so it is checked
        # there rather than on the HTTP response.
        if message['header']['status_code'] != 200:
            return None
        body = message['body']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network failure, non-JSON response or an unexpected payload shape.
        return None
    return body if isinstance(body, dict) else None


def get_lyrics(song_name, artist_name, api_keys, timeout=10):
    """Search Musixmatch for a track and return the lyrics of the first match.

    The keys in ``api_keys`` are tried in order. A key is abandoned, and the
    next one tried, whenever a request fails, the API reports a status other
    than 200, the search returns no tracks, or the lyrics body is empty.

    Args:
        song_name: Track title passed as ``q_track``.
        artist_name: Artist name passed as ``q_artist``.
        api_keys: Iterable of Musixmatch API keys.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot block the whole run.

    Returns:
        The ``lyrics_body`` string of the first search result, or None when no
        key yields non-empty lyrics (including when ``api_keys`` is empty).
    """
    base_url = "https://api.musixmatch.com/ws/1.1/"
    search_url = f"{base_url}track.search"
    lyrics_url = f"{base_url}track.lyrics.get"

    for api_key in api_keys:
        # Step 1: find candidate tracks that are flagged as having lyrics.
        search_params = {
            'q_track': song_name,
            'q_artist': artist_name,
            'apikey': api_key,
            'f_has_lyrics': 1
        }
        search_body = _musixmatch_get(search_url, search_params, timeout)
        track_list = search_body.get('track_list') if search_body else None
        if not track_list:
            continue

        # Only the first search result is used.
        try:
            track_id = track_list[0]['track']['track_id']
        except (KeyError, IndexError, TypeError):
            continue

        # Step 2: fetch the lyrics for that track.
        lyrics_params = {
            'track_id': track_id,
            'apikey': api_key
        }
        lyrics_body = _musixmatch_get(lyrics_url, lyrics_params, timeout)
        lyrics_entry = lyrics_body.get('lyrics') if lyrics_body else None
        lyrics = lyrics_entry.get('lyrics_body') if isinstance(lyrics_entry, dict) else None

        # An empty body is treated as "no lyrics" so callers checking for None
        # do not keep rows with blank text.
        if lyrics:
            return lyrics

    return None

# Fetching lyrics for each song
def load_api_keys(env=None, var_name='MUSIXMATCH_API_KEYS'):
    """Return the API keys listed, comma-separated, in an environment variable.

    Args:
        env: Mapping to read from; defaults to ``os.environ``.
        var_name: Name of the variable holding the comma-separated keys.

    Returns:
        A list of the non-empty, whitespace-stripped keys, in the order given.
        The list is empty when the variable is unset or blank.
    """
    source = os.environ if env is None else env
    raw_value = source.get(var_name, '')
    return [key.strip() for key in raw_value.split(',') if key.strip()]


# Credentials come from the environment so they are never stored in the
# notebook. NOTE(review): the keys that used to be hard-coded here have been
# exposed in the file history and should be rotated.
api_key = load_api_keys()
if not api_key:
    print("WARNING: no Musixmatch API keys found; set MUSIXMATCH_API_KEYS "
          "(comma-separated) before fetching lyrics.")

# Running tally of songs for which no lyrics could be retrieved
missing_lyrics_count = 0

def fetch_lyrics(row):
    """Return the lyrics for one dataset row, counting the rows that have none.

    Args:
        row: Mapping-like object with 'title' and 'artist' entries (a DataFrame
            row when used through ``apply(axis=1)``).

    Returns:
        Whatever ``get_lyrics`` returns for the row; None when nothing was
        found, in which case a message is printed and the module-level
        ``missing_lyrics_count`` is incremented.
    """
    global missing_lyrics_count
    title, artist = row['title'], row['artist']
    found = get_lyrics(title, artist, api_key)
    if found is not None:
        return found
    print(f"No lyrics found for '{title}' by {artist}")
    missing_lyrics_count += 1
    return None

# Look up lyrics row by row; fetch_lyrics also tallies the misses in
# missing_lyrics_count
merged_dataset['lyrics'] = merged_dataset.apply(fetch_lyrics, axis=1)
initial_row_count = len(merged_dataset)
print("Lyrics fetched and added to the dataset.")
print(f"Total songs in dataset: {initial_row_count}")
print(f"Total songs without lyrics: {missing_lyrics_count}")

# Keep only the songs whose lyrics were retrieved. initial_row_count still
# holds the pre-filter size computed above.
merged_dataset = merged_dataset.loc[merged_dataset['lyrics'].notna()]
lyrics_removed_count = initial_row_count - len(merged_dataset)
print(f"Removed {lyrics_removed_count} rows without lyrics.")
print(f"Total songs in dataset: {len(merged_dataset)}")

# Data Cleaning: final pass dropping any row still missing lyrics or mood
merged_dataset.dropna(subset=['lyrics', 'mood'], inplace=True)

# Preview the first rows of the result
display(merged_dataset.head())

# Persist the cleaned dataset, without the DataFrame index
merged_dataset.to_csv('cleaned_dataset_with_lyrics.csv', index=False)
print("Cleaned dataset with lyrics saved to 'cleaned_dataset_with_lyrics.csv'.")

Index columns removed from datasets.
Classification replacements done: Q1 = happy, Q2 = angry, Q3 = sad, Q4 = relaxed.
Column names were standardized.
Datasets merged into a unique dataset.
Removed 0 rows containing NaN values.
Removed 197 duplicate rows.
Categorical data converted to numerical.
No lyrics found for '12' by Katatonia
No lyrics found for 'Jungle Drums' by Emiliana Torrini
No lyrics found for 'Two Of Those Too (Album Version)' by Maria Taylor
No lyrics found for 'Everybody Knows That You Are Insane' by Queens Of The Stone Age
No lyrics found for 'Blue Ridge Mountains' by Haley Bonar
No lyrics found for 'Sigma' by Secret Garden
No lyrics found for 'Thick As Thieves (2005 Version)' by Natalie Merchant
No lyrics found for 'Insomnia (Revised Album Version)' by Tyler Hilton
No lyrics found for 'Love Thing' by Spice Girls
No lyrics found for 'Say Yeah' by Acoustic Alchemy
No lyrics found for 'Dates From Hell' by Remi Nicole
No lyrics found for 'Bette Davis Eyes (Acoustic Versio

Unnamed: 0,artist,title,mood,original_mood,lyrics
0,George Michael,I Want Your Sex,1,happy,There's things that you guess\nAnd things that...
1,Rob Zombie,Pussy Liquor,0,angry,Earl had a baby\nBaby was her name\nHe knew sh...
3,Bing Crosby,Swinging On A Star,1,happy,Would you like to swing on a star\nCarry moonb...
4,Ludacris,Get Back,0,angry,"Heads up (woop, woop)\nHeads up (woop, woop)\n..."
5,Hole,Violet,0,angry,And the sky was made of amethyst\nAnd all the ...


Cleaned dataset with lyrics saved to 'cleaned_dataset_with_lyrics.csv'.
