Connect to Google Drive.

In [None]:
# Mount Google Drive at /content/drive; later cells read their inputs from, and write their outputs to, paths under it.
from google.colab import drive
drive.mount('/content/drive')

Install all libraries needed for the **Create Lyrics-Metadata Dataset**. The usage of the downloaded libraries is as follows:

1. `pyphen` - split words into syllables.
2. `pronouncing` - retrieves the phonetic spelling of words as per the CMU Pronouncing Dictionary.
3. `langdetect` - detect language of lyrics.
4. `musicbrainzngs` - originally intended for retrieving song length, but the API malfunctioned at the time. `nested-lookup` was needed to search for the `length` key in the JSON response.
5. `spotipy` - call the Spotify API to retrieve song metadata.


In [None]:
# Install the third-party packages described above. No versions are pinned, so each run installs whatever release is current.
!pip install pyphen
!pip install pronouncing
!pip install langdetect
!pip install musicbrainzngs
!pip install nested-lookup
!pip install spotipy

In [None]:
import os, glob, requests, uuid, json, re, random, itertools, math
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# Line magics must be written without a space after '%'; recent IPython releases reject "% matplotlib".
%matplotlib inline
import seaborn as sns

import pyphen
import musicbrainzngs as mb
from nested_lookup import nested_lookup
from langdetect import detect
# Register a user agent with musicbrainzngs (app name 'app', version 1).
mb.set_useragent('app', 1)
import spotipy as spotify
from spotipy.oauth2 import SpotifyClientCredentials
# NOTE: the assignment below rebinds `spotify` from the spotipy module to a client instance;
# `determine_genre` further down calls `spotify.search` on that instance.
# The two credential strings are placeholders and have to be replaced before the cell is run.
spotify = spotify.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id='your client id', 
                                                                              client_secret='your client secret'))

# Processing

## Load Original Data

In [None]:
# Read the labelled train and test splits from the mounted Drive folder.
df_train = pd.read_csv("/content/drive/My Drive/Lyrics-Genre-Train.csv", delimiter=',')
df_test = pd.read_csv("/content/drive/My Drive/Lyrics-Genre-Test-GroundTruth.csv", delimiter=',')

# Discard the columns that are not used further on; the test split also loses its song and artist names.
df_train = df_train.drop(columns=['Song year', 'Track_id'])
df_test = df_test.drop(columns=['Song year', 'Track_id', 'Song', 'Artist'])

# Cast the lyrics to plain strings and turn every line break into a single space.
df_train['Lyrics'] = df_train['Lyrics'].astype(str).map(lambda text: text.replace('\n', ' '))
df_test['Lyrics'] = df_test['Lyrics'].astype(str).map(lambda text: text.replace('\n', ' '))

# Replace the hyphens in song and artist names with spaces.
df_train['Song'] = df_train['Song'].map(lambda name: name.replace('-', ' '))
df_train['Artist'] = df_train['Artist'].map(lambda name: name.replace('-', ' '))

print(df_train.shape, df_test.shape)

# Per-genre sample counts of both splits, one row per split.
genre_counts = pd.DataFrame(
    data=[df_train['Genre'].value_counts(), df_test['Genre'].value_counts()],
    columns=['Rock', 'Pop', 'Hip-Hop', 'Country', 'Metal',
             'Jazz', 'Electronic', 'R&B', 'Indie', 'Folk'],
    index=['Train Data Distribution', 'Test Data Distribution'],
)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(genre_counts)

## Enrich Data

We used several Kaggle lyrics datasets in order to augment our original dataset. The datasets in question are [150K Lyrics Labeled with Spotify Valence](https://www.kaggle.com/edenbd/150k-lyrics-labeled-with-spotify-valence), [dataset lyrics musics](https://www.kaggle.com/italomarcelo/dataset-lyrics-musics) and [AZLyrics song lyrics](https://www.kaggle.com/albertsuarez/azlyrics). Another dataset which would've been interesting to integrate into the project is the [Million Song Dataset](https://www.kaggle.com/c/msdchallenge/data), but I figured I wouldn't realistically have the time to restructure, label and merge this dataset with the original data and the three previously mentioned datasets.

The Kaggle datasets, however, do not come in a useful format for us. Namely, they don't have a `Genre` column. The reasons why we chose these three datasets are:

1. They each number over 150.000 samples, enough to train a BERT model properly.
2. They contain `Artist` and `Song` features, which will help us filter out duplicates when we merge the 3 datasets. 

In order to deal with the lack of `Genre` labelling, we have built our own labelling function using the `spotipy` library, which uses the **Spotify API** in order to retrieve the genre of an `Artist`.

Please note that the Spotify API returns a list of genres for one artist, so we consider the most common genre to be said artist's dominant genre.

Additionally, the AZLyrics data was badly encoded, namely the column delimiter character, the comma, was also used as a verse delimiter in the `Lyrics` column. Fortunately, the dataset comes with two URL columns that conveniently separate the `Artist`, `Song` and `Lyrics` columns, so with a bit of regex magic we were able to extract the useful data using `https://` as a delimiter.

On a last note, I used Nakatani Shuyo's [langdetect](https://pypi.org/project/langdetect/) library to automatically label the lyrics with a language.

#### The dataset resulting from merging the original data with the three scraped datasets has over 290.000 unique train samples.


In [None]:
import zipfile

# Unpack each of the three archives stored on Drive into /content/.
for archive_path in ('/content/drive/MyDrive/dataset-lyrics-musics.csv.zip',
                     '/content/drive/MyDrive/labeled_lyrics_cleaned.csv.zip',
                     '/content/drive/MyDrive/archive.zip'):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('/content/')

In [None]:
def determine_genre(row):
    """Return the dominant genre of the row's artist, looked up through the Spotify API.

    The artist name (hyphens replaced by spaces) is searched with the
    module-level `spotify` client. Every genre tag of the first search hit is
    compared against the candidate genres below; a candidate scores one point
    for each tag that contains its name. The candidate with the highest score
    wins, ties going to the candidate listed first.

    Matching is case-insensitive and treats '-' and ' ' as the same character,
    so the 'Hip-Hop' candidate also matches tags spelled "hip hop".
    Note that 'Country' is not among the candidates.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an 'Artist' entry.

    Returns:
        One of the candidate genre names, or the string 'NaN' when the artist
        value is not a usable string, the search has no hits, or no tag
        matches any candidate.
    """
    artist = row['Artist']
    # A missing artist (e.g. a float NaN) would otherwise abort a whole DataFrame.apply run.
    if not isinstance(artist, str) or not artist.strip():
        return 'NaN'

    artist_name = ' '.join(artist.split('-'))
    possible_genres = {'Rock': 0, 'Metal': 0, 'Folk': 0, 'Jazz': 0, 'Indie': 0, 
                       'Pop': 0, 'Hip-Hop': 0, 'R&B':0, 'Electronic': 0}

    results = spotify.search(q='artist:' + artist_name, type='artist')
    items = results['artists']['items']
    if not items:
        return 'NaN'

    # Only the first (best) hit is considered.
    for tag in items[0]['genres']:
        normalized_tag = tag.lower().replace('-', ' ')
        for key in possible_genres:
            if key.lower().replace('-', ' ') in normalized_tag:
                possible_genres[key] += 1

    if not any(possible_genres.values()):
        return 'NaN'
    return max(possible_genres, key=possible_genres.get)

def extract_song(elem):
    """Return the text between the first `,"` and the last `",` found in `elem`.

    Raises TypeError when `elem` contains no such pair.
    """
    found = re.search(',"(.*)",', elem)
    # The capture group is exactly the match without its two-character delimiters.
    return found[1]

def extract_lyrics(elem):
    """Return what follows the first `html","` marker in `elem`, minus the final character.

    The marker must be preceded by one arbitrary character (the pattern's
    leading `.` is a wildcard). Raises TypeError when the marker is absent.
    """
    found = re.search('.html","(.*)', elem)
    remainder = found[1]
    # Drop the last character (the closing quote in the scraped rows).
    return remainder[:-1]

def restructure_azlyrics_data(pattern='/content/azlyrics-scraper/*'):
    """Parse the scraped AZLyrics CSV files into one Artist/Song/Lyrics frame.

    Each file is split on the literal text 'https://' instead of on commas.
    A data row then has three fields while the header line has only one, so
    pandas uses the first two fields as a row index: field one carries the
    artist, field two the song title and the remaining column the lyrics.

    Args:
        pattern: glob pattern selecting the files to parse. The default is the
            folder this notebook extracts the AZLyrics archive into.

    Returns:
        DataFrame with the columns 'Artist', 'Song' and 'Lyrics' and a fresh
        0..n-1 index. ', ' inside the lyrics is replaced by a line break.
        The frame is empty when no file matches `pattern`.
    """
    columns = ["Artist", "Song", "Lyrics"]
    # A multi-character separator is only supported by the python engine; quoting=3 is csv.QUOTE_NONE.
    read_options = dict(delimiter='https://', engine='python', quoting=3)
    frames = []

    for name in sorted(glob.glob(pattern)):
        try:
            df_temp = pd.read_csv(name, on_bad_lines='skip', **read_options)
        except TypeError:
            # pandas < 1.3 only knows the older spelling of the same option.
            df_temp = pd.read_csv(name, error_bad_lines=False, **read_options)

        # Index entries are (artist_field, song_field) pairs; the single data column holds the lyrics.
        artists = [key[0][1:-3] for key in df_temp.index]
        songs = [extract_song(key[1]) for key in df_temp.index]
        lyrics = [extract_lyrics(text).replace(', ', '\n') for text in df_temp.iloc[:, 0]]
        frames.append(pd.DataFrame({'Artist': artists, 'Song': songs, 'Lyrics': lyrics}))

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once (instead of once per file) and renumber the rows so index labels are unique.
    return pd.concat(frames, axis=0, ignore_index=True)

#### Read and Prepare Data

In [None]:
# Load the two comma-separated Kaggle files; `header=0` together with `names` replaces their own header row.
df1 = pd.read_csv('/content/dataset-lyrics-musics.csv', header=0, names=['ArtistID', 'Artist', 'Song', 'Lyrics'])
df2 = pd.read_csv('/content/labeled_lyrics_cleaned.csv', header=0, names=['ArtistID', 'Artist', 'Lyrics', 'Song', 'Label'])
# The AZLyrics loader defined in this notebook is `restructure_azlyrics_data`;
# the name called here before did not exist and raised a NameError.
df3 = restructure_azlyrics_data()
# Keep only the columns shared by all three datasets.
df1.drop(['ArtistID'], axis=1, inplace=True)
df2.drop(['ArtistID', 'Label'], axis=1, inplace=True)
print(df1.shape, df2.shape, df3.shape)

In [None]:
# Label every scraped dataset with a genre (one Spotify request per row, so this cell is slow),
# discard the rows that could not be labelled and cache the result on Drive.
# Rows are filtered with a boolean mask rather than by index label, so the filter is always
# computed on the frame it is applied to and also works when index labels repeat.
df1['Genre'] = df1.apply(determine_genre, axis=1)
df1 = df1[df1['Genre'] != 'NaN'].copy()
df1.to_csv('/content/drive/MyDrive/df1.csv', index=False)

# Previously the unlabelled rows of df2 were looked up in df1, which left them in place.
df2['Genre'] = df2.apply(determine_genre, axis=1)
df2 = df2[df2['Genre'] != 'NaN'].copy()
df2.to_csv('/content/drive/MyDrive/df2.csv', index=False)

# The next cell reads df3.csv and uses its 'Genre' column, so df3 has to be labelled and saved as well.
df3['Genre'] = df3.apply(determine_genre, axis=1)
df3 = df3[df3['Genre'] != 'NaN'].copy()
df3.to_csv('/content/drive/MyDrive/df3.csv', index=False)

In [None]:
# Reload the labelled datasets that were cached on Drive.
df1 = pd.read_csv('/content/drive/MyDrive/df1.csv', header=0)
df2 = pd.read_csv('/content/drive/MyDrive/df2.csv', header=0)
df3 = pd.read_csv('/content/drive/MyDrive/df3.csv', header=0)

# Steps shared by all three datasets: lyrics as plain strings, song titles in lower case.
for frame in (df1, df2, df3):
    frame['Lyrics'] = frame['Lyrics'].astype(str)
    frame['Song'] = frame['Song'].map(lambda title: title.lower())

# Dataset 1: hyphens in artist names become spaces, '. ' in the lyrics becomes a line break.
df1['Artist'] = df1['Artist'].map(lambda name: name.replace('-', ' '))
df1['Lyrics'] = df1['Lyrics'].map(lambda text: text.replace('. ', '\n'))

# Datasets 2 and 3: artist names in lower case, carriage returns removed from the lyrics.
for frame in (df2, df3):
    frame['Artist'] = frame['Artist'].map(lambda name: name.lower())
    frame['Lyrics'] = frame['Lyrics'].map(lambda text: text.replace('\r', ''))

# Per-genre sample counts, one row per dataset.
genre_counts = pd.DataFrame(
    data=[df1['Genre'].value_counts(),
          df2['Genre'].value_counts(),
          df3['Genre'].value_counts()],
    columns=['Rock', 'Pop', 'Hip-Hop', 'Country', 'Metal',
             'Jazz', 'Electronic', 'R&B', 'Indie', 'Folk'],
    index=['Dataset 1 Distribution', 'Dataset 2 Distribution', 'Dataset 3 Distribution'],
)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(genre_counts)

#### Merge datasets

In [None]:
# Merge the original training data with the three scraped datasets, keeping one row per (artist, song).
#
# The previous row-by-row version tested `value not in series`, which looks the value up in the
# Series *index* (row labels), not in its values, so every row passed and nothing was deduplicated.
# It also compared against df2 in two places where df3 was meant, and it checked artist and song
# independently instead of as a pair.
merge_columns = ['Artist', 'Song', 'Genre', 'Lyrics']

# df_train goes first: with keep='first' its genre label wins over a Spotify-derived label
# whenever the same song also appears in a scraped dataset.
df_merged = pd.concat(
    [df_train[merge_columns], df1[merge_columns], df2[merge_columns], df3[merge_columns]],
    ignore_index=True,
)

# Compare artist/song pairs case-insensitively and without surrounding whitespace.
dedup_key = pd.DataFrame({
    'artist': df_merged['Artist'].astype(str).str.lower().str.strip(),
    'song': df_merged['Song'].astype(str).str.lower().str.strip(),
})

df_train = df_merged.loc[~dedup_key.duplicated(keep='first')].reset_index(drop=True)

#### Add Language Feature

In [None]:
def add_language(df, max_words=500):
    """Add a 'Language' column holding the detected language of each row's lyrics.

    The lyrics are lower-cased, line breaks become spaces, every character
    that is neither a word character nor whitespace is removed, and the first
    `max_words` words are passed to `detect`.

    Args:
        df: DataFrame with a 'Lyrics' column. It is modified in place.
        max_words: number of leading words handed to the detector.

    Returns:
        The same `df` object, now carrying the 'Language' column. Rows whose
        lyrics cannot be processed or detected get the string 'nan'.

    Raises:
        KeyError: if `df` has no 'Lyrics' column.
    """
    lang = []

    for text in df['Lyrics']:
        try:
            lyrics = ' '.join(text.lower().split('\n'))
            lyrics = re.sub(r'[^\w\s]', '', lyrics) 
            lang.append(detect(' '.join(lyrics.split()[:max_words])))
        except Exception:
            # Best effort: non-string or undetectable lyrics are labelled 'nan'.
            # Unlike a bare `except:`, this lets KeyboardInterrupt stop a long run.
            lang.append('nan')

    df['Language'] = lang

    return df

# Detect the language of every lyric, then write the merged training set to Drive
# with the columns in the order given by `newcols`.
df_train = add_language(df_train)
newcols = ['Artist', 'Song', 'Genre', 'Language', 'Lyrics']
df_train_new = df_train[newcols]
df_train_new.to_csv('/content/drive/MyDrive/train.csv', index=False)

In [None]:
# Print the per-genre sample counts of the merged training set as a one-row table;
# the display limits are lifted only inside this `with` block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(pd.DataFrame(data=[df_train['Genre'].value_counts()],
                       columns=['Rock', 'Pop', 'Hip-Hop', 'Country', 'Metal', 
                                'Jazz', 'Electronic', 'R&B', 'Indie', 'Folk'], 
                       index=['Train Data Distribution']))