# Setup

In [None]:
# Setup environment
!pip install lyricsgenius

Collecting lyricsgenius
  Downloading lyricsgenius-3.0.1-py3-none-any.whl (59 kB)
     |████████████████████████████████| 59 kB 2.1 MB/s 
Installing collected packages: lyricsgenius
Successfully installed lyricsgenius-3.0.1


In [None]:
import json
import os
import re

import lyricsgenius
import pandas as pd
import requests
from sklearn.model_selection import train_test_split

In [None]:
# Genius API key. Read it from the GENIUS_ACCESS_TOKEN environment variable so
# the secret is never stored in the notebook; falls back to an empty string
# when the variable is not set.
api_key = os.environ.get('GENIUS_ACCESS_TOKEN', '')

In [None]:
# Create the lyricsgenius client from api_key.
genius = lyricsgenius.Genius(api_key)
# NOTE(review): presumably songs whose titles match these terms are skipped
# when searching — confirm against the lyricsgenius documentation.
genius.excluded_terms = ["(Remix)", "(Live)"]

In [None]:
# Artists whose lyrics are collected and cleaned in this notebook.
artist_names = (
    'Future',
    'Freddie Gibbs',
    'Kendrick Lamar',
)

In [None]:
def load_artists(file_map):
    """Load one JSON file per artist.

    Args:
        file_map: Mapping of artist name to the path of that artist's JSON file.

    Returns:
        dict mapping each artist name to the parsed JSON content of its file.

    Raises:
        FileNotFoundError: If a listed file does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    artist_dict = dict()
    for artist_name, filename in file_map.items():
        # Explicit UTF-8 so decoding does not depend on the platform's default locale.
        with open(filename, 'r', encoding='utf-8') as f:
            artist_dict[artist_name] = json.load(f)
    # Without this return, callers receive None instead of the loaded data.
    return artist_dict

# Collection

In [None]:
for artist_name in artist_names:
    # Get artist object with all related song objects
    artist = genius.search_artist(artist_name)

    # A failed search yields no artist object; skip it rather than crashing
    # on the attribute access below and losing the remaining artists.
    if artist is None:
        print(f'No Genius result for {artist_name!r}; skipping.')
        continue

    # Save the lyrics
    artist.save_lyrics()

# Cleaning

In [None]:
# Map each artist name to its lyrics file: 'Lyrics_' + the name with spaces
# removed + '.json'.
# The f-string uses double quotes outside so the single-quoted arguments inside
# the replacement field parse on Python versions before 3.12 as well.
artist_name_to_file_map = {
    artist_name: f"Lyrics_{artist_name.replace(' ', '')}.json"
    for artist_name in artist_names
}

In [2]:
# Load the saved lyrics files listed in the artist-name -> filename map.
artist_dict = load_artists(file_map=artist_name_to_file_map)

## Cleaning helper

In [90]:
def clean_lyrics(song):
    """Convert one song record into a DataFrame with one row per lyric line.

    If the lyrics contain bracketed section tags (e.g. ``[Verse 1: Name]``),
    only the text following selected tags is kept:

    * when the song has no featured artists, every tagged section is kept;
    * otherwise only sections whose tag contains the song's artist name.

    Text before the first tag is always dropped in that case. If there are no
    tags at all, the whole lyrics text is used. In both cases the
    ``<digits>Embed`` residue and the ``"<title> Lyrics"`` header are removed,
    and blank lines are discarded.

    Args:
        song: dict with keys ``'artist'``, ``'title'`` and ``'lyrics'``;
            ``'featured_artists'`` is optional and treated as empty if absent.
            ``'lyrics'`` may be ``None`` or empty.

    Returns:
        DataFrame with columns ``lyric``, ``artist``, ``song``. When no lyric
        lines remain, an empty DataFrame with only the ``lyric`` column.
    """
    artist = song['artist']
    title = song['title']
    # Treat missing or None lyrics as empty text instead of failing in re.
    lyrics = song.get('lyrics') or ''

    # Raw strings: '\[' in a normal string literal is an invalid escape sequence.
    tag_pattern = r'\[.*\]'

    def strip_boilerplate(text):
        # Remove the "<digits>Embed" residue and the "<title> Lyrics" header.
        return re.sub(r'[0-9]*Embed', '', text).replace(f'{title} Lyrics', '')

    tags = re.findall(tag_pattern, lyrics)
    if tags:
        # With no featured artists, every tagged section belongs to the artist.
        artist_verse = not song.get('featured_artists')
        # re.split yields one more chunk than there are tags; pairing with a
        # leading empty tag aligns each chunk with the tag that precedes it.
        chunks = re.split(tag_pattern, lyrics)
        kept = [
            strip_boilerplate(chunk)
            for tag, chunk in zip(['', *tags], chunks)
            if artist in tag or (artist_verse and tag)
        ]
        lines = '\n'.join(kept).split('\n')
    else:
        lines = strip_boilerplate(lyrics).split('\n')

    lines = [line for line in lines if line]
    df = pd.DataFrame(lines, columns=['lyric'])
    if not df.empty:
        df['artist'] = artist
        df['song'] = title

    return df

## Transforming to DataFrame

In [91]:
# Clean every song first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
lyric_columns = ['artist', 'song', 'lyric']
song_frames = [
    clean_lyrics(song)
    for artist in artist_dict.values()
    for song in artist['songs']
]
# Songs that yield no lyric lines produce empty frames; leave them out.
song_frames = [frame for frame in song_frames if not frame.empty]

if song_frames:
    lyric_df = pd.concat(song_frames, ignore_index=True)[lyric_columns]
else:
    # No lyrics at all: keep the expected columns so later cells still work.
    lyric_df = pd.DataFrame(columns=lyric_columns)

## Split train and test data

In [None]:
# Start both splits as empty frames.
train_df, test_df = pd.DataFrame(), pd.DataFrame()

In [None]:
# Split each artist's lines 80/20 separately so every artist appears in both
# sets. The per-artist parts are collected and concatenated once, and
# train_df / test_df are rebuilt from scratch, so re-running this cell does
# not append duplicate rows.
train_parts, test_parts = [], []
for artist_name in artist_dict.keys():
    artist_train, artist_test = train_test_split(
        lyric_df[lyric_df.artist == artist_name],
        test_size=0.2,
        random_state=42,  # fixed seed so the split is reproducible
    )
    train_parts.append(artist_train)
    test_parts.append(artist_test)

# pd.concat raises on an empty list, so fall back to an empty frame.
train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

In [None]:
# Write both splits to CSV files in the current working directory.
for csv_path, split_df in (('train.csv', train_df), ('test.csv', test_df)):
    split_df.to_csv(csv_path)