# Lyrics Using Genius API

In [83]:
#import libraries
import pandas as pd
import lyricsgenius as genius #used to interface with Genius API
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#token provided by Genius API
%store -r client_access_token

In [3]:
#initiate Genius
genius = genius.Genius(client_access_token)
genius.verbose = False #turn off status messages

### Functions

In [4]:
def get_lyrics(track,artist):
    '''
    function returns song's lyrics
    parameters:
        track-->str
        artist-->str
    '''
    track = re.sub(' - .+','',track) #remove text after '-'
    track = re.sub(' \(.*\)','',track) #remove text within parentheses
    track = re.sub(' \[.*\]','',track) #remove text within brackets
    
    try:
        return genius.search_song(track,artist).lyrics
    except:
        print(track + ' by ' + artist + ' is not available')

In [5]:
def get_df_songs(track_list,artist_list):
    '''
    function obtains lyrics and returns dataframe with columns for track, artist, lyrics
    parameters:
        track_list-->list of str 
        artist_list-->list of str
    '''
    lyrics_list = [get_lyrics(track_list[x],artist_list[x]) for x in range(len(track_list))] #get lyrics for each song
    
    return pd.DataFrame(data={'track':track_list,'artist':artist_list,'lyrics':lyrics_list})

In [61]:
def clean_lyrics(df,col,new_col):
    '''
    function returns dataframe with new column of cleaned text (song lyrics)
    parameters:
        df-->pandas dataframe
        col-->column to clean (str)
        new_col-->name of column with cleaned text (str)
    '''
    df[new_col] = df[col].str.lower() #make all text lowercase
    df[new_col] = df[new_col].str.replace(r'\n',' ') #replace '\n' character with space
    df[new_col] = df[new_col].str.replace(r'\[[^\[\]]*]','') #remove brackets and inside text
    df[new_col] = df[new_col].str.replace(r"\'\w*",'').str.replace(r'[^\w\d\s]+','') #remove extra characters
    df[new_col] = df[new_col].str.strip() #remove extra whitespace
    
    return df

In [115]:
def normalize_lyrics(df,col):
    '''
    function returns dataframe with column as list of words
        tokenizes, removes stopwords from, and lemmatizes lyrics
    parameters:
        df-->pandas dataframe
        col-->column to normalize
    '''

    df[col] = df[col].str.split() #tokenize lyrics
    
    stop_words = set(stopwords.words('english'))
    df[col] = df[col].apply(lambda row: [w for w in row if w not in stop_words]) #remove stopwords

    lemmatizer = WordNetLemmatizer()
    
    def lemmatize_text(text):
        '''
        function returns lemmatized text
        parameters:
            text-->str
        '''
        return [lemmatizer.lemmatize(w) for w in text]
    
    df[col] = df[col].apply(lemmatize_text) #lemmatize words

    return df

### Import Songs to Analyze

Read in the resulting dataframes from the spotify_analysis notebook, which were created as follows:

 - Started with the top five tracks for each of country, R&B/hip-hop, and rock/alternative as of the week of May 15, 2021, based on Billboard Top 100 charts (referred to as the "seed tracks")
 - Used Spotify's recommender algorithm to find the most similar songs to the seed tracks (returns a maximum of 100 songs per search)
 - Ranked the most similar songs by audio features using Euclidean distance
 - Fed the top ranking songs through Spotify's recommender algorithm until there were at least 1,000 songs per genre

In [8]:
#country
df_cty = pd.read_csv('data/df_cty.csv')
df_cty.drop(columns='Unnamed: 0',inplace=True)

In [9]:
#R&B/hip-hop
df_rb = pd.read_csv('data/df_rb.csv')
df_rb.drop(columns='Unnamed: 0',inplace=True)

In [10]:
#rock/alternative
df_rock = pd.read_csv('data/df_rock.csv')
df_rock.drop(columns='Unnamed: 0',inplace=True)

### Obtain Lyrics from Genius API

Pull lyrics from the Genius API with the lyricsgenius wrapper and put into dataframes.

In [12]:
#country
df_cty_lyrics = get_df_songs(df_cty['track'],df_cty['artist'])

One Beer by HIXTAPE is not available
Ballin' by Rvshvd is not available
Gabrielle by Brett Eldredge is not available
Forever Begins Tonight by The McClymonts is not available
Tupelo Honey by JJ Grey & Mofro is not available
#REDNEK by Gord Bamford is not available


In [13]:
#R&B/hip-hop
df_rb_lyrics = get_df_songs(df_rb['track'],df_rb['artist'])

Bang Bang by Jessie J is not available
Before He Cheats by Carrie Underwood is not available
See What I've Become by Zack Hemsey is not available
Stinger by RL Grime is not available
Money On The Table by Belly is not available


In [14]:
#rock
df_rock_lyrics = get_df_songs(df_rock['track'],df_rock['artist'])

Greatest Hits Megamix by The Saturdays is not available
Smells Blood by Kensuke Ushio is not available
Ghost by Machine Girl is not available
Unforgettable by French Montana is not available


### Clean and Preprocess Lyrics

Prepare lyrics for analysis by cleaning and normalizing them.

In [15]:
pd.set_option('mode.chained_assignment', None)

In [116]:
#drop rows without lyrics
df_cty_w_lyrics = df_cty_lyrics.dropna(subset=['lyrics']) #country
df_rb_w_lyrics = df_rb_lyrics.dropna(subset=['lyrics']) #R&B/hip-hop
df_rock_w_lyrics = df_rock_lyrics.dropna(subset=['lyrics']) #rock/alternative

In [117]:
#clean lyrics
#df_cty_cleaned = clean_lyrics(df_cty_w_lyrics,'lyrics','words')
df_rb_cleaned = clean_lyrics(df_rb_w_lyrics,'lyrics','words')
#df_rock_cleaned = clean_lyrics(df_rock_w_lyrics,'lyrics','words')

In [118]:
#normalize 
draft2 = normalize_lyrics(df_rb_cleaned,'words')
#df_cty_w_lyrics = normalize_lyrics(df_cty_w_lyrics,'words')

In [119]:
draft2

Unnamed: 0,track,artist,lyrics,words
0,Leave The Door Open,Bruno Mars,"1. Bruno Mars, Anderson .Paak & Silk Sonic- Le...","[1, bruno, mar, anderson, paak, silk, sonic, l..."
1,Peaches (feat. Daniel Caesar & Giveon),Justin Bieber,[Chorus: Justin Bieber]\nI got my peaches out ...,"[got, peach, georgia, oh, yeah, shit, get, wee..."
2,RAPSTAR,Polo G,[Intro]\n(Shout out my nigga Synco)\n\n[Chorus...,"[shout, nigga, synco, uh, tuned, copped, bmw, ..."
3,Astronaut In The Ocean,Masked Wolf,[Intro]\nAstro-naut\n\n[Chorus]\nWhat you know...,"[astronaut, know, rollin, deep, brain, go, num..."
4,Up,Cardi B,"[Intro]\nUp, up, up (Ayy), up (Uh), up, look (...","[ayy, uh, look, fire, upon, time, man, heard, ..."
...,...,...,...,...
1009,Play With Fire,Nico Santos,[Verse 1]\nIt's only been about a couple of da...,"[couple, day, must, gone, put, spell, get, vei..."
1010,WOW (feat. Sabrina Carpenter) - Remix,Zara Larsson,"[Chorus]\nBaby, I'm not even in a gown\nI'm ju...","[baby, even, gown, tshirt, couch, way, want, m..."
1011,Unfamiliar,Seeb,"[Verse 1: HRVY]\nStood outside, I need to calm...","[stood, outside, need, calm, head, heart, cont..."
1012,501,Various Artists,Eric S. Reed; Soldier's Hymn Music Inc.; Willi...,"[eric, reed, soldier, hymn, music, inc, willie..."


In [196]:
#R&B/hip-hop
#df_rb2 = df_rb_lyrics.dropna(subset=['lyrics'])
df_rb_w_lyrics = df_rb_lyrics.dropna(subset=['lyrics'])
df_rb_w_lyrics = clean_lyrics(df_rb_w_lyrics,'lyrics','words')
df_rb_w_lyrics = normalize_lyrics(df_rb_w_lyrics,'words')

TypeError: 'float' object is not iterable

In [186]:
#df_rb2 = clean_lyrics(df_rb2,'lyrics','words')
#df_rb2 = normalize_lyrics(df_rb2,'words')

KeyError: 'words'

In [184]:
#df_rb2


Unnamed: 0,track,artist,lyrics,word
0,Leave The Door Open,Bruno Mars,"1. Bruno Mars, Anderson .Paak & Silk Sonic- Le...","[n, n]"
1,Peaches (feat. Daniel Caesar & Giveon),Justin Bieber,[Chorus: Justin Bieber]\nI got my peaches out ...,"[n, n]"
2,RAPSTAR,Polo G,[Intro]\n(Shout out my nigga Synco)\n\n[Chorus...,"[n, n]"
3,Astronaut In The Ocean,Masked Wolf,[Intro]\nAstro-naut\n\n[Chorus]\nWhat you know...,"[n, n]"
4,Up,Cardi B,"[Intro]\nUp, up, up (Ayy), up (Uh), up, look (...","[n, n]"
...,...,...,...,...
1009,Play With Fire,Nico Santos,[Verse 1]\nIt's only been about a couple of da...,"[n, n]"
1010,WOW (feat. Sabrina Carpenter) - Remix,Zara Larsson,"[Chorus]\nBaby, I'm not even in a gown\nI'm ju...","[n, n]"
1011,Unfamiliar,Seeb,"[Verse 1: HRVY]\nStood outside, I need to calm...","[n, n]"
1012,501,Various Artists,Eric S. Reed; Soldier's Hymn Music Inc.; Willi...,"[n, n]"


In [139]:
#rock/alternative
#df_rock_w_lyrics = df_rock_lyrics.dropna(subset=['lyrics'])
#df_rock_w_lyrics = clean_lyrics(df_rock_w_lyrics,'lyrics','words')
#df_rock_w_lyrics = normalize_lyrics(df_rock_w_lyrics,'words')

TypeError: 'float' object is not iterable

In [43]:
#df_rock_w_lyrics

Unnamed: 0,track,artist,lyrics,words
0,WITHOUT YOU,The Kid LAROI,[Chorus]\nYou cut out a piece of me and now I ...,"[you, cut, out, a, piece, of, me, and, now, i,..."
1,Your Power,Billie Eilish,[Chorus]\nTry not to abuse your power\nI know ...,"[try, not, to, abuse, your, power, i, know, we..."
2,my ex's best friend (with blackbear),Machine Gun Kelly,[Intro: Machine Gun Kelly & blackbear]\nAyy\nY...,"[ayy, you, know, my, ex, so, that, makes, it, ..."
3,Mood (feat. iann dior),24kGoldn,"[Intro: 24kGoldn]\nOh-oh-oh\nYeah, yeah, yeah,...","[ohohoh, yeah, yeah, yeah, yeah, yeah, why, yo..."
4,Therefore I Am,Billie Eilish,"[Chorus]\nI'm not your friend\nOr anything, da...","[i, not, your, friend, or, anything, damn, you..."
...,...,...,...,...
1070,Umbrella,Rihanna,"[Intro: JAY-Z]\nUh-huh, uh-huh (Yeah, Rihanna)...","[uhhuh, uhhuh, yeah, rihanna, uhhuh, uhhuh, go..."
1071,Girls in the Hood,Megan Thee Stallion,"[Verse 1]\nFuck bein' good, I'm a bad bitch (A...","[fuck, bein, good, i, a, bad, bitch, ah, i, si..."
1072,Partition,Beyoncé,"[Part 1: ""Yoncé""]\n\n[Intro]\nLet me hear you ...","[let, me, hear, you, say, hey, ms, carter, hey..."
1073,Single Ladies (Put a Ring on It),Beyoncé,Sorted by featured singer\n\nRobyn Adele Ande...,"[sorted, by, featured, singer, robyn, adele, a..."


In [197]:
df_rb_lyrics.iloc[1000]

track                                              Medicine
artist                                         James Arthur
lyrics    [Verse 1]\nYou're my bulletproof vest when it'...
words     [you, my, bulletproof, vest, when, it, getting...
Name: 1000, dtype: object

### Join Dataframes

Combine dataframes with audio features and lyrics for each genre.

In [None]:
#country
#df_cty_w_lyrics = 

In [None]:
#R&B/hip-hop
#df_rb_w_lyrics = 

In [None]:
#rock/alternative
#df_rock_w_lyrics = 

### Write Dataframes to File

In [None]:
#country
#df_cty_w_lyrics.to_csv('df_cty_w_lyrics.csv')

In [None]:
#R&B/hip-hop
#df_rb_w_lyrics.to_csv('df_rb_w_lyrics.csv')

In [None]:
#rock/alternative
#df_rock_w_lyrics.to_csv('df_rock_w_lyrics.csv')