# Lyrics Using Genius API

In [1]:
#import libraries
import re
import string

import lyricsgenius #referenced by its own name when the Genius client is created
import lyricsgenius as genius #used to interface with Genius API
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#token provided by Genius API
#%store -r reloads the client_access_token variable that was saved earlier with %store,
#presumably so the token itself never has to be written in this notebook
%store -r client_access_token

In [3]:
#initiate Genius
#the client is built from the lyricsgenius module by its own name: the name `genius`
#is rebound from the module alias to the client instance here, so constructing it via
#`genius.Genius(...)` would fail with AttributeError if this cell were run a second time
genius = lyricsgenius.Genius(client_access_token)
genius.verbose = False #turn off status messages

### Functions

In [4]:
def get_lyrics(track,artist):
    '''
    function returns song's lyrics, or None when Genius finds no matching song
    parameters:
        track-->str
        artist-->str
    '''
    song = genius.search_song(track,artist) #uses the notebook-level Genius client
    
    #search_song gives back None when nothing matches; return None instead of
    #failing on `.lyrics` so one missing song does not abort a whole batch
    if song is None:
        return None
    
    return song.lyrics

In [5]:
def get_df_songs(track_list,artist_list):
    '''
    function looks up the lyrics of every song and returns a dataframe
        with one row per song and columns track, artist, lyrics
    parameters:
        track_list-->list of str 
        artist_list-->list of str (artist of the track at the same position)
    '''
    lyrics_list = []
    for position,track in enumerate(track_list):
        artist = artist_list[position] #artist is matched to the track by position
        lyrics_list.append(get_lyrics(track,artist))
    
    songs = {'track':track_list,'artist':artist_list,'lyrics':lyrics_list}
    return pd.DataFrame(data=songs)

In [6]:
def clean_lyrics(df,col,new_col):
    '''
    function returns dataframe with new column of cleaned text (song lyrics)
        the column is added to the dataframe that is passed in (no copy is made)
        and that same dataframe is returned
    parameters:
        df-->pandas dataframe
        col-->column to clean (str)
        new_col-->name of column with cleaned text (str)
    '''
    #every pattern below is a regular expression, so regex=True is passed explicitly;
    #without it newer pandas versions treat the pattern as literal text and nothing is cleaned
    cleaned = df[col].str.lower() #make all text lowercase
    cleaned = cleaned.str.replace(r'\n',' ',regex=True) #replace '\n' character with space
    cleaned = cleaned.str.replace(r'\[[^\[\]]*]','',regex=True) #remove brackets and inside text
    cleaned = cleaned.str.replace(r"\'\w*",'',regex=True) #remove apostrophes and the letters after them ("it's" -> "it")
    cleaned = cleaned.str.replace(r'[^\w\d\s]+','',regex=True) #remove remaining punctuation/symbols
    df[new_col] = cleaned.str.strip() #remove leading/trailing whitespace

    return df

In [7]:
def normalize_lyrics(df,col):
    '''
    function returns dataframe with column as list of words
        tokenizes, removes stopwords from, and lemmatizes lyrics
        the column is replaced in the dataframe that is passed in, which is then returned
        rows without lyrics (None/NaN) become an empty list
    parameters:
        df-->pandas dataframe
        col-->column to normalize
    '''
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer() #created once and shared by every row
    
    def normalize_row(text):
        if isinstance(text,str):
            words = text.split() #tokenize lyrics on whitespace
        elif isinstance(text,list):
            words = text #already tokenized, e.g. when the cell is run again
        else:
            return [] #missing lyrics
        
        #remove stopwords, then lemmatize what is left
        return [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    
    df[col] = df[col].apply(normalize_row)
    
    return df

### Import Songs to Analyze

Read in the resulting dataframes from the spotify_analysis notebook, which were created as follows:

 - Started with the top five tracks in each of three genres (country, R&B/hip-hop, and rock/alternative) as of the week of May 15, 2021, based on Billboard Top 100 charts (referred to as the "seed tracks")
 - Used Spotify's recommender algorithm to find the most similar songs to the seed tracks (returns a maximum of 100 songs per search)
 - Ranked the most similar songs by audio features using Euclidean distance
 - Fed the top ranking songs through Spotify's recommender algorithm until there were at least 1,000 songs per genre

In [23]:
#country
#load the csv and drop the 'Unnamed: 0' column (presumably the index written out with the file)
df_cty = pd.read_csv('data/df_cty.csv').drop(columns='Unnamed: 0')

In [24]:
#preview the country songs; sim_score is blank for the first five rows shown (presumably the seed tracks)
df_cty

Unnamed: 0,track,artist,track_id,release_date,dance,energy,loud,speech,acoust,live,valence,tempo,sim_score
0,Forever After All,Luke Combs,6IBcOGPsniK3Pso1wHIhew,2020-10-23,0.487,0.650,-5.195,0.0253,0.1910,0.0933,0.456,151.964,
1,The Good Ones,Gabby Barrett,3hLuHKzG1cmlRpq53ZVWd8,2020-06-19,0.519,0.552,-5.023,0.0259,0.1800,0.1490,0.331,89.957,
2,Made For You,Jake Owen,7vF3xkCMvZjAe2nTWY0uQZ,2019-03-29,0.581,0.441,-6.829,0.0268,0.7700,0.1110,0.337,82.125,
3,Hell Of A View,Eric Church,1kBx9VGumfuvlfqdlAGorE,2020-10-02,0.689,0.582,-6.778,0.0236,0.1190,0.1010,0.883,99.021,
4,Breaking Up Was Easy In The 90's,Sam Hunt,4sf2L157iEgAR7yrCNLgSq,2020-04-03,0.562,0.649,-5.400,0.0494,0.2310,0.3410,0.376,145.913,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,Hometown,Logan Mize,10MpuvwpKpAJPsbttLjgW1,2021-01-27,0.586,0.801,-5.078,0.0532,0.2480,0.0994,0.718,74.973,-37.832074
1022,Drinkin' Hours,Cole Swindell,2wGFhmzwELvaeXzKQeB7Yb,2019-10-18,0.522,0.922,-3.184,0.0770,0.0552,0.2760,0.878,156.045,-41.336953
1023,Grass Is Always Greener,Jake Owen,1S4zg0dP5HdZkOtqrDk3yg,2019-03-29,0.480,0.851,-3.980,0.0758,0.3760,0.1090,0.881,157.999,-43.245243
1024,Got What I Got,Jason Aldean,4TCc369aRPRubv1m8R1TBG,2019-11-22,0.512,0.580,-6.658,0.0290,0.6210,0.5130,0.303,159.847,-45.061028


In [26]:
#R&B/hip-hop
#load the csv and drop the 'Unnamed: 0' column (presumably the index written out with the file)
df_rb = pd.read_csv('data/df_rb.csv').drop(columns='Unnamed: 0')

In [27]:
#preview the R&B/hip-hop songs; sim_score is blank for the first five rows shown (presumably the seed tracks)
df_rb

Unnamed: 0,track,artist,track_id,release_date,dance,energy,loud,speech,acoust,live,valence,tempo,sim_score
0,Leave The Door Open,Bruno Mars,7MAibcTli4IisCtbHKrGMh,2021-03-05,0.586,0.616,-7.964,0.0324,0.18200,0.0927,0.719,148.088,
1,Peaches (feat. Daniel Caesar & Giveon),Justin Bieber,4iJyoBOLtHqaGxP12qzhQI,2021-03-19,0.677,0.696,-6.181,0.1190,0.32100,0.4200,0.464,90.030,
2,RAPSTAR,Polo G,43PGPuHIlVOc04jrZVh9L6,2021-04-09,0.789,0.536,-6.862,0.2420,0.41000,0.1290,0.437,81.039,
3,Astronaut In The Ocean,Masked Wolf,3Ofmpyhv5UAQ70mENzB277,2021-01-06,0.778,0.695,-6.865,0.0913,0.17500,0.1500,0.472,149.996,
4,Up,Cardi B,1XXimziG1uhM0eDNCZCrUl,2021-02-05,0.868,0.795,-6.044,0.2690,0.00120,0.0461,0.819,166.000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,Play With Fire,Nico Santos,4XqCiPFWwAwGEI7yiq0LvK,2020-05-08,0.691,0.864,-2.779,0.0650,0.35300,0.0483,0.555,82.977,-43.236126
1010,WOW (feat. Sabrina Carpenter) - Remix,Zara Larsson,5WokFKscrfGEGGLPTu3jgO,2020-09-25,0.702,0.744,-4.063,0.0861,0.46900,0.0915,0.305,77.486,-48.620802
1011,Unfamiliar,Seeb,6tTPc9BEDYExwIEg8QGxvL,2020-07-24,0.590,0.677,-5.395,0.0577,0.00882,0.1570,0.304,184.573,-56.560466
1012,501,Various Artists,0sbboVSwurbGP9mx3EuHQo,2021-05-15,0.613,0.796,-3.672,0.2030,0.16200,0.2920,0.749,186.102,-58.153965


In [28]:
#rock/alternative
#load the csv and drop the 'Unnamed: 0' column (presumably the index written out with the file)
df_rock = pd.read_csv('data/df_rock.csv').drop(columns='Unnamed: 0')

In [29]:
#preview the rock/alternative songs; sim_score is blank for the first five rows shown (presumably the seed tracks)
df_rock

Unnamed: 0,track,artist,track_id,release_date,dance,energy,loud,speech,acoust,live,valence,tempo,sim_score
0,WITHOUT YOU,The Kid LAROI,27OeeYzk6klgBh83TSvGMA,2020-11-06,0.662,0.413,-7.357,0.0299,0.21300,0.1340,0.467,93.005,
1,Your Power,Billie Eilish,5qNh5WtzMbfpSj2jLlBkoD,2021-04-29,0.634,0.285,-14.007,0.0807,0.93700,0.2320,0.203,129.650,
2,my ex's best friend (with blackbear),Machine Gun Kelly,7kDUspsoYfLkWnZR7qwHZl,2020-09-25,0.731,0.675,-5.134,0.0434,0.00473,0.1410,0.298,124.939,
3,Mood (feat. iann dior),24kGoldn,4jPy3l0RUwlUI9T5XHBW2m,2021-03-26,0.701,0.716,-3.671,0.0361,0.17400,0.3240,0.732,91.007,
4,Therefore I Am,Billie Eilish,54bFM56PmE4YLRnqpW6Tha,2020-11-12,0.889,0.340,-7.773,0.0697,0.21800,0.0550,0.716,94.009,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1070,Umbrella,Rihanna,49FYlytm3dAAraYgpoJZux,2008-06-02,0.583,0.829,-4.603,0.1340,0.00864,0.0426,0.575,174.028,-66.573914
1071,Girls in the Hood,Megan Thee Stallion,6WbTTAGKrQtCQvmc5BuYfh,2020-06-26,0.821,0.863,-3.159,0.3530,0.00824,0.0377,0.874,180.013,-72.627793
1072,Partition,Beyoncé,5hgnY0mVcVetszbb85qeDg,2014-11-24,0.412,0.441,-11.523,0.2910,0.02960,0.3060,0.174,185.571,-78.149050
1073,Single Ladies (Put a Ring on It),Beyoncé,2ZBNclC5wm4GtiWaeh0DMx,2008-11-14,0.426,0.584,-5.293,0.2960,0.03830,0.1880,0.272,193.437,-85.946891


### Obtain Lyrics from Genius API

Pull lyrics from the Genius API with the lyricsgenius wrapper and put them into dataframes.

In [8]:
#pull lyrics for songs in question
#need track_list and artist_list from Spotify search
#sample songs written as (track, artist) pairs; zip regroups them into one
#sequence of tracks and one of artists, each turned back into a list
track_list,artist_list = (list(column) for column in zip(
    ('Shallow','Lady Gaga'),
    ('Thriller','Michael Jackson'),
    ('Lose Yourself','Eminem'),
    ('Before He Cheats','Carrie Underwood'),
    ('Waterfalls','TLC'),
))

In [9]:
#create dataframe
#one row per sample song with columns track, artist, lyrics; the lyrics are looked up through get_lyrics
df = get_df_songs(track_list,artist_list)

Searching for "Shallow" by Lady Gaga...
Done.
Searching for "Thriller" by Michael Jackson...
Done.
Searching for "Lose Yourself" by Eminem...
Done.
Searching for "Before He Cheats" by Carrie Underwood...
Done.
Searching for "Waterfalls" by TLC...
Done.


### Clean and Preprocess Lyrics

In [10]:
#add a 'words' column holding the cleaned text of the 'lyrics' column
#clean_lyrics adds the column to df itself and returns it, so df_clean and df are the same object
df_clean = clean_lyrics(df,'lyrics','words')

In [11]:
#tokenize, remove stopwords from, and lemmatize 'words'; the column is replaced by lists of words
df_clean = normalize_lyrics(df_clean,'words')

In [12]:
#show the result: the original lyrics next to the normalized word lists
df_clean

Unnamed: 0,track,artist,lyrics,words
0,Shallow,Lady Gaga,"[Verse 1: Bradley Cooper]\nTell me somethin', ...","[tell, somethin, girl, happy, modern, world, n..."
1,Thriller,Michael Jackson,[Verse 1: Michael Jackson]\nIt's close to midn...,"[close, midnight, something, evil, lurking, da..."
2,Lose Yourself,Eminem,"[Intro]\nLook, if you had one shot or one oppo...","[look, one, shot, one, opportunity, seize, eve..."
3,Before He Cheats,Carrie Underwood,"[Verse 1]\nRight now, he's probably slow danci...","[right, probably, slow, dancing, bleachedblond..."
4,Waterfalls,TLC,[Verse 1: T-Boz]\nA lonely mother gazing out o...,"[lonely, mother, gazing, window, staring, son,..."


### Join Dataframes