# Lyrics Using Genius API

In [83]:
#import libraries
import re
import string

import lyricsgenius #unaliased module handle; `genius` is later rebound to the API client
import lyricsgenius as genius #used to interface with Genius API
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#token provided by Genius API
#restored from IPython's %store database so the secret is never written in this notebook;
#it must have been saved in an earlier session with `%store client_access_token`
%store -r client_access_token

In [3]:
#initiate Genius
#build the client from the unaliased `lyricsgenius` module: this cell rebinds the name `genius`
#to the client instance, so `genius.Genius(...)` would raise AttributeError if the cell were re-run
genius = lyricsgenius.Genius(client_access_token)
genius.verbose = False #turn off status messages

### Functions

In [4]:
def get_lyrics(track,artist):
    '''
    function returns song's lyrics, or None (after printing a notice) if they cannot be retrieved
    parameters:
        track-->str
        artist-->str
    '''
    #strip annotations from the title before searching; raw strings keep the regex escapes valid
    track = re.sub(r' - .+','',track) #remove text after '-'
    track = re.sub(r' \(.*\)','',track) #remove text within parentheses
    track = re.sub(r' \[.*\]','',track) #remove text within brackets
    
    try:
        #a result without a `.lyrics` attribute (e.g. None for no match) falls through to the handler
        return genius.search_song(track,artist).lyrics
    except Exception:
        #catch Exception rather than everything so KeyboardInterrupt/SystemExit can still stop a long batch
        print(track + ' by ' + artist + ' is not available')
        return None

In [5]:
def get_df_songs(track_list,artist_list):
    '''
    function obtains lyrics and returns dataframe with columns for track, artist, lyrics
        rows follow the positional order of the inputs and are indexed 0..n-1
        lyrics is None for any song get_lyrics could not retrieve
    parameters:
        track_list-->list (or pandas Series) of str 
        artist_list-->list (or pandas Series) of str
    raises:
        ValueError if track_list and artist_list have different lengths
    '''
    #materialise as plain lists so pairing is positional; indexing a Series with [x] is label-based
    #and breaks (or misaligns) when the Series does not have a default 0..n-1 index
    tracks = list(track_list)
    artists = list(artist_list)
    if len(tracks) != len(artists):
        raise ValueError('track_list and artist_list must have the same length')
    
    lyrics_list = [get_lyrics(track,artist) for track,artist in zip(tracks,artists)] #get lyrics for each song
    
    return pd.DataFrame(data={'track':tracks,'artist':artists,'lyrics':lyrics_list})

In [61]:
def clean_lyrics(df,col,new_col):
    '''
    function returns a copy of the dataframe with a new column of cleaned text (song lyrics)
        the dataframe passed in is left unmodified, so the calling cell can be re-run safely
    parameters:
        df-->pandas dataframe
        col-->column to clean (str)
        new_col-->name of column with cleaned text (str)
    '''
    #regex=True is passed explicitly: the default of Series.str.replace changed to regex=False in
    #pandas 2.0, under which these patterns would be matched as literal text and clean nothing
    cleaned = (
        df[col]
        .str.lower() #make all text lowercase
        .str.replace(r'\n',' ',regex=True) #replace '\n' character with space
        .str.replace(r'\[[^\[\]]*]','',regex=True) #remove brackets and inside text
        .str.replace(r"\'\w*",'',regex=True) #remove apostrophes and the letters that follow them
        .str.replace(r'[^\w\d\s]+','',regex=True) #remove remaining punctuation/symbols
        .str.strip() #remove leading/trailing whitespace
    )
    
    result = df.copy()
    result[new_col] = cleaned
    
    return result

In [115]:
def normalize_lyrics(df,col):
    '''
    function returns a copy of the dataframe with column as list of words
        tokenizes, removes stopwords from, and lemmatizes lyrics
        the dataframe passed in is left unmodified, so the calling cell can be re-run safely
    parameters:
        df-->pandas dataframe
        col-->column to normalize (str); expected to hold whitespace-separated text
    '''
    stop_words = set(stopwords.words('english')) #set gives fast membership checks
    lemmatizer = WordNetLemmatizer()
    
    def normalize_tokens(tokens):
        '''
        function returns lemmatized words with stopwords removed
        parameters:
            tokens-->list of str; any other value (e.g. NaN for missing lyrics) is returned unchanged
        '''
        if not isinstance(tokens,list):
            return tokens
        #stopwords are filtered on the raw token, then the surviving tokens are lemmatized
        return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    
    result = df.copy()
    result[col] = result[col].str.split().apply(normalize_tokens) #tokenize, then normalize each row
    
    return result

### Import Songs to Analyze

Read in the resulting dataframes from the spotify_analysis notebook, which were created as follows:

 - Started with the top five tracks for each of country, R&B/hip-hop, and rock/alternative as of the week of May 15, 2021, based on Billboard Top 100 charts (referred to as the "seed tracks")
 - Used Spotify's recommender algorithm to find the most similar songs to the seed tracks (returns a maximum of 100 songs per search)
 - Ranked the most similar songs by audio features using Euclidean distance
 - Fed the top ranking songs through Spotify's recommender algorithm until there were at least 1,000 songs per genre

In [8]:
#country tracks, minus the 'Unnamed: 0' column (presumably a saved index - confirm against spotify_analysis)
df_cty = pd.read_csv('../Data/data/df_cty.csv').drop(columns='Unnamed: 0')

In [9]:
#R&B/hip-hop tracks, minus the 'Unnamed: 0' column (presumably a saved index - confirm against spotify_analysis)
df_rb = pd.read_csv('../Data/data/df_rb.csv').drop(columns='Unnamed: 0')

In [10]:
#rock/alternative tracks, minus the 'Unnamed: 0' column (presumably a saved index - confirm against spotify_analysis)
df_rock = pd.read_csv('../Data/data/df_rock.csv').drop(columns='Unnamed: 0')

### Obtain Lyrics from Genius API

Pull lyrics from the Genius API with the lyricsgenius wrapper and put them into dataframes.

In [12]:
#country: look up lyrics for every (track, artist) pair
df_cty_lyrics = get_df_songs(track_list=df_cty['track'],artist_list=df_cty['artist'])

One Beer by HIXTAPE is not available
Ballin' by Rvshvd is not available
Gabrielle by Brett Eldredge is not available
Forever Begins Tonight by The McClymonts is not available
Tupelo Honey by JJ Grey & Mofro is not available
#REDNEK by Gord Bamford is not available


In [13]:
#R&B/hip-hop: look up lyrics for every (track, artist) pair
df_rb_lyrics = get_df_songs(track_list=df_rb['track'],artist_list=df_rb['artist'])

Bang Bang by Jessie J is not available
Before He Cheats by Carrie Underwood is not available
See What I've Become by Zack Hemsey is not available
Stinger by RL Grime is not available
Money On The Table by Belly is not available


In [14]:
#rock/alternative: look up lyrics for every (track, artist) pair
df_rock_lyrics = get_df_songs(track_list=df_rock['track'],artist_list=df_rock['artist'])

Greatest Hits Megamix by The Saturdays is not available
Smells Blood by Kensuke Ushio is not available
Ghost by Machine Girl is not available
Unforgettable by French Montana is not available


### Clean and Preprocess Lyrics

Prepare lyrics for analysis by cleaning and normalizing them.

In [15]:
#turn off pandas' chained-assignment (SettingWithCopy) check for the whole session;
#the cleaning cells below assign new columns onto frames produced by dropna
#NOTE(review): this is a global setting and also hides the warning for any later cell
pd.set_option('mode.chained_assignment', None)

In [120]:
#keep only the songs whose lyrics were found
df_cty_lyrics2 = df_cty_lyrics.loc[df_cty_lyrics['lyrics'].notna()] #country
df_rb_lyrics2 = df_rb_lyrics.loc[df_rb_lyrics['lyrics'].notna()] #R&B/hip-hop
df_rock_lyrics2 = df_rock_lyrics.loc[df_rock_lyrics['lyrics'].notna()] #rock/alternative

In [121]:
#add a 'words' column holding the cleaned text of 'lyrics'
df_cty_cleaned = clean_lyrics(df=df_cty_lyrics2,col='lyrics',new_col='words') #country
df_rb_cleaned = clean_lyrics(df=df_rb_lyrics2,col='lyrics',new_col='words') #R&B/hip-hop
df_rock_cleaned = clean_lyrics(df=df_rock_lyrics2,col='lyrics',new_col='words') #rock/alternative

In [122]:
#turn the cleaned 'words' text into lists of lemmatized, stopword-free tokens
df_cty_norm = normalize_lyrics(df=df_cty_cleaned,col='words') #country
df_rb_norm = normalize_lyrics(df=df_rb_cleaned,col='words') #R&B/hip-hop
df_rock_norm = normalize_lyrics(df=df_rock_cleaned,col='words') #rock/alternative

### Write Dataframes to File

In [158]:
#country: save the normalized lyrics
df_cty_norm.to_csv(path_or_buf='../Data/df_cty_lyrics.csv')

In [159]:
#R&B/hip-hop: save the normalized lyrics
df_rb_norm.to_csv(path_or_buf='../Data/df_rb_lyrics.csv')

In [160]:
#rock/alternative: save the normalized lyrics
df_rock_norm.to_csv(path_or_buf='../Data/df_rock_lyrics.csv')