# Lyrics Using Genius API

In [1]:
#import libraries
import pandas as pd
import lyricsgenius as genius #used to interface with Genius API
import string
import re

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#token provided by Genius API
%store -r client_access_token

In [3]:
#initiate Genius
genius = genius.Genius(client_access_token)
genius.verbose = False #turn off status messages

### Functions

In [4]:
def get_lyrics(track,artist):
    '''
    function returns song's lyrics
    parameters:
        track-->str
        artist-->str
    '''
    track = re.sub(' - .+','',track) #remove text after '-'
    track = re.sub(' \(.*\)','',track) #remove text within parentheses
    track = re.sub(' \[.*\]','',track) #remove text within brackets
    
    try:
        return genius.search_song(track,artist).lyrics
    except:
        print(track + ' by ' + artist + ' is not available')

In [5]:
def get_df_songs(track_list,artist_list):
    '''
    function obtains lyrics and returns dataframe with columns for track, artist, lyrics
    parameters:
        track_list-->list of str 
        artist_list-->list of str
    '''
    lyrics_list = [get_lyrics(track_list[x],artist_list[x]) for x in range(len(track_list))] #get lyrics for each song
    
    return pd.DataFrame(data={'track':track_list,'artist':artist_list,'lyrics':lyrics_list})

In [6]:
def clean_lyrics(df,col,new_col):
    '''
    function returns dataframe with new column of cleaned text (song lyrics)
    parameters:
        df-->pandas dataframe
        col-->column to clean (str)
        new_col-->name of column with cleaned text (str)
    '''
    df[new_col] = df[col].str.lower() #make all text lowercase
    df[new_col] = df[new_col].str.replace(r'\n',' ') #replace '\n' character with space
    df[new_col] = df[new_col].str.replace(r'\[[^\[\]]*]','') #remove brackets and inside text
    df[new_col] = df[new_col].str.replace(r"\'\w*",'').str.replace(r'[^\w\d\s]+','') #remove extra characters
    df[new_col] = df[new_col].str.strip() #remove extra whitespace

    return df

In [7]:
def normalize_lyrics(df,col):
    '''
    function returns dataframe with column as list of words
        tokenizes, removes stopwords from, and lemmatizes lyrics
    parameters:
        df-->pandas dataframe
        col-->column to normalize
    '''
    df[col] = df[col].str.split() #tokenize lyrics
    
    stop_words = set(stopwords.words('english'))
    df[col] = df[col].apply(lambda row:[w for w in row if w not in stop_words]) #remove stopwords
    
    def lemmatize_text(text):
        '''
        function returns lemmatized text
        parameters:
            text-->str
        '''
        lemmatizer = WordNetLemmatizer()
        
        return [lemmatizer.lemmatize(w) for w in text]
    
    df[col] = df[col].apply(lemmatize_text) #lemmatize words
    
    return df

### Import Songs to Analyze

Read in the resulting dataframes from the spotify_analysis notebook, which were created as follows:

 - Started with the top five tracks for each of country, R&B/hip-hop, and rock/alternative as of the week of May 15, 2021, based on Billboard Top 100 charts (referred to as the "seed tracks")
 - Used Spotify's recommender algorithm to find the most similar songs to the seed tracks (returns a maximum of 100 songs per search)
 - Ranked the most similar songs by audio features using Euclidean distance
 - Fed the top ranking songs through Spotify's recommender algorithm until there were at least 1,000 songs per genre

In [8]:
#country
df_cty = pd.read_csv('data/df_cty.csv')
df_cty.drop(columns='Unnamed: 0',inplace=True)

In [9]:
#R&B/hip-hop
df_rb = pd.read_csv('data/df_rb.csv')
df_rb.drop(columns='Unnamed: 0',inplace=True)

In [10]:
#rock/alternative
df_rock = pd.read_csv('data/df_rock.csv')
df_rock.drop(columns='Unnamed: 0',inplace=True)

### Obtain Lyrics from Genius API

Pull lyrics from the Genius API with the lyricsgenius wrapper and put into dataframes.

In [11]:
#country
df_cty_lyrics = get_df_songs(df_cty['track'],df_cty['artist'])

One Beer by HIXTAPE is not available
Ballin' by Rvshvd is not available
Forever Begins Tonight by The McClymonts is not available
Tupelo Honey by JJ Grey & Mofro is not available
Waves by Luke Bryan is not available
#REDNEK by Gord Bamford is not available
When Your Lips Are so Close by Gord Bamford is not available


In [12]:
#R&B/hip-hop
df_rb_lyrics = get_df_songs(df_rb['track'],df_rb['artist'])

Opps by JayDaYoungan is not available
Young Grizzley World by Tee Grizzley is not available
Moonwalking in Calabasas by DDG is not available
What It Was by Lil Gotit is not available
I Kissed A Girl by Katy Perry is not available
Before He Cheats by Carrie Underwood is not available
...Baby One More Time by Britney Spears is not available
See What I've Become by Zack Hemsey is not available
Stinger by RL Grime is not available
X by Jonas Brothers is not available


In [14]:
#rock
df_rock_lyrics = get_df_songs(df_rock['track'],df_rock['artist'])

Greatest Hits Megamix by The Saturdays is not available
Lost by Cher Lloyd is not available
Smells Blood by Kensuke Ushio is not available
Ghost by Machine Girl is not available


### Clean and Preprocess Lyrics

Prepare lyrics for analysis by cleaning and normalizing them.

In [16]:
#country
#df_cty_lyrics = clean_lyrics(df_cty_lyrics,'lyrics','words')
#df_cty_lyrics = normalize_lyrics(df_cty_lyrics,'words')
clean_lyrics(df_cty_lyrics[df_cty_lyrics['lyrics'].notna()],'lyrics','words')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = df[col].str.lower() #make all text lowercase
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = df[new_col].str.replace(r'\n',' ') #replace '\n' character with space
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = df[new_col].str.replace(r'\[[^\[\]]*]','') #remo

Unnamed: 0,track,artist,lyrics,words
0,Forever After All,Luke Combs,[Verse 1]\nA cold beer's got twelve ounces\nA ...,a cold beer got twelve ounces a good truck got...
1,The Good Ones,Gabby Barrett,[Verse 1]\nHe's a phone call to his parents\nH...,he a phone call to his parents he a bible by t...
2,Made For You,Jake Owen,[Verse 1]\nWater towers are made for hearts an...,water towers are made for hearts and names fri...
3,Hell Of A View,Eric Church,[Verse 1]\nI was no daddy's dream\nWas not you...,i was no daddy dream was not your momma prayer...
4,Breaking Up Was Easy In The 90's,Sam Hunt,"[Intro]\nYeah, man, oh man, oh man\n\n[Verse 1...",yeah man oh man oh man bartender looked at m...
...,...,...,...,...
1021,Hometown,Logan Mize,[Verse 1]\nHow'd I end up here where I don't b...,how i end up here where i don belong like a st...
1022,Drinkin' Hours,Cole Swindell,[Intro]\nHey\nWoo!\n\n[Verse 1]\nBeen watchin'...,hey woo been watchin that clock ticktock all...
1023,Grass Is Always Greener,Jake Owen,[Verse 1: Jake Owen]\nNeighbor got a brand new...,neighbor got a brand new cadillac still got my...
1024,Got What I Got,Jason Aldean,"[Verse 1]\nShe said, ""Baby, do you ever miss b...",she said baby do you ever miss bein alone i sa...


In [None]:
#R&B/hip-hop
df_rb_lyrics = clean_lyrics(df_rb_lyrics,'lyrics','words')
df_rb_lyrics = normalize_lyrics(df_rb_lyrics,'words')

In [None]:
#rock/alternative
df_rock_lyrics = clean_lyrics(df_rock_lyrics,'lyrics','words')
df_rock_lyrics = normalize_lyrics(df_rock_lyrics,'words')

### Join Dataframes

Combine dataframes with audio features and lyrics for each genre.

In [None]:
#country
df_cty_w_lyrics = 

In [None]:
#R&B/hip-hop
df_rb_w_lyrics = 

In [None]:
#rock/alternative
df_rock_w_lyrics = 

### Write Dataframes to File

In [None]:
#country
df_cty_w_lyrics.to_csv('df_cty_w_lyrics.csv')

In [None]:
#R&B/hip-hop
df_rb_w_lyrics.to_csv('df_rb_w_lyrics.csv')

In [None]:
#rock/alternative
df_rock_w_lyrics.to_csv('df_rock_w_lyrics.csv')