In [1]:
#import libraries
import re
import string

import contractions
import lyricsgenius
import lyricsgenius as genius #used to interface with Genius API
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#token provided by Genius API
#restored from IPython's %store database, so it must have been saved
#beforehand (in another session/notebook) with `%store client_access_token`
%store -r client_access_token

In [3]:
#initiate Genius
#the client is built from the lyricsgenius module itself: the name `genius` is
#rebound to the client on this line, so the previous `genius.Genius(...)` form
#raised AttributeError whenever the cell was run a second time
genius = lyricsgenius.Genius(client_access_token)

In [4]:
def get_lyrics(track,artist):
    '''
    function returns song's lyrics, or None when Genius finds no matching song
    parameters:
        track-->str (song title)
        artist-->str (artist name)
    returns:
        str with the lyrics, or None if the search has no result
    '''
    song = genius.search_song(track,artist) #uses the module-level Genius client
    if song is None: #no match: report it as a missing value instead of crashing on .lyrics
        return None

    return song.lyrics

In [5]:
def get_df_songs(track_list,artist_list):
    '''
    function obtains lyrics and returns dataframe with columns for track, artist, lyrics
    parameters:
        track_list-->list of str
        artist_list-->list of str, paired by position with track_list
    raises:
        ValueError if track_list and artist_list differ in length
    '''
    if len(track_list) != len(artist_list):
        #fail before any lookup is made rather than part-way through the batch
        raise ValueError('track_list and artist_list must have the same length '
                         '(got %d and %d)' % (len(track_list),len(artist_list)))

    lyrics_list = [get_lyrics(track,artist) for track,artist in zip(track_list,artist_list)] #get lyrics for each song

    return pd.DataFrame(data={'track':track_list,'artist':artist_list,'lyrics':lyrics_list})

In [6]:
def clean_lyrics(df,col,new_col):
    '''
    function returns dataframe with new column of cleaned text (song lyrics)
        lowercases the text, flattens line breaks, drops [bracketed] tags,
        punctuation and the 's / 't contraction endings
        note: new_col is written onto df itself and that same df is returned
    parameters:
        df-->pandas dataframe
        col-->column to clean (str)
        new_col-->name of column with cleaned text (str)
    '''
    #regex=True is passed explicitly on every call: since pandas 2.0 the default
    #for Series.str.replace is literal matching, which silently disables these patterns
    cleaned = df[col].str.lower() #make all text lowercase
    cleaned = cleaned.str.replace(r'\n',' ',regex=True) #replace '\n' character with space
    cleaned = cleaned.str.replace(r'\[[^\[\]]*]','',regex=True) #remove brackets and inside text
    cleaned = cleaned.str.replace(r'[^\w\d\'\s]+','',regex=True) #remove extra characters (apostrophes are kept)
    cleaned = cleaned.str.replace(r"\'s",'',regex=True).\
                      str.replace(r"\'t",'',regex=True) #drop 's and 't endings
    df[new_col] = cleaned.str.strip() #remove extra whitespace

    return df

In [8]:
def normalize_lyrics(df,col):
    '''
    function returns dataframe with column as list of words
        tokenizes, removes stopwords from, and lemmatizes lyrics
        rows with missing lyrics become an empty list
        note: col is overwritten on df itself and that same df is returned
    parameters:
        df-->pandas dataframe
        col-->column to normalize
    '''
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def normalize_tokens(tokens):
        #.str.split() leaves a missing value (not a list) where the lyrics were missing
        if not isinstance(tokens,list):
            return []
        #stopwords are filtered on the raw token, survivors are then lemmatized
        return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

    tokenized = df[col].str.split() #tokenize lyrics on whitespace
    df[col] = tokenized.apply(normalize_tokens) #remove stopwords and lemmatize words

    return df

In [9]:
#songs whose lyrics will be pulled
#track_list and artist_list are meant to come from a Spotify search; these are samples
sample_songs = [ #(track, artist) pairs
    ('Shallow','Lady Gaga'),
    ('Thriller','Michael Jackson'),
    ('Lose Yourself','Eminem'),
    ('Before He Cheats','Carrie Underwood'),
    ('Waterfalls','TLC'),
]
track_list,artist_list = map(list,zip(*sample_songs)) #split pairs into two parallel lists

In [10]:
#create dataframe of track, artist and lyrics
#get_df_songs looks up every song through the Genius client, one search per track
df = get_df_songs(track_list,artist_list)

Searching for "Shallow" by Lady Gaga...
Done.
Searching for "Thriller" by Michael Jackson...
Done.
Searching for "Lose Yourself" by Eminem...
Done.
Searching for "Before He Cheats" by Carrie Underwood...
Done.
Searching for "Waterfalls" by TLC...
Done.


In [11]:
#preview the track, artist and raw lyrics columns returned by get_df_songs
df

Unnamed: 0,track,artist,lyrics
0,Shallow,Lady Gaga,"[Verse 1: Bradley Cooper]\nTell me somethin', ..."
1,Thriller,Michael Jackson,[Verse 1: Michael Jackson]\nIt's close to midn...
2,Lose Yourself,Eminem,"[Intro]\nLook, if you had one shot or one oppo..."
3,Before He Cheats,Carrie Underwood,"[Verse 1]\nRight now, he's probably slow danci..."
4,Waterfalls,TLC,[Verse 1: T-Boz]\nA lonely mother gazing out o...


In [12]:
#clean_lyrics writes the new column onto the frame it receives, so a copy is
#passed to keep the raw `df` untouched and `df_clean` a separate object
df_clean = clean_lyrics(df.copy(),'lyrics','words')

In [13]:
#preview the cleaned text in the new 'words' column next to the raw lyrics
df_clean

Unnamed: 0,track,artist,lyrics,words
0,Shallow,Lady Gaga,"[Verse 1: Bradley Cooper]\nTell me somethin', ...",tell me somethin' girl are you happy in this m...
1,Thriller,Michael Jackson,[Verse 1: Michael Jackson]\nIt's close to midn...,it close to midnight something evil lurking in...
2,Lose Yourself,Eminem,"[Intro]\nLook, if you had one shot or one oppo...",look if you had one shot or one opportunity to...
3,Before He Cheats,Carrie Underwood,"[Verse 1]\nRight now, he's probably slow danci...",right now he probably slow dancing with a blea...
4,Waterfalls,TLC,[Verse 1: T-Boz]\nA lonely mother gazing out o...,a lonely mother gazing out of the window stari...


In [14]:
#normalize_lyrics overwrites 'words' in place (cleaned strings become token lists),
#so re-running this cell on its own output will not give the same result;
#re-run the cleaning cell first
df_clean = normalize_lyrics(df_clean,'words')

In [15]:
#preview 'words' after normalization: each row is now a list of tokens
df_clean

Unnamed: 0,track,artist,lyrics,words
0,Shallow,Lady Gaga,"[Verse 1: Bradley Cooper]\nTell me somethin', ...","[tell, somethin', girl, happy, modern, world, ..."
1,Thriller,Michael Jackson,[Verse 1: Michael Jackson]\nIt's close to midn...,"[close, midnight, something, evil, lurking, da..."
2,Lose Yourself,Eminem,"[Intro]\nLook, if you had one shot or one oppo...","[look, one, shot, one, opportunity, seize, eve..."
3,Before He Cheats,Carrie Underwood,"[Verse 1]\nRight now, he's probably slow danci...","[right, probably, slow, dancing, bleachedblond..."
4,Waterfalls,TLC,[Verse 1: T-Boz]\nA lonely mother gazing out o...,"[lonely, mother, gazing, window, staring, son,..."


In [16]:
#inspect the full token list of the first row to spot-check the normalization
df_clean.iloc[0].words

['tell',
 "somethin'",
 'girl',
 'happy',
 'modern',
 'world',
 'need',
 "somethin'",
 'else',
 "searchin'",
 "i'm",
 "fallin'",
 'good',
 'time',
 'find',
 'longing',
 'change',
 'bad',
 'time',
 'fear',
 'tell',
 'something',
 'boy',
 'tired',
 'tryna',
 'fill',
 'void',
 'need',
 'hard',
 "keepin'",
 'hardcore',
 "i'm",
 'falling',
 'good',
 'time',
 'find',
 'longing',
 'change',
 'bad',
 'time',
 'fear',
 "i'm",
 'deep',
 'end',
 'watch',
 'dive',
 "i'll",
 'never',
 'meet',
 'ground',
 'crash',
 'surface',
 'hurt',
 'u',
 "we're",
 'far',
 'shallow',
 'shaha',
 'shahallow',
 'shaha',
 'shalalalalow',
 'shaha',
 'shahallow',
 "we're",
 'far',
 'shallow',
 'oh',
 'ha',
 'ah',
 'ha',
 'ohah',
 'ha',
 "i'm",
 'deep',
 'end',
 'watch',
 'dive',
 "i'll",
 'never',
 'meet',
 'ground',
 'crash',
 'surface',
 'hurt',
 'u',
 "we're",
 'far',
 'shallow',
 'shaha',
 'shallow',
 'shaha',
 'shalalalalow',
 'shaha',
 'shallow',
 "we're",
 'far',
 'shallow']

In [None]:
#one-time download of the NLTK data that normalize_lyrics needs
#on a fresh environment run this cell before calling normalize_lyrics
nltk.download('stopwords') #corpus behind stopwords.words('english')
nltk.download('wordnet') #lexicon behind WordNetLemmatizer

In [17]:
#inspect the English stopword set, built the same way as inside normalize_lyrics
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r