# Data Gathering and Preparation

In [1]:
#importing all necessary packages
import apiKey
import lyricsgenius as lyrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sukum\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
!pip install lyricsgenius




In [5]:
#To access the client key
keyapi = apiKey.key

In [7]:
#The lyrics are scarped from Genius using their built-in api, which is initialized here
geniusapi = lyrics.Genius(keyapi)

A function to scrape the lyrics using the genius api. We select 15 artists who have atleast 50 songs in order to get a varied dataset(read report for details).

In [10]:
def scrapeLyrics(api, artist_list):
    """
    This function scrapes the lyrics using Genius API
    
    input:
    api - Genius API client key
    artist_list - List containing names of artists or groups
    
    output:
    Pandas DataFrame containing 2 columns - Artists and Lyrics
    """
    lyrics = []
    artist_l = []
    
    for artist in artist_list:
        success = False
        while(not success):
            try:
                songs = api.search_artist(artist, max_songs = 50)
                success = True
            except:
                continue
            
        for song in songs.songs:
            if(song.lyrics):
                lyrics.append(song.lyrics)
                artist_l.append(artist)
    
    return pd.DataFrame(list(zip(artist_l, lyrics)), columns = ['Artist', 'Lyrics'])

In [12]:
artist_names = ['Metallica', 'Ariana-grande', 'Coldplay', 'Billie-eilish', 'Beyonce', 'Charlie-puth','Ed-sheeran', 'Dua-lipa', 'Radiohead', 'Dream-theater', 'Stevie-wonder', 'Sting', 'Taylor-swift', 'Maroon-5', 'Justin-bieber']

In [14]:
data = scrapeLyrics(geniusapi, artist_names)

Searching for songs by Metallica...

Song 1: "One"
Song 2: "Nothing Else Matters"
Song 3: "Enter Sandman"
Song 4: "Master of Puppets"
Song 5: "The Unforgiven"
Song 6: "Fade to Black"
Song 7: "For Whom the Bell Tolls"
Song 8: "Creeping Death"
Song 9: "Sad but True"
Song 10: "Ride the Lightning"
Song 11: "Fuel"
Song 12: "...And Justice for All"
Song 13: "Welcome Home (Sanitarium)"
Song 14: "Wherever I May Roam"
Song 15: "Blackened"
Song 16: "Battery"
Song 17: "The Unforgiven II"
Song 18: "The Four Horsemen"
Song 19: "Whiplash"
Song 20: "Whiskey in the Jar"
Song 21: "Dyers Eve"
Song 22: "Seek & Destroy"
Song 23: "The Unforgiven III"
Song 24: "Eye of the Beholder"
Song 25: "To Live Is to Die"
Song 26: "Turn the Page"
Song 27: "Disposable Heroes"
Song 28: "The Day That Never Comes"
Song 29: "Harvester of Sorrow"
Song 30: "St. Anger"
Song 31: "The Thing That Should Not Be"
Song 32: "King Nothing"
Song 33: "Atlas, Rise!"
Song 34: "72 Seasons"
Song 35: "Don’t Tread on Me"
Song 36: "Orion"
Song

The data from Genius is converted to a dataframe using pandas. 

In [16]:
data.to_csv('Complete.csv', encoding='utf-8', index = False)

In [18]:
data.head()

Unnamed: 0,Artist,Lyrics
0,Metallica,147 ContributorsTranslationsFrançaisPortuguêsD...
1,Metallica,114 ContributorsTranslationsPolskiDeutschPortu...
2,Metallica,138 ContributorsTranslationsPortuguêsNederland...
3,Metallica,152 ContributorsTranslationsPortuguêsEspañolDe...
4,Metallica,94 ContributorsTranslationsDeutschPortuguêsNed...


The data is cleaned like 'verse/intro/chorus/bridge/instrumental' are removed, multiple sentence endings are removed etc. 

In [23]:
def cleanlyrics(dataframe):
    """
    Method to remove special characters in the lyrics column
    
    input:
    dataframe - Data containing artists and Song Lyrics
    
    output:
    dataframe - Cleaned Lyrics
    """
    dataframe['Lyrics'] = dataframe['Lyrics'].str.lower()
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"\d", "", regex=True)  # Remove digits
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"intro|instrumental|chorus|verse|bridge|outro", "", regex=True)  # Remove unwanted words
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"\[", "", regex=True)  # Escape [
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"\]", "", regex=True)  # Escape ]
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"  ", " ", regex=True)  # Remove extra spaces
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"\t", "", regex=True)  # Remove tabs
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(r"[^\w\d'\s]+", "", regex=True)  # Remove non-alphanumeric characters
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace(":", "", regex=True)  # Remove colons
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace("\n \n \n", "\n", regex=True)  # Normalize newlines
    dataframe['Lyrics'] = dataframe['Lyrics'].str.replace("\n", " \n ", regex=True)  # Add space around newlines
    dataframe['Lyrics'] = dataframe['Lyrics'].str.strip()  # Strip leading/trailing spaces

    return dataframe

cleanlyrics(data)

Unnamed: 0,Artist,Lyrics
0,Metallica,contributorstranslationsfrançaisportuguêsdeuts...
1,Metallica,contributorstranslationspolskideutschportuguês...
2,Metallica,contributorstranslationsportuguêsnederlandsdeu...
3,Metallica,contributorstranslationsportuguêsespañoldeutsc...
4,Metallica,contributorstranslationsdeutschportuguêsnederl...
...,...,...
745,Justin-bieber,contributorstranslationsespañolbeen you lyrics...
746,Justin-bieber,contributorstranslationstürkçeespañolportuguês...
747,Justin-bieber,contributorshard face reality lyrics justin bi...
748,Justin-bieber,contributorstranslationstürkçeespañolportuguês...


In [25]:
data.head(5)

Unnamed: 0,Artist,Lyrics
0,Metallica,contributorstranslationsfrançaisportuguêsdeuts...
1,Metallica,contributorstranslationspolskideutschportuguês...
2,Metallica,contributorstranslationsportuguêsnederlandsdeu...
3,Metallica,contributorstranslationsportuguêsespañoldeutsc...
4,Metallica,contributorstranslationsdeutschportuguêsnederl...


The data is now sent to a sentiment analyzer that classifies the lyrics on the basis of positive/negative sentiment. Astonishingly, we had more positive lyrics than negative ones. True numbers are specified in the dataset section of the report. 

In [28]:
def getsentiment(dataframe):
    """
    Method to get sentiment analysis for the Lyrics in the dataframe
    
    input:
    Dataframe - data containing details regarding artists and song lyrics
    
    output:
    Dataframe - data containing details regarding artist, song lyrics and corresponding lyrics sentiments
    
    """
    sentlist = []
    #Initializing the sentiment analyzer
    vader = SentimentIntensityAnalyzer()
    
    #Looping through each row, and based on the Lyrics,
    #the sentiment analyzer scores the lyrics for amount of negative and positive
    for row in range(len(dataframe)):
        sentiment = vader.polarity_scores(dataframe.loc[row, "Lyrics"])
        if sentiment['neg'] > sentiment['pos']:
            sentlist.append(0)
        else:
            sentlist.append(1)
    
    dataframe['Sentiment'] = sentlist


getsentiment(data)

In [30]:
data.groupby(data.Sentiment).nunique()

Unnamed: 0_level_0,Artist,Lyrics
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,15,263
1,15,487


In [32]:
sortedsent = data.groupby(data.Sentiment)

The split data is grouped into two different datasets- positive lyrics dataset and negative lyrics dataset which is used as sample input for our lyric generator. 

In [35]:
#Sorting based on sentiment column value
sortedsentpos = sortedsent.get_group(1)
sortedsentneg = sortedsent.get_group(0)

In [37]:
#Sorted positive sentiments
sortedsentpos

Unnamed: 0,Artist,Lyrics,Sentiment
2,Metallica,contributorstranslationsportuguêsnederlandsdeu...,1
3,Metallica,contributorstranslationsportuguêsespañoldeutsc...,1
8,Metallica,contributorstranslationsportuguêsnederlandssad...,1
11,Metallica,contributorsand justice for all lyrics \n hal...,1
13,Metallica,contributorstranslationsportuguêsnederlandswhe...,1
...,...,...,...
742,Justin-bieber,contributorsboyfriend remix lyrics asher roth ...,1
743,Justin-bieber,contributorstranslationstürkçeespañolportuguês...,1
746,Justin-bieber,contributorstranslationstürkçeespañolportuguês...,1
748,Justin-bieber,contributorstranslationstürkçeespañolportuguês...,1


In [39]:
#Sorted negative sentiments
sortedsentneg

Unnamed: 0,Artist,Lyrics,Sentiment
0,Metallica,contributorstranslationsfrançaisportuguêsdeuts...,0
1,Metallica,contributorstranslationspolskideutschportuguês...,0
4,Metallica,contributorstranslationsdeutschportuguêsnederl...,0
5,Metallica,contributorstranslationstürkçedeutschfade to b...,0
6,Metallica,contributorstranslationsdeutschfor whom the be...,0
...,...,...,...
727,Justin-bieber,contributorsmaria lyrics matt lauer justin bi...,0
728,Justin-bieber,contributorstranslationstürkçeespañolforever l...,0
744,Justin-bieber,contributorstranslationsespañolwe are lyrics j...,0
745,Justin-bieber,contributorstranslationsespañolbeen you lyrics...,0


In [41]:
#Saving the positive and negative sentiment files separately
sortedsentpos.to_csv('positive.csv', encoding='utf-8', index = False)
sortedsentneg.to_csv('negative.csv', encoding='utf-8', index = False)