In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bs4
import requests
import warnings
import re
import html
import unidecode

from matplotlib import rcParams

# figure text uses the sans-serif family, with Arial as the sans-serif font
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = 'arial'

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# NOTE(review): this suppresses every warning for the rest of the session,
# which also hides pandas' SettingWithCopyWarning
warnings.filterwarnings('ignore')

In [2]:
# load the non-BLM playlist from CSV and display it
nonblm_playlist = pd.read_csv('non-BLM.csv')
nonblm_playlist

Unnamed: 0,song_name,artist_name,song_type
0,thank god i found you,mariah carey,r&b
1,stutter,joe,r&b
2,go on girl,ne-yo,r&b
3,stingy,ginuwine,r&b
4,that girl,marques houston,r&b
5,talkin' to me,amerie,r&b
6,i should have cheated,keyshia cole,r&b
7,he is,brandy,r&b
8,shake it off,mariah carey,r&b
9,into you,fabolous,r&b


In [3]:
# categorise songs based on their genres
# .copy() makes each genre frame independent of nonblm_playlist, so the column
# assignments and drops applied to these frames later operate on real frames
# rather than on slices of the playlist
nonblm_hiphop = nonblm_playlist[nonblm_playlist['song_type']=='hip-hop'].copy()
nonblm_rb = nonblm_playlist[nonblm_playlist['song_type']=='r&b'].copy()

In [4]:
# display the hip-hop subset
nonblm_hiphop

Unnamed: 0,song_name,artist_name,song_type
19,beautiful,snoop dogg,hip-hop
20,how we do,the game,hip-hop
21,like a pimp,david banner,hip-hop
22,oh boy,camron,hip-hop
23,it wasn't me,shaggy,hip-hop
24,party up,dmx,hip-hop
25,country grammar (hot shit),nelly,hip-hop
26,in da club,50 cent,hip-hop
27,what's your fantasy,ludacris,hip-hop
28,shake ya tailfeather,murphy lee,hip-hop


In [5]:
# clean song titles and artist names
def clean_song(x):
    """Normalise a song title or artist name.

    Hyphens are turned into spaces, every character that is neither a word
    character nor whitespace is deleted, and the result is passed through
    ``unidecode.unidecode``.
    """
    dehyphenated = ' '.join(x.split('-'))
    without_punctuation = re.compile(r'[^\w\s]').sub('', dehyphenated)
    return unidecode.unidecode(without_punctuation)

# spell out '&' as 'and' in song titles before punctuation is stripped
nonblm_hiphop['song_name'] = nonblm_hiphop['song_name'].map(
    lambda title: title.replace('&', 'and'))

# run clean_song over every title and artist name
nonblm_hiphop[['song_name', 'artist_name']] = (
    nonblm_hiphop[['song_name', 'artist_name']]
    .apply(lambda column: column.map(clean_song)))

In [6]:
lyrics_list = list() # collector for lyric token lists, or None if no lyrics found
source_list = list() # collector for the site from which lyrics were obtained

for index, row in nonblm_hiphop.iterrows():
    artist = row['artist_name']
    song = row['song_name']
    # reset per song: a song that fails on every site must be recorded as
    # None rather than inheriting the previous song's lyrics
    lyrics = None
    source = None

    # generate URLs for each website; they are tried in this order
    metro_url = 'http://metrolyrics.com/{}-lyrics-{}.html'.format(
        song.replace(' ', '-'), artist.replace(' ', '-'))
    # artist[:1] instead of artist[0] so an empty name cannot raise here,
    # outside the try block
    mode_url = 'http://www.lyricsmode.com/lyrics/{}/{}/{}.html'.format(
        artist[:1], artist.replace(' ', '_'), song.replace(' ', '_'))
    genius_url = 'https://genius.com/{}-{}-lyrics'.format(
        artist.replace(' ', '-'), song.replace(' ', '-'))
    urls = [genius_url, metro_url, mode_url]

    # lyrics collected as list of tokens
    for url in urls:
        try:
            # request HTML and parse; the response is bound to `page` so the
            # imported `html` module is not shadowed, and the timeout stops
            # one unresponsive site from hanging the whole loop
            page = requests.get(url=url, timeout=30).content
            soup = bs4.BeautifulSoup(page, 'html.parser')
            print(url)

            # find lyrics and pre-process; `tokens` stays None when the
            # expected markup is missing from the page
            tokens, site = None, None
            if 'genius.com' in url:
                container = soup.find('div', {'class': 'lyrics'})
                paragraph = container.find('p') if container is not None else None
                if paragraph is not None:
                    tokens = paragraph.findAll(text=True)
                    site = 'genius.com'
            elif 'lyricsmode' in url:
                paragraph = soup.find('p', {'id': 'lyrics_text'})
                if paragraph is not None:
                    tokens = paragraph.text.split()
                    site = 'lyricsmode.com'
            elif 'metrolyrics' in url:
                if soup.find('div', {'id': 'lyrics-body-text'}) is not None:
                    verses = soup.findAll('p', {'class': 'verse'})
                    tokens = [text for verse in verses
                              for text in verse.findAll(text=True)]
                    site = 'metrolyrics.com'

            if tokens is None:
                raise LookupError('no lyrics found at ' + url)

            # commit the result only once this site has fully succeeded
            lyrics, source = tokens, site
            break
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the scrape
            print('error')
    lyrics_list.append(lyrics)
    source_list.append(source)

https://genius.com/snoop-dogg-beautiful-lyrics
https://genius.com/the-game-how-we-do-lyrics
https://genius.com/david-banner-like-a-pimp-lyrics
https://genius.com/camron-oh-boy-lyrics
https://genius.com/shaggy-it-wasnt-me-lyrics
https://genius.com/dmx-party-up-lyrics
https://genius.com/nelly-country-grammar-hot-shit-lyrics
https://genius.com/50-cent-in-da-club-lyrics
https://genius.com/ludacris-whats-your-fantasy-lyrics
error
http://metrolyrics.com/whats-your-fantasy-lyrics-ludacris.html
https://genius.com/murphy-lee-shake-ya-tailfeather-lyrics
https://genius.com/camron-hey-ma-lyrics
error
http://metrolyrics.com/hey-ma-lyrics-camron.html
https://genius.com/snoop-dogg-drop-it-like-its-hot-lyrics
error
http://metrolyrics.com/drop-it-like-its-hot-lyrics-snoop-dogg.html
https://genius.com/ja-rule-always-on-time-lyrics
error
http://metrolyrics.com/always-on-time-lyrics-ja-rule.html
https://genius.com/g-unit-wanna-get-to-know-you-lyrics
https://genius.com/da-entourage-bunny-hop-lyrics
https:/

In [7]:
def clean_lyrics(lyrics):
    """Turn a list of scraped lyric tokens into one normalised string.

    Parameters
    ----------
    lyrics : list of str or None
        Text fragments scraped for one song, or None when nothing was found.

    Returns
    -------
    str or None
        Lower-cased lyrics with apostrophes deleted, bracketed annotations and
        punctuation replaced by spaces, and whitespace runs collapsed to a
        single space. Leading/trailing spaces are not stripped. None is
        passed through unchanged.
    """
    if lyrics is None:
        return lyrics

    # combine lists of tokens into single string
    lyrics = ' '.join(lyrics)

    # remove apostrophes
    lyrics = lyrics.replace('\'', '')

    # remove song structure tags or instructions in brackets
    # The quantifiers are lazy so each tag ends at the NEAREST closing
    # bracket; a greedy match would delete everything between the first
    # opener and the last closer on a line. '|' is not an opener: it is a
    # literal inside a character class, not alternation.
    lyrics = re.sub(r'[\*\[\(\{].*?\n*.*?[\]\)\}\*]', ' ', lyrics)

    # replace all punctuations with spaces
    lyrics = re.sub(r'[^\w\s]', ' ', lyrics)

    # replace consecutive whitespaces with single space
    lyrics = re.sub(r'\s+', ' ', lyrics)

    # convert all tokens to lowercase
    lyrics = lyrics.lower()

    return lyrics

In [8]:
# normalise every scraped token list, attach the result as a 'lyrics' column,
# and show the songs for which lyrics were found
lyrics_list_clean = [clean_lyrics(tokens) for tokens in lyrics_list]
nonblm_hiphop['lyrics'] = lyrics_list_clean
lyrics = nonblm_hiphop['lyrics'].dropna()
lyrics

19     beautiful i just want you to know youre my fa...
20     this is how we do we make a move and act a fo...
21     real girls get down on the flo on the flo rea...
22     oh baby uh killa look at his car look mami im...
23     yo man yo open up man yo what you want man my...
24    uhh uh whoo yall gon make me lose my mind up i...
25     mmmmm im going down down baby your street in ...
26     go go go go go go go shorty its your birthday...
27    ludacris yeah yeah yeah yeah give it to me now...
28     we do it for fun we just do it for fun derrty...
29    whats up lets slide all right all right and we...
30    snooooooop snooooooop when the pimps in the cr...
31    always there when you call always on time gave...
32     i wanna get to know ya i really wanna fuck yo...
33    da da da da da daaa daaa all the hoes with the...
34     where they at where they at where they at whe...
35     rollout rollout rollout rollout rollout rollo...
36     owwwwww yeah my niggas ahah throw your ha

In [9]:
# display the hip-hop subset, which now carries the 'lyrics' column
nonblm_hiphop

Unnamed: 0,song_name,artist_name,song_type,lyrics
19,beautiful,snoop dogg,hip-hop,beautiful i just want you to know youre my fa...
20,how we do,the game,hip-hop,this is how we do we make a move and act a fo...
21,like a pimp,david banner,hip-hop,real girls get down on the flo on the flo rea...
22,oh boy,camron,hip-hop,oh baby uh killa look at his car look mami im...
23,it wasnt me,shaggy,hip-hop,yo man yo open up man yo what you want man my...
24,party up,dmx,hip-hop,uhh uh whoo yall gon make me lose my mind up i...
25,country grammar hot shit,nelly,hip-hop,mmmmm im going down down baby your street in ...
26,in da club,50 cent,hip-hop,go go go go go go go shorty its your birthday...
27,whats your fantasy,ludacris,hip-hop,ludacris yeah yeah yeah yeah give it to me now...
28,shake ya tailfeather,murphy lee,hip-hop,we do it for fun we just do it for fun derrty...


In [10]:
# list the column labels before dropping the non-lyrics columns
nonblm_hiphop.columns

Index(['song_name', 'artist_name', 'song_type', 'lyrics'], dtype='object')

In [11]:
# remove other elements so that only the lyrics remain
to_drop = ['song_name',
           'artist_name',
           'song_type']
# rebind instead of inplace=True, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError
nonblm_hiphop = nonblm_hiphop.drop(columns=to_drop, errors='ignore')

In [12]:
# save as csv file (index=True also writes the row index as the first column)
nonblm_hiphop.to_csv('nonblm_hiphop_lyricsonly.csv', index=True)

In [13]:
# display the r&b subset
nonblm_rb

Unnamed: 0,song_name,artist_name,song_type
0,thank god i found you,mariah carey,r&b
1,stutter,joe,r&b
2,go on girl,ne-yo,r&b
3,stingy,ginuwine,r&b
4,that girl,marques houston,r&b
5,talkin' to me,amerie,r&b
6,i should have cheated,keyshia cole,r&b
7,he is,brandy,r&b
8,shake it off,mariah carey,r&b
9,into you,fabolous,r&b


In [14]:
# clean song titles and artist names
# NOTE(review): this repeats the clean_song definition from the earlier
# hip-hop cell line for line; the redefinition is redundant.
def clean_song(x):
    """Normalise a song title or artist name.

    Hyphens are turned into spaces, every character that is neither a word
    character nor whitespace is deleted, and the result is passed through
    ``unidecode.unidecode``.
    """
    dehyphenated = ' '.join(x.split('-'))
    without_punctuation = re.compile(r'[^\w\s]').sub('', dehyphenated)
    return unidecode.unidecode(without_punctuation)

# spell out '&' as 'and' in song titles before punctuation is stripped
nonblm_rb['song_name'] = nonblm_rb['song_name'].map(
    lambda title: title.replace('&', 'and'))

# run clean_song over every title and artist name
nonblm_rb[['song_name', 'artist_name']] = (
    nonblm_rb[['song_name', 'artist_name']]
    .apply(lambda column: column.map(clean_song)))

In [15]:
lyrics_list = list() # collector for lyric token lists, or None if no lyrics found
source_list = list() # collector for the site from which lyrics were obtained

for index, row in nonblm_rb.iterrows():
    artist = row['artist_name']
    song = row['song_name']
    # reset per song: a song that fails on every site must be recorded as
    # None rather than inheriting the previous song's lyrics
    lyrics = None
    source = None

    # generate URLs for each website; they are tried in this order
    metro_url = 'http://metrolyrics.com/{}-lyrics-{}.html'.format(
        song.replace(' ', '-'), artist.replace(' ', '-'))
    # artist[:1] instead of artist[0] so an empty name cannot raise here,
    # outside the try block
    mode_url = 'http://www.lyricsmode.com/lyrics/{}/{}/{}.html'.format(
        artist[:1], artist.replace(' ', '_'), song.replace(' ', '_'))
    genius_url = 'https://genius.com/{}-{}-lyrics'.format(
        artist.replace(' ', '-'), song.replace(' ', '-'))
    urls = [genius_url, mode_url, metro_url]

    # lyrics collected as list of tokens
    for url in urls:
        try:
            # request HTML and parse; the response is bound to `page` so the
            # imported `html` module is not shadowed, and the timeout stops
            # one unresponsive site from hanging the whole loop
            page = requests.get(url=url, timeout=30).content
            soup = bs4.BeautifulSoup(page, 'html.parser')
            print(url)

            # find lyrics and pre-process; `tokens` stays None when the
            # expected markup is missing from the page
            tokens, site = None, None
            if 'genius.com' in url:
                container = soup.find('div', {'class': 'lyrics'})
                paragraph = container.find('p') if container is not None else None
                if paragraph is not None:
                    tokens = paragraph.findAll(text=True)
                    site = 'genius.com'
            elif 'lyricsmode' in url:
                paragraph = soup.find('p', {'id': 'lyrics_text'})
                if paragraph is not None:
                    tokens = paragraph.text.split()
                    site = 'lyricsmode.com'
            elif 'metrolyrics' in url:
                if soup.find('div', {'id': 'lyrics-body-text'}) is not None:
                    verses = soup.findAll('p', {'class': 'verse'})
                    tokens = [text for verse in verses
                              for text in verse.findAll(text=True)]
                    site = 'metrolyrics.com'

            if tokens is None:
                raise LookupError('no lyrics found at ' + url)

            # commit the result only once this site has fully succeeded
            lyrics, source = tokens, site
            break
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the scrape
            print('error')
    lyrics_list.append(lyrics)
    source_list.append(source)

https://genius.com/mariah-carey-thank-god-i-found-you-lyrics
https://genius.com/joe-stutter-lyrics
error
http://www.lyricsmode.com/lyrics/j/joe/stutter.html
error
http://metrolyrics.com/stutter-lyrics-joe.html
https://genius.com/ne-yo-go-on-girl-lyrics
https://genius.com/ginuwine-stingy-lyrics
error
http://www.lyricsmode.com/lyrics/g/ginuwine/stingy.html
error
http://metrolyrics.com/stingy-lyrics-ginuwine.html
https://genius.com/marques-houston-that-girl-lyrics
error
http://www.lyricsmode.com/lyrics/m/marques_houston/that_girl.html
error
http://metrolyrics.com/that-girl-lyrics-marques-houston.html
https://genius.com/amerie-talkin-to-me-lyrics
error
http://www.lyricsmode.com/lyrics/a/amerie/talkin_to_me.html
error
http://metrolyrics.com/talkin-to-me-lyrics-amerie.html
https://genius.com/keyshia-cole-i-should-have-cheated-lyrics
error
http://www.lyricsmode.com/lyrics/k/keyshia_cole/i_should_have_cheated.html
error
http://metrolyrics.com/i-should-have-cheated-lyrics-keyshia-cole.html
http

In [16]:
# normalise every scraped token list, attach the result as a 'lyrics' column,
# and show the songs for which lyrics were found
lyrics_list_clean = [clean_lyrics(tokens) for tokens in lyrics_list]
nonblm_rb['lyrics'] = lyrics_list_clean
lyrics = nonblm_rb['lyrics'].dropna()
lyrics

0      i would give up everything before id separate...
1     b b baby dont go b b baby dont go b b baby don...
2      i cant get it back but i dont want it back i ...
3     you know what baby i need to tell you somethin...
4      yow can i talk to you for a minute look all i...
5     funny how its hard to find the words sometimes...
6     baby first of all let me say you cant accuse m...
7     its the only explanation to the question at ha...
8      ooh ooh ooh ooh ooh ooh everybody just everyb...
9      desert storm i cant really explain it im so i...
10     maybe this decision was a mistake you probabl...
11    darkchild yeah oh yeah toni braxton oh ha ha u...
12     yeah lets get real comfortable lay back what ...
13     shake it shake it shake it shake it shake it ...
14     i used to think that i wasnt fine enough and ...
15     what you gonna do you wanna get down tell me ...
16    oh mmm i know a place aint nobody cryin aint n...
17     mr big stuff who do you think you are mr 

In [17]:
# display the r&b subset, which now carries the 'lyrics' column
nonblm_rb

Unnamed: 0,song_name,artist_name,song_type,lyrics
0,thank god i found you,mariah carey,r&b,i would give up everything before id separate...
1,stutter,joe,r&b,b b baby dont go b b baby dont go b b baby don...
2,go on girl,ne yo,r&b,i cant get it back but i dont want it back i ...
3,stingy,ginuwine,r&b,you know what baby i need to tell you somethin...
4,that girl,marques houston,r&b,yow can i talk to you for a minute look all i...
5,talkin to me,amerie,r&b,funny how its hard to find the words sometimes...
6,i should have cheated,keyshia cole,r&b,baby first of all let me say you cant accuse m...
7,he is,brandy,r&b,its the only explanation to the question at ha...
8,shake it off,mariah carey,r&b,ooh ooh ooh ooh ooh ooh everybody just everyb...
9,into you,fabolous,r&b,desert storm i cant really explain it im so i...


In [18]:
# list the column labels before dropping the non-lyrics columns
nonblm_rb.columns

Index(['song_name', 'artist_name', 'song_type', 'lyrics'], dtype='object')

In [19]:
# remove other elements so that only the lyrics remain
to_drop = ['song_name',
           'artist_name',
           'song_type']
# rebind instead of inplace=True, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError
nonblm_rb = nonblm_rb.drop(columns=to_drop, errors='ignore')

In [20]:
# save as csv file (index=True also writes the row index as the first column)
nonblm_rb.to_csv('nonblm_rb_lyricsonly.csv', index=True)