In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bs4
import requests
import warnings
import re
import html
import unidecode

from matplotlib import rcParams

rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = 'arial'

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
blm_playlist = pd.read_csv('BLM.csv')
blm_playlist

Unnamed: 0,song_name,artist_name,song_type
0,ye,burna boy,hip-hop
1,ole,tiwa savage,hip-hop
2,boxed in,sainvil,r&b
3,sorrow tears and blood,fela kuti,r&b
4,no more teardrops,vic mensa,hip-hop
5,by any means,jorja smith,r&b
6,black woman,emanuel (cellulardada),r&b
7,"to be young, gifted and black",donny hathaway,r&b
8,dangote,burna boy,hip-hop
9,somethings gotta give,sonna rele,r&b


In [3]:
# categorise songs based on their genres
blm_hiphop = blm_playlist[blm_playlist['song_type']=='hip-hop']
blm_rb = blm_playlist[blm_playlist['song_type']=='r&b']

In [4]:
blm_hiphop

Unnamed: 0,song_name,artist_name,song_type
0,ye,burna boy,hip-hop
1,ole,tiwa savage,hip-hop
4,no more teardrops,vic mensa,hip-hop
8,dangote,burna boy,hip-hop
12,i need you to,tobe nwigwe,hip-hop
13,this is america,childish gambino,hip-hop
14,alright,kendrick lamar,hip-hop
16,the bigger picture,lil baby,hip-hop
17,they dont care,lloyd,hip-hop
18,dont shoot,the game,hip-hop


In [5]:
# clean song titles and artist names
def clean_song(x):
    x = x.replace('-', ' ')
    x = re.sub(r'[^\w\s]','', x)
    x = unidecode.unidecode(x)
    return x

blm_hiphop[['song_name']]\
    = blm_hiphop[['song_name']].applymap(
        lambda x: x.replace('&', 'and'))

blm_hiphop[['song_name', 'artist_name']]\
    = blm_hiphop[['song_name', 'artist_name']].applymap(clean_song)

In [6]:
lyrics_list = list() # collector for lyric strings or None if no lyrics found
source_list = list() # collector for the URL from which lyrics were obtained

for index, row in blm_hiphop.iterrows():
    artist = row['artist_name']
    song = row['song_name']
    source = None
    
    # generate URLs for each website, using dashes/underscores in place of whitespaces
    urls = list()
    metro_url = 'http://metrolyrics.com/{}-lyrics-{}.html'.format(
        song.replace(' ', '-'), artist.replace(' ', '-'))
    mode_url = 'http://www.lyricsmode.com/lyrics/{}/{}/{}.html'.format(
        artist[0], artist.replace(' ', '_'), song.replace(' ', '_'))
    genius_url = 'https://genius.com/{}-{}-lyrics'.format(
        artist.replace(' ', '-'), song.replace(' ', '-'))
    urls.extend([genius_url, metro_url, mode_url])

    # lyrics collected as list of tokens
    for url in urls:
        try:
            # request HTML and parse
            html = requests.get(url=url).content
            soup = bs4.BeautifulSoup(html)
            
            # find lyrics and pre-process
            if 'genius.com' in url:
                print(url)
                lyrics = soup\
                    .find('div', {'class': 'lyrics'})\
                    .find('p')\
                    .findAll(text=True)
                if lyrics is None:
                    raise Exception
                source = 'genius.com'
            
            if 'lyricsmode' in url:
                print(url)
                lyrics = soup.find('p', {'id': 'lyrics_text'})
                if lyrics is None:
                    raise Exception
                lyrics = lyrics.text.split()
                source = 'lyricsmode.com'
            if 'metrolyrics' in url:
                print(url)
                lyrics = soup.find('div', {'id': 'lyrics-body-text'})
                if lyrics is None:
                    raise Exception
                lyrics = [line.findAll(text=True)\
                          for line in soup.findAll('p', {'class': 'verse'})]
                lyrics = [item for sublist in lyrics for item in sublist]
                source = 'metrolyrics.com'
            
            break
        except:
            print('error')
    lyrics_list.append(lyrics)
    source_list.append(source)

https://genius.com/burna-boy-ye-lyrics
https://genius.com/tiwa-savage-ole-lyrics
https://genius.com/vic-mensa-no-more-teardrops-lyrics
https://genius.com/burna-boy-dangote-lyrics
https://genius.com/tobe-nwigwe-i-need-you-to-lyrics
https://genius.com/childish-gambino-this-is-america-lyrics
https://genius.com/kendrick-lamar-alright-lyrics
error
http://metrolyrics.com/alright-lyrics-kendrick-lamar.html
https://genius.com/lil-baby-the-bigger-picture-lyrics
https://genius.com/lloyd-they-dont-care-lyrics
error
http://metrolyrics.com/they-dont-care-lyrics-lloyd.html
https://genius.com/the-game-dont-shoot-lyrics
https://genius.com/anderson-paak-lockdown-lyrics
error
http://metrolyrics.com/lockdown-lyrics-anderson-paak.html
https://genius.com/usher-i-cry-lyrics
https://genius.com/wale-june-5th-queenzngodz-lyrics
https://genius.com/buddy-black-lyrics
error
http://metrolyrics.com/black-lyrics-buddy.html
https://genius.com/tee-grizzley-mr-officer-lyrics
https://genius.com/meek-mill-otherside-of-am

In [7]:
def clean_lyrics(lyrics):
    if lyrics is None:
        return lyrics
    
    # combine lists of tokens into single string
    lyrics = ' '.join(lyrics)
            
    # remove apostrophes
    lyrics = lyrics.replace('\'', '')
            
    # remove song structure tags or instructions in brackets
    lyrics = re.sub(r'[\*\[|\(|\{].*\n*.*[\]\)\}\*]' , ' ', lyrics)
            
    # replace all punctuations with spaces
    lyrics = re.sub(r'[^\w\s]', ' ', lyrics)
            
    # replace consecutive whitespaces with single space
    lyrics = re.sub(r'\s+', ' ', lyrics)
    
    # convert all tokens to lowercase
    lyrics = lyrics.lower()
    
    return lyrics

In [8]:
# clean and get lyrics
lyrics_list_clean = list(map(clean_lyrics, lyrics_list))
blm_hiphop['lyrics'] = lyrics_list_clean
lyrics = blm_hiphop['lyrics'].dropna()
lyrics

0      yeah my nigga die for nothing ah my nigga die...
1     intro ole ole ole ole ole ole ole ajibole ole ...
4      no more teardrops no more teardrops no more t...
8      dangote dangote dangote still dey find money ...
12     i need you to arrest the killers of breonna t...
13     yeah yeah yeah yeah yeah yeah yeah yeah go go...
14    alls my life i has to fight nigga alls my life...
16     last night people protesting in minneapolis e...
17                     and i fell they dont care enough
18     our lord grant us good in this world and good...
19    you shouldve been downtown we thought it was a...
21     hey yeah yeah yeah yeah yeah yeah yeah hey ye...
22     yeah look look might as well loop that lil bi...
23    black black black black black on black black y...
27     helluva made this beat baby mister officer mi...
28    what do you have to lose youre living in pover...
29     just when i thought it wouldnt get no sicker ...
30     here we go luda the worlds screwed and ni

In [9]:
blm_hiphop

Unnamed: 0,song_name,artist_name,song_type,lyrics
0,ye,burna boy,hip-hop,yeah my nigga die for nothing ah my nigga die...
1,ole,tiwa savage,hip-hop,intro ole ole ole ole ole ole ole ajibole ole ...
4,no more teardrops,vic mensa,hip-hop,no more teardrops no more teardrops no more t...
8,dangote,burna boy,hip-hop,dangote dangote dangote still dey find money ...
12,i need you to,tobe nwigwe,hip-hop,i need you to arrest the killers of breonna t...
13,this is america,childish gambino,hip-hop,yeah yeah yeah yeah yeah yeah yeah yeah go go...
14,alright,kendrick lamar,hip-hop,alls my life i has to fight nigga alls my life...
16,the bigger picture,lil baby,hip-hop,last night people protesting in minneapolis e...
17,they dont care,lloyd,hip-hop,and i fell they dont care enough
18,dont shoot,the game,hip-hop,our lord grant us good in this world and good...


In [10]:
blm_hiphop.columns

Index(['song_name', 'artist_name', 'song_type', 'lyrics'], dtype='object')

In [11]:
# remove other elements, only lyrics are remianed
to_drop = ['song_name',
           'artist_name',
           'song_type']
blm_hiphop.drop(columns=to_drop, inplace=True) 

In [12]:
# save as csv file
blm_hiphop.to_csv('blm_hiphop_lyricsonly.csv', index=True)

In [13]:
blm_rb

Unnamed: 0,song_name,artist_name,song_type
2,boxed in,sainvil,r&b
3,sorrow tears and blood,fela kuti,r&b
5,by any means,jorja smith,r&b
6,black woman,emanuel (cellulardada),r&b
7,"to be young, gifted and black",donny hathaway,r&b
9,somethings gotta give,sonna rele,r&b
10,real black,"kevin ross, trevor jackson and jacob latimore",r&b
11,2000 blacks got to be free,fela kuti,r&b
15,black parade,beyonce,r&b
20,i cant breathe,h.e.r,r&b


In [52]:
# clean song titles and artist names
def clean_song(x):
    x = x.replace('-', ' ')
    x = re.sub(r'[^\w\s]','', x)
    x = unidecode.unidecode(x)
    return x

blm_rb[['song_name']]\
    = blm_rb[['song_name']].applymap(
        lambda x: x.replace('&', 'and'))

blm_rb[['song_name', 'artist_name']]\
    = blm_rb[['song_name', 'artist_name']].applymap(clean_song)

In [53]:
lyrics_list = list() # collector for lyric strings or None if no lyrics found
source_list = list() # collector for the URL from which lyrics were obtained

for index, row in blm_rb.iterrows():
    artist = row['artist_name']
    song = row['song_name']
    source = None
    
    # generate URLs for each website, using dashes/underscores in place of whitespaces
    urls = list()
    metro_url = 'http://metrolyrics.com/{}-lyrics-{}.html'.format(
        song.replace(' ', '-'), artist.replace(' ', '-'))
    mode_url = 'http://www.lyricsmode.com/lyrics/{}/{}/{}.html'.format(
        artist[0], artist.replace(' ', '_'), song.replace(' ', '_'))
    genius_url = 'https://genius.com/{}-{}-lyrics'.format(
        artist.replace(' ', '-'), song.replace(' ', '-'))
    urls.extend([genius_url, mode_url, metro_url,])

    # lyrics collected as list of tokens
    for url in urls:
        try:
            # request HTML and parse
            html = requests.get(url=url).content
            soup = bs4.BeautifulSoup(html)
            
            # find lyrics and pre-process
            if 'genius.com' in url:
                print(url)
                lyrics = soup\
                    .find('div', {'class': 'lyrics'})\
                    .find('p')\
                    .findAll(text=True)
                if lyrics is None:
                    raise Exception
                source = 'genius.com'
            
            if 'lyricsmode' in url:
                print(url)
                lyrics = soup.find('p', {'id': 'lyrics_text'})
                if lyrics is None:
                    raise Exception
                lyrics = lyrics.text.split()
                source = 'lyricsmode.com'
            if 'metrolyrics' in url:
                print(url)
                lyrics = soup.find('div', {'id': 'lyrics-body-text'})
                if lyrics is None:
                    raise Exception
                lyrics = [line.findAll(text=True)\
                          for line in soup.findAll('p', {'class': 'verse'})]
                lyrics = [item for sublist in lyrics for item in sublist]
                source = 'metrolyrics.com'
            
            break
        except:
            print('error')
    lyrics_list.append(lyrics)
    source_list.append(source)

https://genius.com/sainvil-boxed-in-lyrics
https://genius.com/fela-kuti-sorrow-tears-and-blood-lyrics
https://genius.com/jorja-smith-by-any-means-lyrics
https://genius.com/emanuel-cellulardada-black-woman-lyrics
https://genius.com/donny-hathaway-to-be-young-gifted-and-black-lyrics
https://genius.com/sonna-rele-somethings-gotta-give-lyrics
https://genius.com/kevin-ross-trevor-jackson-and-jacob-latimore-real-black-lyrics
https://genius.com/fela-kuti-2000-blacks-got-to-be-free-lyrics
error
http://www.lyricsmode.com/lyrics/f/fela_kuti/2000_blacks_got_to_be_free.html
error
http://metrolyrics.com/2000-blacks-got-to-be-free-lyrics-fela-kuti.html
https://genius.com/beyonce-black-parade-lyrics
error
http://www.lyricsmode.com/lyrics/b/beyonce/black_parade.html
error
http://metrolyrics.com/black-parade-lyrics-beyonce.html
https://genius.com/her-i-cant-breathe-lyrics
error
http://www.lyricsmode.com/lyrics/h/her/i_cant_breathe.html
error
http://metrolyrics.com/i-cant-breathe-lyrics-her.html
https:/

In [54]:
# clean and get lyrics
lyrics_list_clean = list(map(clean_lyrics, lyrics_list))
blm_rb['lyrics'] = lyrics_list_clean
lyrics = blm_rb['lyrics'].dropna()
lyrics

2      they say the youth is so lost when the shape ...
3      eh ya everybody run run run eh ya everybody s...
5      so go ahead and fix your crown then watch it ...
6      cocoa butter kisses wipe my tears away i wann...
7      to be young gifted and black oh what a lovely...
9      i cant breathe i cant breathe my voice tired ...
10     yeah check it yeah real black tryna let the n...
11    podría invitarte a un restaurante que no sea m...
15     im goin back to the south im goin back back b...
20    started a war screaming peace at the same time...
24    james brown now we demand a chance to do thing...
25    tryna rain tryna rain on the thunder tell the ...
26     and we just tryna make a way stomach hungry k...
33    you will not be able to stay home brother you ...
34    i well well well well well well well well well...
37     i just wanna live god protect me im a young b...
38     hmm i just gotta get some things off my chest...
Name: lyrics, dtype: object

In [56]:
blm_rb.columns

Index(['song_name', 'artist_name', 'song_type', 'lyrics'], dtype='object')

In [57]:
# remove other elements, only lyrics are remianed
to_drop = ['song_name',
           'artist_name',
           'song_type']
blm_rb.drop(columns=to_drop, inplace=True) 

In [58]:
blm_rb.to_csv('blm_rb_lyricsonly.csv', index=True)