# Web Scraping

Environment: Ubuntu (Linux) / macOS, Jupyter Notebook, Python 3
Copyright: Xiaoyi (Sherry) Zhu

In [1]:
# import modules
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import re

import csv
import os

from nltk.corpus import stopwords

from selenium  import webdriver
import selenium.webdriver
from urllib2 import urlopen
#from webdriver_manager.chrome import ChromeDriverManager
#driver = webdriver.Chrome()
#driver = webdriver.Chrome(ChromeDriverManager().install())


# making web requests
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds handed to ``requests.get`` as its ``timeout`` argument, so a
        server that stops responding cannot block the caller forever.

    Returns
    -------
    The raw response body (``resp.content``) when `is_good_response`
    accepts the response, otherwise None. A ``RequestException`` (which
    includes timeouts) is reported through `log_error` and also yields None.
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    with no Content-Type header is treated as not HTML instead of raising
    KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers an explicit None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error message.

    The message is simply printed to standard output; swap the body for
    a real logger if something more durable is needed.
    """
    print(e)


# [function] find a specific string/pattern between two expressions

def find_between(s, first, last):
    """
    Return the part of string `s` that lies between the first occurrence
    of `first` and the next occurrence of `last` after it.

    Returns '' when either marker cannot be found.
    """
    head = s.find(first)
    if head == -1:
        return ''
    begin = head + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ''
    return s[begin:stop]

def find_between_r(s, first, last):
    """
    Right-to-left variant of `find_between`: return the part of string `s`
    between the last occurrence of `first` and the last occurrence of
    `last` located after it.

    Returns '' when either marker cannot be found.
    """
    head = s.rfind(first)
    if head == -1:
        return ''
    begin = head + len(first)
    stop = s.rfind(last, begin)
    if stop == -1:
        return ''
    return s[begin:stop]
    

# [function] remove special characters, keeping only letters, digits, hyphens and spaces
def removeSpecial(text):
    """
    Strip special characters from `text`.

    A character is kept when it is numeric, is a cased letter (its lower-
    and upper-case forms differ), is a hyphen "-", or is a space. Every
    other character (punctuation, symbols, other whitespace) is dropped.

    Returns the filtered string; an empty input gives ''.
    """
    # join() builds the result in one pass instead of repeated `+=`.
    return ''.join(
        ch for ch in text
        if ch.isnumeric() or ch.lower() != ch.upper() or ch == '-' or ch == ' '
    )


In [2]:
# Environment setup: extend the import path and export two HSRL_* variables.
# NOTE(review): no other cell visible in this notebook reads these settings;
# confirm whether this cell is still needed.
import os
import os.path
import sys

os.environ.update({
    'HSRL_INSTRUMENT': 'gvhsrl',
    'HSRL_CONFIG': os.path.expanduser('~/hsrl_config'),
})
sys.path.append(os.path.expanduser('~/code/eol_hsrl_python'))

## Part 1

Get the rank, song, and artist of every Billboard Year-End Hot 100 single from Wikipedia, for the years 1959–2018.

In [None]:
# Fetch the index page that links to each year's Hot 100 list. Doing it here
# (rather than relying on a leftover `raw_html` variable) lets the cell run
# on a fresh kernel.
raw_html = simple_get('https://en.wikipedia.org/wiki/Billboard_Year-End') #base url
if raw_html is None:
    raise RuntimeError('Could not retrieve the Billboard Year-End index page')
html = BeautifulSoup(raw_html, 'html.parser')

year_frames = []  # one DataFrame per scraped year, concatenated once at the end
counter = -1      # position of the current year link; year = counter + 1959

# get links for Hot 100 music, retrieve 'Title' and 'Artist(s)'
for link in html.find_all('a'):
    if '/wiki/Billboard_Year-End_Hot_100_singles_of_' not in str(link):
        continue

    # Count the link before any skip below so the year offset stays aligned.
    counter += 1
    sub_url = 'https://en.wikipedia.org' + str(link.get('href')) #sub url for each year
    sub_raw_html = simple_get(sub_url)
    if sub_raw_html is None:
        log_error('Skipping {0}: page not retrieved'.format(sub_url))
        continue
    sub_html = BeautifulSoup(sub_raw_html, 'html.parser')

    table = sub_html.find('table', attrs={'class':'wikitable sortable'})
    if table is None:
        log_error('Skipping {0}: no sortable wikitable found'.format(sub_url))
        continue

    artists = []
    songs = []
    rank_ind = 0

    # Cell counter within a row: a cell is taken as the song when the counter
    # reaches 2 and as the artist when it reaches 3. Starting at 0 skips one
    # leading <td> per row; from the 24th link onward the counter starts at 1,
    # so each row is read as just two <td> cells (song, artist).
    row_start = 1 if counter >= 23 else 0
    same_rank = row_start

    for items in table.find_all('td'):
        same_rank += 1 # deal with the instance with multiple artists

        # NOTE(review): on Python 3, str() of the encoded bytes yields the
        # "b'...'" repr for cells without a link - confirm target interpreter.
        s = ''.join(str(child.encode('utf-8')) for child in items.contents)
        if '</a>' in s:
            s = find_between(s, '>', '</a>')

        if same_rank == 2:
            rank_ind += 1
            songs.append(s)
        elif same_rank == 3:
            artists.append(s)
            same_rank = row_start

    # create a dataset for rank, song, artist
    year_frames.append(pd.DataFrame({'rank': list(range(1,rank_ind+1)),
                                     'song': songs,
                                     'artist': artists,
                                     'year':[counter+1959]*rank_ind}))

# Build the combined frame once; DataFrame.append inside the loop was
# quadratic and no longer exists in pandas 2.x.
if year_frames:
    df = pd.concat(year_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['rank', 'song', 'artist', 'year'])

# output wiki dataset
df.to_csv('data/wiki1959-2018.csv',index=False)


## Part 2
Get lyrics from https://genius.com

In [None]:
# [Round 1]

# eg., https://genius.com/Bobby-darin-dream-lover-lyrics

# print current directory
print('Path at terminal when executing this file')
print(os.getcwd() + "\n")

# Read the csv written by Part 1 (it is saved under data/), and keep the
# original dataset untouched as wiki_dat.
# NOTE(review): the file is read as ISO-8859-1; confirm this matches the
# encoding used when it was written.
wiki_dat = pd.read_csv('data/wiki1959-2018.csv', encoding='ISO-8859-1')

# Work on a copy so the columns added below do not leak into wiki_dat.
df = wiki_dat.copy()

# create row index (1-based observation number)
df['obsnum'] = df.index + 1

# create url by concatenating artist name and song name:
# "<artist> <song>" with special characters removed, spaces turned into
# hyphens, lower-cased, and suffixed with "-lyrics"
df['url'] = df.apply(lambda row: ('https://genius.com/'+removeSpecial(row.artist+' '+row.song)+'-lyrics').replace(' ','-').lower(), axis=1)

df.sample(20)


In [2]:
# loop through URLs to get lyrics
start = 0
end = 6001

# Never index past the last row if the frame is shorter than `end`
# (row labels are the default 0..n-1 index produced by read_csv).
last = min(end, len(df))

for i in range(start, last):
    lyrics_url=df.at[i,'url']
    try:
        lyrics_raw_html = simple_get(lyrics_url)
        lyrics_html = BeautifulSoup(lyrics_raw_html,'html.parser')
        df.at[i,'lyrics'] = str(lyrics_html.find('div', attrs={'class':'lyrics'}).text) # only pull down text between nodes

    except Exception:
        # Best effort: a page that is missing or has no lyrics <div> is
        # recorded with a marker. `Exception` (not a bare except) keeps
        # Ctrl-C / kernel interrupt able to stop this long loop.
        df.at[i,'lyrics'] = 'url not retrieved'

df.to_csv('data/lyrics_final.csv',index=False) #634/6000=10.6%

# spot-check one retrieved song
print(df.at[5611,'lyrics'])



Kid Ink, egh
I think I found a genie in a bottle baby, with a bottle shape
And she don't have no problem representing for the team

Ok it's feeling like I just hit the lotto, baby
32-26-34, wasted
Watch her do a dance then it get me in the game
Going hard in the paint what's my name what's my name girl
You know, I be missing that bom-bom when I'm away
I swear, when you shaking them pom-poms all I can say is

Oh I think that I found myself a cheerleader
She is always right there when I need her
Oh I think that I found myself a cheerleader
She is always right there when I need her

When I need motivation
My one solution is my queen
Cause she' stay strong (yeah yeah)
She is always in my corner
Right there when I want her
All these other girls are tempting
But I'm empty when you're gone
And they say

Do you need me
Do you think I'm pretty
Do I make you feel like cheating?
And I'm like no, not really cause

Oh I think that I found myself a cheerleader
She 

In [None]:
# [Round 2] Deal with multiple singers


## Part 3
Get the genre (song type) of each song from its Wikipedia page

In [3]:
# Example sources for a song's genre:
#   https://www.google.com/search?q=7+rings+ariana+grande+genre
#   https://en.wikipedia.org/wiki/The_Battle_of_New_Orleans

# Load the lyrics dataset; `lyrics_dat` and `df` refer to the same frame.
df = lyrics_dat = pd.read_csv('data/lyrics_final.csv', encoding='ISO-8859-1')

In [4]:
# Build each song's Wikipedia URL: strip special characters from the title,
# prefix the wiki base address, and turn spaces into underscores.
df['wikiURL'] = [
    ('https://en.wikipedia.org/wiki/' + removeSpecial(title)).replace(' ', '_')
    for title in df['song']
]
df.sample(5)

Unnamed: 0,artist,rank,song,year,obsnum,url,lyrics,wikiURL
5340,Jay-Z,40,Niggas in Paris,2012,5341,https://genius.com/jay-z-niggas-in-paris-lyrics,url not retrieved,https://en.wikipedia.org/wiki/Niggas_in_Paris
2141,The Pretenders,41,Brass in Pocket,1980,2142,https://genius.com/the-pretenders-brass-in-poc...,\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\r\nGot brass ...,https://en.wikipedia.org/wiki/Brass_in_Pocket
5997,Rae Sremmurd,97,Powerglide,2018,5998,https://genius.com/rae-sremmurd-powerglide-lyrics,\r\r\n\r\r\n[Intro: Swae Lee & Juicy J]\r\r\nY...,https://en.wikipedia.org/wiki/Powerglide
1262,Marvin Gaye,62,Mercy Mercy Me (The Ecology),1971,1263,https://genius.com/marvin-gaye-mercy-mercy-me-...,\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\r\r\n[Verse...,https://en.wikipedia.org/wiki/Mercy_Mercy_Me_T...
1634,America,34,Sister Golden Hair,1975,1635,https://genius.com/america-sister-golden-hair-...,\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r\r\r\nWell I...,https://en.wikipedia.org/wiki/Sister_Golden_Hair


In [5]:
# Look up the genre of the first `end` songs on their Wikipedia pages.
start = 0
end = 100

# Never index past the last row if the frame is shorter than `end`.
last = min(end, len(df))

for i in range(start, last):
    url=str(df.at[i,'wikiURL'])
    try:
        raw_html = simple_get(url)
        html = BeautifulSoup(raw_html,'html.parser')
        # the genre is read from the <td class="category hlist"> cell
        df.at[i,'genre'] = str(html.find('td', attrs={'class':'category hlist'}).text)

    except Exception:
        # Best effort: a page that is missing or has no genre cell is
        # recorded with a marker. `Exception` (not a bare except) keeps
        # Ctrl-C / kernel interrupt able to stop the loop.
        df.at[i,'genre'] = 'wiki page not retrieved'

In [8]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,artist,rank,song,year,obsnum,url,lyrics,wikiURL,genre
0,Johnny Horton,1,The Battle of New Orleans,1959,1,https://genius.com/johnny-horton-the-battle-of...,\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r...,https://en.wikipedia.org/wiki/The_Battle_of_Ne...,Country
1,Bobby Darin,2,Mack the Knife,1959,2,https://genius.com/bobby-darin-mack-the-knife-...,\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r...,https://en.wikipedia.org/wiki/Mack_the_Knife,Moritat
2,Lloyd Price,3,Personality,1959,3,https://genius.com/lloyd-price-personality-lyrics,\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r...,https://en.wikipedia.org/wiki/Personality,wiki page not retrieved
3,Frankie Avalon,4,Venus,1959,4,https://genius.com/frankie-avalon-venus-lyrics,\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r...,https://en.wikipedia.org/wiki/Venus,wiki page not retrieved
4,Paul Anka,5,Lonely Boy,1959,5,https://genius.com/paul-anka-lonely-boy-lyrics,\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\n\r\r\r\r\r\r\r...,https://en.wikipedia.org/wiki/Lonely_Boy,wiki page not retrieved


## Data Cleaning

In [10]:
# Load the scraped lyrics; keep the raw frame as tm_dat and clean a copy.
tm_dat = pd.read_csv('data/lyrics_final.csv', encoding='ISO-8859-1')
df = tm_dat.copy()

# Drop the section keywords, then any brackets left empty by that removal.
# The .str accessor leaves missing lyrics (NaN) untouched, whereas calling
# re.sub on a NaN value raises TypeError.
# NOTE(review): only these four keywords are removed, so headers such as
# "[Intro: <name>]" still leave "[: <name>]" behind - confirm if intended.
df['lyrics'] = (
    df['lyrics']
    .str.replace('Hook|Verse 1|Intro|Chorus', '', regex=True)
    .str.replace('[]', '', regex=False)
)
df.sample(5)

Unnamed: 0,artist,rank,song,year,obsnum,url,lyrics
2206,Kool &amp; the Gang,6,Celebration,1981,2207,https://genius.com/kool-amp-the-gang-celebrati...,url not retrieved
2972,Daryl Hall &amp; John Oates,72,Everything Your Heart Desires,1988,2973,https://genius.com/daryl-hall-amp-john-oates-e...,url not retrieved
3556,Luther Vandross,56,Endless Love,1994,3557,https://genius.com/luther-vandross-endless-lov...,\r\r\r\r\r\r\n\r\r\r\r\r\r\n[: Luther Vandross...
4018,Sarah McLachlan,18,Angel,1999,4019,https://genius.com/sarah-mclachlan-angel-lyrics,\r\r\r\r\r\n\r\r\r\r\r\nSpend all your time wa...
4858,Bone Thugs-n-Harmony,58,I Tried,2007,4859,https://genius.com/bone-thugs-n-harmony-i-trie...,\r\r\r\r\n\r\r\r\r\n[: Wish Bone]\r\r\r\r\nBon...


In [None]:
# To-do:

# 1. Handle songs with more than one singer (artist names joined by "and" or "&amp;")

## Source:

<1> https://realpython.com/python-web-scraping-practical-introduction

<2> https://stackoverflow.com/questions/3368969/find-string-between-two-substrings
