In [1]:
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
# Load the track list; later cells read its 'track' and 'artist' columns.
my_tracks = pd.read_csv('my_tracks.csv')

In [3]:
# Preview the two columns that are used to build the Genius search queries.
my_tracks[['track','artist']].head()

Unnamed: 0,track,artist
0,DEJA VU - Extended ver.,dave rodgers
1,Dead Man's Party,Oingo Boingo
2,"Bands (feat. Ohgeesy, Fenix Flexin & Master Kato)",Shoreline Mafia
3,Turn Away the Bad Thing,Ceremony
4,Harvey,Her's


In [4]:
def trackClean(string):
    """Strip trailing qualifiers from a track name so it searches cleanly on Genius.

    Everything from the first matching marker to the end of the string is
    removed. Markers are: " (feat", " -", " (Narcos", " (with Dev" and
    " (Version Ska".

    Parameters
    ----------
    string : str
        Raw track title.

    Returns
    -------
    str
        The title with any matched suffix removed; unchanged if nothing matches.
    """
    # Raw string: "\(" and "\-" must reach the regex engine as-is. In a normal
    # string literal they are invalid escape sequences (SyntaxWarning on 3.12+).
    suffix_pattern = r'(( \(feat.*$)|( \-.*$)|( \(Narcos.*$)|( \(with Dev.*$)|( \(Version Ska.*$))'
    return re.sub(suffix_pattern, '', string)

In [5]:
# Series of cleaned track titles and of artist names, index-aligned with my_tracks
artists = my_tracks['artist']
songs = my_tracks['track'].map(trackClean)

In [8]:
def getLyrics(song_title, artist_name, timeout=10):
    
    '''
    PURPOSE | Searches Genius API for matches based on inputted song and artist information.
              If a match is found, function will scrape Genius.com for the songs lyrics
              
    INPUTS  | song_title (str): title of song
              artist_name (str): name of artist
              timeout (float): seconds to wait on each HTTP request (default 10)

    RETURNS | str of raw lyrics text, or np.nan when no search hit matches the
              artist or the song page has no <div class="lyrics"> element

    RAISES  | RuntimeError if the GENIUS_ACCESS_TOKEN environment variable is not set
              requests.HTTPError if the search request returns an error status
    '''
    # The token comes from the environment so that it is never stored in the notebook.
    access_token = os.environ.get('GENIUS_ACCESS_TOKEN')
    if not access_token:
        raise RuntimeError(
            'Set the GENIUS_ACCESS_TOKEN environment variable to a Genius API access token'
        )

    base_url = 'https://api.genius.com'
    search_url = base_url + '/search'
    headers = {'Authorization': 'Bearer ' + access_token}
    # The search term is sent in the query string (?q=...), not in a GET body.
    params = {'q': song_title + ' ' + artist_name}

    # Retry with backoff on connection errors, since many requests are sent in a row.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # Both requests go through the session; otherwise the retry adapter is never used.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        response = session.get(search_url, params=params, headers=headers, timeout=timeout)
        # Fail loudly on auth or rate-limit errors instead of a KeyError further down.
        response.raise_for_status()
        hits = response.json()['response']['hits']

        # First hit whose primary artist name contains the requested artist (case-insensitive).
        wanted_artist = artist_name.lower()
        remote_song_info = None
        for hit in hits:
            if wanted_artist in hit['result']['primary_artist']['name'].lower():
                remote_song_info = hit
                break

        if remote_song_info is None:
            return np.nan

        # Scrape the lyrics from the song's Genius page.
        song_url = remote_song_info['result']['url']
        page = session.get(song_url, timeout=timeout)

    html = BeautifulSoup(page.text, 'html.parser')
    lyrics_div = html.find('div', class_='lyrics')

    # find() returns None when the page has no such element; report "no lyrics"
    # rather than raising AttributeError.
    if lyrics_div is None:
        return np.nan

    return lyrics_div.get_text()

In [9]:
# One getLyrics result per (track, artist) pair, in the same order as my_tracks
lyrics = [getLyrics(song, artist) for song, artist in zip(songs, artists)]

In [11]:
# Normalise the scraped lyrics in place. Entries that are not strings
# (np.nan for songs without a match) are left untouched.
for ind, val in enumerate(lyrics):

    if not isinstance(val, str):
        continue

    #removing special characters and info within brackets from lyrics
    #(literal backslash-n / backslash-u sequences and [bracketed] text).
    #Raw strings keep \d, \[ and \] as regex escapes instead of invalid
    #Python string escapes.
    val = re.sub(r'((\\n)|(\\u.+?\d+)|(\[.+?\]))', '', val)

    #real newlines become spaces so each song is one line of text
    lyrics[ind] = re.sub(r'\n', ' ', val).strip()

In [12]:
# Spot-check the cleaned lyrics at index 6.
lyrics[6]

'Yo soy un hombre sincero de onde las calles Y creiro gritar mis versos del alma Hasta La muerte Bebi suficiente palos dos Olvidar nuestro pasado y dejar estos erros astras Recuerdos en la botella Es una cosa que nunca he dicho que estoy Verdaderamente felix en mi alma y mi cabeza Un higado suspendidod en liquido Hasta la muerte Botellas en el aire, lado ah lado Hasta la muerte Te amo, hermanos Son noches como estas, que me levanto a los pies a la tierra Recordando esa sonrisa, no estoy tan solo Hasta la muerte Botellas en el aire, lado ah lado Hasta la muerte, te amo, harmanos Esta es mi vida y me vale lo que peinsan la gente Para mi gente, botellas en el aire Te amo, a toda madre o un desmadre Son noches como estas, que me levanto la cabeza y sourio Me olvido de los recuerdos que tocan como disco rayado Hasta la muerte Botellas en el aire, lado ah lado Hasta la muerte Te amo, hermanos'

In [13]:
# Assemble original track titles, artists and cleaned lyrics into one frame.
# The index comes from the my_tracks Series; `lyrics` is a list of matching length.
my_lyrics = pd.DataFrame({
    'track': my_tracks['track'],
    'artist': artists,
    'lyrics': lyrics,
})


In [21]:
# Drop every row that has a missing value (tracks for which no lyrics were found).
my_lyrics = my_lyrics.dropna()

In [23]:
# Renumber rows 0..n-1 and discard the old index labels.
my_lyrics = my_lyrics.reset_index(drop=True)

In [24]:
# Preview the first rows of the final track / artist / lyrics table.
my_lyrics.head()

Unnamed: 0,track,artist,lyrics
0,DEJA VU - Extended ver.,dave rodgers,See your body into the moonlight Even if I try...
1,Dead Man's Party,Oingo Boingo,I'm all dressed up with nowhere to go Walkin' ...
2,"Bands (feat. Ohgeesy, Fenix Flexin & Master Kato)",Shoreline Mafia,"AceTheFace This ain't a Milly Rock, this a m..."
3,Turn Away the Bad Thing,Ceremony,I'll say It's getting harder for me to be al...
4,No One Lives Forever,Oingo Boingo,You worry too much You make yourself sad You c...


In [25]:
# Save the result; index=False keeps the row index out of the CSV.
my_lyrics.to_csv('my_lyrics.csv', index = False)