# Web Scraping Pipeline for Ganxsta Zolee és a Kartel Lyrics

In [98]:
import requests
from bs4 import BeautifulSoup as bs
import time
from tqdm.notebook import tqdm

def parse_url(url, timeout=30):
  '''
  Downloads a page and returns it parsed as a BeautifulSoup object

  url: address of the page to fetch
  timeout: seconds to wait for the server before giving up

  Raises requests.RequestException on connection problems, timeouts
  and HTTP error statuses (4xx/5xx)
  '''
  # Without a timeout a stalled connection would hang the crawl forever
  response = requests.get(url, timeout=timeout)
  # Fail loudly on error pages instead of parsing their HTML as content
  response.raise_for_status()
  # NOTE: no parser is named, so BeautifulSoup picks the best one
  # installed; kept as-is so the parsed tree does not change
  return bs(response.text)


def get_album_urls(band_url):
  '''
  Returns urls of band's albums

  band_url: url of the band's page listing its albums

  Returns a list of absolute album urls, or None (after printing a
  message) when the page cannot be fetched or parsed
  '''
  try:
    band_soup = parse_url(band_url)
    albums = band_soup.find_all(class_ = 'thumbnail-large-text')
    # Each album entry is expected to hold its link as <p><a href=...>;
    # an entry without it counts as a parse failure
    album_urls = ['http://www.zeneszoveg.hu/' + album.find('p').find('a')['href'] for album in albums]
  except Exception:
    # Exception rather than a bare except, so Ctrl-C still interrupts
    print(f'Cannot parse: {band_url}')
    return None

  return album_urls


def get_track_urls(album_urls):
  '''
  Returns dict of band albums as keys and urls of tracks as values

  album_urls: iterable of album page urls

  An album that cannot be fetched or parsed is reported with a printed
  message and left out of the result; the remaining albums are still
  processed, so one bad page does not throw away the whole crawl
  '''
  track_urls = {}

  for album_url in tqdm(album_urls):
    try:
      album_soup = parse_url(album_url)
      # Tracks are looked up inside the third table of the album page
      tracks = album_soup.find_all('table')[2].find_all(class_ = 'track-name')
      urls = ['http://www.zeneszoveg.hu/' + track.find('a')['href'] for track in tracks]
    except Exception:
      # Exception rather than a bare except, so Ctrl-C still interrupts
      print(f"Cannot parse album: {album_url}")
      continue

    track_urls[album_url] = urls
    # Pause between album pages to avoid hammering the server
    time.sleep(2)

  return track_urls


def get_lyrics(track_url):
  '''
  Returns lyrics str from track url

  track_url: url of a single track's lyrics page

  Returns the text of the element with class 'lyrics-plain-text', or
  None (after printing a message) when the page cannot be fetched or
  does not contain that element
  '''
  try:
    track_soup = parse_url(track_url)
    lyrics = track_soup.find(class_ = 'lyrics-plain-text').text
  except Exception:
    # Exception rather than a bare except, so Ctrl-C still interrupts
    # the crawl instead of being reported as a parse failure
    print(f'Cannot parse track: {track_url}')
    return None

  # Pause between track pages to avoid hammering the server
  time.sleep(2)
  return lyrics


def collect_lyrics(track_urls_dict):
  '''
  Pipeline to collect lyrics by album-tracks dict
  Returns nested dict

  track_urls_dict: dict of album url -> list of track urls

  Returns dict of album url -> {track url: lyrics}; lyrics is whatever
  get_lyrics returns for that track (None when it could not be parsed).
  An album without tracks maps to an empty dict
  '''
  corpus = {}

  for album_url, track_urls in tqdm(track_urls_dict.items()):
    # Start from a fresh dict for every album: otherwise an album with
    # no tracks would reuse the previous album's lyrics (or hit an
    # undefined name if it came first)
    lyrics_dict = {}
    if len(track_urls) > 0:
      for track_url in tqdm(track_urls):
        lyrics_dict[track_url] = get_lyrics(track_url)
    corpus[album_url] = lyrics_dict

  return corpus

In [99]:
# Collect the album page urls from the band's page on zeneszoveg.hu
album_urls = get_album_urls('http://www.zeneszoveg.hu/egyuttes/4/ganxsta-zolee-es-a-kartel-dalszovegei.html')

In [100]:
# Visit every album page and map album url -> list of track urls
track_urls = get_track_urls(album_urls)

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [101]:
# Download the lyrics of every track; c maps album url -> {track url: lyrics}
c = collect_lyrics(track_urls)

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))





In [102]:
import pandas as pd

# c maps album url -> {track url: lyrics}. Build one frame per album,
# indexed by track url, and keep the album urls as the outer keys.
album_titles = list(c)
helper = [
    pd.DataFrame.from_dict(track_lyrics, orient='index')
    for track_lyrics in tqdm(c.values())
]

# Stack the per-album frames under a two-level (album, track) index
res = pd.concat(helper, keys=album_titles)

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [103]:
# re is needed for the slug extraction below and is not imported by any
# earlier cell
import re

# Turn the (album url, track url) index into ordinary columns
res = res.reset_index()
res.columns = ['album', 'track', 'lyrics']
# Reduce each url to its slug: the part between the last '/' and the
# '-album.html' / '-zeneszoveg.html' suffix. Raw strings keep the '\-'
# escape from being treated as an invalid string escape.
res['album'] = res['album'].apply(lambda x: re.search(r'/([a-z0-9\-]*)-album.html', x).groups()[0])
res['track'] = res['track'].apply(lambda x: re.search(r'/([a-z0-9\-]*)-zeneszoveg.html', x).groups()[0])
# Collapse every run of whitespace into a single space. Tracks whose
# lyrics could not be fetched hold None and are left untouched.
res['lyrics'] = res['lyrics'].apply(lambda x: ' '.join(x.split()) if isinstance(x, str) else x)

In [107]:
#res.to_csv('./data/gxta-zl.csv', index = False)