# Web Scraping Pipeline for Ganxsta Zolee és a Kartel Lyrics

In [2]:
import requests
from bs4 import BeautifulSoup as bs
import time

def parse_url(url, timeout=30):
  '''
  Download a page and return it parsed as a BeautifulSoup object.

  url: address of the page to fetch.
  timeout: seconds to wait for the server before giving up. Without it
    requests waits indefinitely, so one stalled connection would hang
    the whole scrape.

  Raises whatever requests raises on connection problems or timeouts.
  '''
  page = requests.get(url, timeout=timeout).text
  # NOTE(review): no parser is named, so BeautifulSoup picks the best one
  # installed; left as is because changing it may alter the parsed tree.
  return bs(page)


def get_album_urls(band_url):
  '''
  Returns urls of band's albums

  band_url: url of the band's overview page.

  Returns a list of absolute album urls, or None (after printing a
  message) when the band page cannot be fetched or parsed.
  '''
  try:
    band_soup = parse_url(band_url)
    albums = band_soup.find_all(class_ = 'thumbnail-large-text')
  except Exception:
    # Exception rather than a bare except, so that Ctrl-C / SystemExit
    # still stop the scrape instead of being reported as a parse failure.
    print(f'Cannot parse: {band_url}')
    return None

  # Each album tile holds its link in the first <a> of its first <p>.
  return ['http://www.zeneszoveg.hu/' + album.find('p').find('a')['href'] for album in albums]


def get_track_urls(album_urls):
  '''
  Returns dict of band albums as keys and urls of tracks as values

  album_urls: iterable of album page urls.

  Returns {album_url: [track_url, ...]}, or None (after printing a
  message) as soon as one album page cannot be fetched or parsed.
  '''
  track_urls = {}

  for album_url in album_urls:

    try:
      album_soup = parse_url(album_url)
      # The track listing is read from the third <table> on the album page.
      tracks = album_soup.find_all('table')[2].find_all(class_ = 'track-name')
    except Exception:
      # Exception rather than a bare except, so that Ctrl-C during this
      # long-running loop is not swallowed and reported as a parse failure.
      print(f"Cannot parse album: {album_url}")
      return None

    track_urls[album_url] = ['http://www.zeneszoveg.hu/' + track.find('a')['href'] for track in tracks]
    # Pause between album requests to go easy on the server.
    time.sleep(2)

  return track_urls


def get_lyrics(track_url):
  '''
  Returns lyrics str from track url

  track_url: url of a single track page.

  Returns the text of the element with class 'lyrics-plain-text', or
  None (after printing a message) when the page cannot be fetched or
  the element is missing.
  '''
  try:
    track_soup = parse_url(track_url)
    lyrics = track_soup.find(class_ = 'lyrics-plain-text').text
  except Exception:
    # Exception rather than a bare except, so that Ctrl-C is not swallowed.
    # The message no longer starts with a stray double quote.
    print(f'Cannot parse track: {track_url}')
    return None

  # Pause after each successful download to go easy on the server.
  time.sleep(2)
  return lyrics


def collect_lyrics(track_urls_dict):
  '''
  Pipeline to collect lyrics by album-tracks dict

  track_urls_dict: {album_url: [track_url, ...]}.

  Returns nested dict {album_url: {track_url: lyrics}}; an album with no
  tracks maps to an empty dict. lyrics is whatever get_lyrics returns
  for that track.
  '''
  corpus = {}

  for album_url, track_urls in track_urls_dict.items():
    # Build a fresh dict per album. Sharing one dict across albums made
    # every album point at the same object holding the tracks of all albums.
    corpus[album_url] = {track_url: get_lyrics(track_url) for track_url in track_urls}

  return corpus

In [3]:
# Starting point of the scrape: the band's overview page.
band_page = 'http://www.zeneszoveg.hu/egyuttes/4/ganxsta-zolee-es-a-kartel-dalszovegei.html'

# First the album pages, then the track pages listed on each album.
album_urls = get_album_urls(band_page)
track_urls = get_track_urls(album_urls)

In [None]:
# Download the lyrics of every track found above, grouped by album url.
c = collect_lyrics(track_urls)

In [None]:
import pandas as pd

# Turn the nested {album: {track: lyrics}} dict into one DataFrame whose
# outer index level is the album key and inner level is the track key.
album_titles = []
helper = []

for album_title, tracks in c.items():
    # Append the album key itself; appending the list to itself produced
    # self-referential entries instead of usable concat keys.
    album_titles.append(album_title)
    helper.append(pd.DataFrame.from_dict(tracks, orient='index'))

res = pd.concat(helper, keys=album_titles)