In [131]:
import os,sys
from bs4 import BeautifulSoup as bs
import requests
import threading
from time import sleep
import json

## Utilities

In [34]:
def cleanText(text):
    '''
        Minimal normalisation: only leading and trailing whitespace is
        removed, so that no potentially important content gets lost.
    '''
    return text.strip()

## Logging

In [128]:
# Root directory under which one sub-directory per movie is created.
DATA_PATH = "./data"
# Create the output and log directories up front so that a fresh checkout
# (or a restarted kernel in an empty working directory) does not fail on the
# open() below or on the per-movie directory creation later on.
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs('./logs', exist_ok=True)
# URLs of pages that could not be fetched are written to this file.
# It is opened in 'w' mode, so re-running this cell truncates the previous log.
skipped_pages_log = open('./logs/skipped_files.txt','w')

## Setting up variables for multi-threading

In [121]:
# Thread-count settings for the scraper.
# NOTE(review): nothing in the visible part of this notebook reads these two
# constants; presumably they are reserved for a threaded version - confirm.
NUM_MOVIE_THREADS = 4
NUM_WRITER_THREADS = 4

In [125]:
# threading.Semaphore() starts with an internal counter of 1, so at most one
# thread can hold it at a time.
# NOTE(review): it is not acquired anywhere in the visible cells; presumably
# meant to guard shared output once threading is added - confirm.
semaphore = threading.Semaphore()

## Get data from the movie list

In [106]:
# Fetch and parse a single song page (performs a live HTTP request).
# NOTE(review): presumably used to try out the parser functions interactively;
# the name `soup` is rebound later by the data extraction loop.
url = 'https://www.lyricsmasti.com/song/7461/lyrics-of-Dil-Kashi.html'
soup = bs(requests.get(url).text,"lxml")

In [145]:
def getMovieList(soup):
    '''
        Extract the movie entries from one parsed movie-index page.

        Parameters:
            soup: parsed page (BeautifulSoup object).

        Returns:
            A list with one dict per movie link found in the first
            <ul class="list-group list-group-flush">, with the keys
              "url"        - the link's href attribute,
              "views"      - the title attribute of the matching
                             <li class="list-group-item">,
              "movie_name" - the full, whitespace-trimmed link text,
              "movie_year" - the trimmed text after the LAST "-" of the link
                             text, or -1 when the text contains no "-".
            An empty list is returned when the page has no such <ul>, so a
            caller can use "no movies" as its stop condition.

        Raises:
            AssertionError: if the number of links and view items differ.
    '''
    list_groups = soup.find_all('ul',class_='list-group list-group-flush')
    if not list_groups:
        # No movie list on this page at all (e.g. a page number past the end).
        return []

    movie_list = list_groups[0].find_all('a')
    view_list = soup.find_all('li',class_='list-group-item')
    assert len(movie_list) == len(view_list), "#movies != #views"

    movie_list_data = []
    for link, view in zip(movie_list, view_list):
        link_text = link.get_text()
        # Split on the last "-" only: titles may contain hyphens themselves
        # ("A-B - 2010"), in which case the second "-"-separated piece is part
        # of the title rather than the year.
        _, separator, year_text = link_text.rpartition("-")
        movie_year = cleanText(year_text) if separator else -1
        movie_list_data.append({
            "url": link['href'],
            "views": view['title'],
            "movie_name": cleanText(link_text),
            "movie_year": movie_year,
        })
    return movie_list_data

In [67]:
def getSongList(soup):
    '''
        List the songs linked from a parsed movie page.

        Every <a> inside the first <ol class="custom-counter"> becomes a dict
        with the link target under 'song_url' and the whitespace-trimmed link
        text under 'song_name'. The dicts are returned in document order.
    '''
    song_links = soup.find_all('ol',class_='custom-counter')[0].find_all('a')
    return [
        {'song_url': link['href'], 'song_name': cleanText(link.get_text())}
        for link in song_links
    ]

In [118]:
def getSongData(soup):
    '''
        Collect the lyrics and the listed properties of a parsed song page.

        The text of the first <code> element is stored under 'song_lyrics'.
        Then every <li> of the first <ul> inside the first <li class="f-w">
        is inspected: the trimmed text of its first <h4> is the property
        name and the trimmed text of its first <span> (or, when there is no
        <span>, of its first <p>) is the property value. A property with a
        heading but neither <span> nor <p> gets the value -1; an item
        without a heading is ignored.

        Returns the resulting dict.
    '''
    song_data = {'song_lyrics': soup.find_all('code')[0].get_text()}

    info_block = soup.find_all('li',class_='f-w')[0]
    info_rows = info_block.find_all('ul')[0].find_all('li')
    for row in info_rows:
        headings = row.find_all('h4')
        # "unk" marks a row without a heading; such rows are not stored.
        name = cleanText(headings[0].get_text()) if headings else "unk"

        spans = row.find_all('span')
        if spans:
            value = cleanText(spans[0].get_text())
        else:
            paragraphs = row.find_all('p')
            value = cleanText(paragraphs[0].get_text()) if paragraphs else -1

        if name != "unk":
            song_data[name] = value

    return song_data


In [None]:
def getAllSongsFromAMovie(base_url,movie,skipped_pages_log):
    '''
        Scrape all songs of one movie and store each of them as a JSON file.

        Parameters:
            base_url: site root that the relative URLs are appended to.
            movie: dict describing the movie; must contain 'url' and
                'movie_name' (the shape returned by getMovieList).
            skipped_pages_log: writable text file; the URL of every page that
                could not be fetched is written to it, one per line.

        Side effects:
            Creates DATA_PATH/<movie_name>/ and writes one
            <song_name>.json per song into it. Each JSON object holds the
            data from getSongData merged with the song entry from getSongList
            and with all key/value pairs of `movie`.

        Returns:
            None. If the movie page itself cannot be fetched, the function
            logs the URL and returns without writing anything.
    '''
    print("Processing movie: ",movie['movie_name'])
    song_page_url = base_url+movie['url']
    print("song_page_url: {}".format(song_page_url))
    try:
        song_soup = bs(requests.get(song_page_url).text,"lxml")
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts.
        skipped_pages_log.write(song_page_url+"\n")
        print("skipped song page: ",song_page_url)
        # There is no enclosing loop here, so leave the function.
        return

    song_list_data = getSongList(song_soup)
    print(song_list_data)
    song_data_list = []
    for song in song_list_data:
        print("Processing song: ",song['song_name'])
        song_url = base_url+song['song_url']
        print("song_url: {}".format(song_url))
        try:
            song_data_soup = bs(requests.get(song_url).text,"lxml")
        except Exception:
            skipped_pages_log.write(song_url+"\n")
            print("skipped song page: ",song_url)
            continue
        song_data = getSongData(song_data_soup)
        # Carry over 'song_url' and 'song_name'; the name is needed below to
        # build the output file name.
        song_data.update(song)
        song_data_list.append(song_data)

    #Write Data to a movie named folder
    # "/" inside a name would otherwise be read as a path separator.
    dir_name = os.path.join(DATA_PATH,movie['movie_name'].replace("/","_"))
    # makedirs with exist_ok: re-running for the same movie must not fail.
    os.makedirs(dir_name,exist_ok=True)
    for song in song_data_list:
        song.update(movie)
        file_name = os.path.join(dir_name,song['song_name'].replace("/","_"))+".json"
        with open(file_name,'w') as writer:
            json.dump(song,writer)
        
        

## Data Extraction Loop

In [146]:
# Crawl the alphabetical movie index: for every index letter, walk its pages,
# and for every movie fetch all song pages and write one JSON file per song.
base_url = 'https://www.lyricsmasti.com'
# Number of index letters to process, starting at 'A'.
num_chars = 1
for i in range(num_chars):
    movie_page_base_url = base_url+'/songs_for_movie_'+chr(ord('A')+i)+'.html'
    print("Processing: {}".format(movie_page_base_url))

    page_num = 0
    while True:
        # Page 0 is the bare index URL; later pages use the ?page=N query.
        if page_num > 0:
            cur_page_url = movie_page_base_url+"?page="+str(page_num)
        else:
            cur_page_url = movie_page_base_url
        print(cur_page_url)

        try:
            soup = bs(requests.get(cur_page_url).text,"lxml")
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still interrupts.
            skipped_pages_log.write(cur_page_url+"\n")
            print("skipped page: ",cur_page_url)
            # Give up on this letter. A `continue` here would request the very
            # same URL again (page_num is unchanged) and could loop forever.
            break

        #Process data
        movie_data = getMovieList(soup)
        print("No. of song movies in this page:",len(movie_data))
        #Terminating condition
        if len(movie_data) == 0:
            break

        for movie in movie_data:
            print("Processing movie: ",movie['movie_name'])
            song_page_url = base_url+movie['url']
            print("song_page_url: {}".format(song_page_url))
            try:
                song_soup = bs(requests.get(song_page_url).text,"lxml")
            except Exception:
                skipped_pages_log.write(song_page_url+"\n")
                print("skipped song page: ",song_page_url)
                continue

            #Create a  thread here to get data for an individual movie
            song_list_data = getSongList(song_soup)
            print(song_list_data)
            song_data_list = []
            for song in song_list_data:
                print("Processing song: ",song['song_name'])
                song_url = base_url+song['song_url']
                print("song_url: {}".format(song_url))
                try:
                    song_data_soup = bs(requests.get(song_url).text,"lxml")
                except Exception:
                    skipped_pages_log.write(song_url+"\n")
                    print("skipped song page: ",song_url)
                    continue
                song_data = getSongData(song_data_soup)
                # Carry over 'song_url' and 'song_name' from the song list.
                song_data.update(song)
                song_data_list.append(song_data)

            # "/" inside a name would otherwise be read as a path separator.
            dir_name = os.path.join(DATA_PATH,movie['movie_name'].replace("/","_"))
            # Create the movie folder (and DATA_PATH itself) if missing.
            os.makedirs(dir_name,exist_ok=True)

            for song in song_data_list:
                # Attach the movie-level fields to every song record.
                song.update(movie)
                print(song)
                file_name = os.path.join(dir_name,song['song_name'].replace("/","_"))+".json"
                with open(file_name,'w') as writer:
                    json.dump(song,writer)
            # NOTE(review): stops after the first movie of the page - looks
            # like a debugging limit; remove to process every movie.
            break

        page_num += 1
        # NOTE(review): stops after the first page of the letter - looks like
        # a debugging limit; remove to walk all pages.
        break

# Make sure the skipped-URL entries reach the disk even though the log file
# stays open for the rest of the session.
skipped_pages_log.flush()
    

Processing: https://www.lyricsmasti.com/songs_for_movie_A.html
https://www.lyricsmasti.com/songs_for_movie_A.html
No. of song movies in this page: 20
Processing movie:  A Flat - 2010
song_page_url: https://www.lyricsmasti.com/3351/songs-of-movie-A-Flat.html
[{'song_url': '/song/7461/lyrics-of-Dil-Kashi.html', 'song_name': 'Dil Kashi'}, {'song_url': '/song/7884/lyrics-of-Meetha-Sa.html', 'song_name': 'Meetha Sa'}, {'song_url': '/song/7885/lyrics-of-Chal-Halke-Halke.html', 'song_name': 'Chal Halke Halke'}, {'song_url': '/song/7886/lyrics-of-Pyar-Itna-Na-Kar.html', 'song_name': 'Pyar Itna Na Kar'}]
Processing song:  Dil Kashi
song_url: https://www.lyricsmasti.com/song/7461/lyrics-of-Dil-Kashi.html
Processing song:  Meetha Sa
song_url: https://www.lyricsmasti.com/song/7884/lyrics-of-Meetha-Sa.html
Processing song:  Chal Halke Halke
song_url: https://www.lyricsmasti.com/song/7885/lyrics-of-Chal-Halke-Halke.html
Processing song:  Pyar Itna Na Kar
song_url: https://www.lyricsmasti.com/song/78