# Web scraping Bollywood songs from 1947 to 2018

We'll be scraping https://www.lyricsia.com/

## Phase 1: Collect the song page URLs listed for each year

In [1]:
import requests
import bs4 as bs
import requests
import csv
import time

In [2]:
# Opening the csv file in write mode and adding a header.
# newline='' is required by the csv module: without it the '\r\n' row
# terminator is translated again on Windows and every row is followed by a
# blank line. The encoding is pinned so the file does not depend on the
# platform default.
with open(r'lyricsia_phase1.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['SNo', 'Songpage_URL', 'Movie'])

In [3]:
# Site root; prepended to the relative song links found on the listing pages
root_url = 'https://www.lyricsia.com'
# Prefix of the per-year listing pages; the year is appended to it
year_page_url = root_url + '/hindi-songs/'

In [4]:
%%time
rowcount = 0

# The category pages are ordered and range from 1947 to 2018
for year in range(1947, 2019):
    
    url = year_page_url + str(year)
    
    try:
        # Fetching the page
        source = requests.get(url)
        
        # Parsing the html
        webpage = bs.BeautifulSoup(source.content, features='html.parser') 
    
        listofsongs = []
        for index, row in enumerate(webpage.findAll('tr')):
            # The first row contains the header
            if index != 0:
                songurl = root_url + row.select('td')[1].a['href'].split(';')[0]
                moviename = row.select('td')[2].text
                rowcount += 1
                
                listofsongs.append([rowcount, songurl, moviename])
                
        # Opening the file in append mode and adding the data for the entire page i.e. each year
        with open(r'lyricsia_phase1.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerows(listofsongs)
            
        # Printing the progress
        print(f'Data Scraped for year: {year}')
    except Exception as e:
        print(e)

Data Scraped for year: 1947
Data Scraped for year: 1948
Data Scraped for year: 1949
Data Scraped for year: 1950
Data Scraped for year: 1951
Data Scraped for year: 1952
Data Scraped for year: 1953
Data Scraped for year: 1954
Data Scraped for year: 1955
Data Scraped for year: 1956
Data Scraped for year: 1957
Data Scraped for year: 1958
Data Scraped for year: 1959
Data Scraped for year: 1960
Data Scraped for year: 1961
Data Scraped for year: 1962
Data Scraped for year: 1963
Data Scraped for year: 1964
Data Scraped for year: 1965
Data Scraped for year: 1966
Data Scraped for year: 1967
Data Scraped for year: 1968
Data Scraped for year: 1969
Data Scraped for year: 1970
Data Scraped for year: 1971
Data Scraped for year: 1972
Data Scraped for year: 1973
Data Scraped for year: 1974
Data Scraped for year: 1975
Data Scraped for year: 1976
Data Scraped for year: 1977
Data Scraped for year: 1978
Data Scraped for year: 1979
Data Scraped for year: 1980
Data Scraped for year: 1981
Data Scraped for yea

## Phase 2: Visit each song page and extract the lyrics and metadata

In [2]:
import requests
import bs4 as bs
import requests
import csv
import time
import pandas as pd

In [3]:
# Load the song page URLs and movie names collected in phase 1
df = pd.read_csv(filepath_or_buffer='lyricsia_phase1.csv')

In [4]:
# Preview the first five rows of the phase 1 data
df.head(n=5)

Unnamed: 0,SNo,Songpage_URL,Movie
0,1,https://www.lyricsia.com/lyricid/5902/yahaan-b...,Jugnu
1,2,https://www.lyricsia.com/lyricid/5098/chand-si...,Elan
2,3,https://www.lyricsia.com/lyricid/5050/wo-apani...,Jugnu
3,4,https://www.lyricsia.com/lyricid/5048/yahan-ba...,Jugnu
4,5,https://www.lyricsia.com/lyricid/5082/khayegi-...,Mirza Sahiban


In [8]:
# Column order of the phase 2 csv; each scraped song row is written in this order
tableheaders = [
    'Song',
    'Lyrics',
    'Movie',
    'Year',
    'Singer',
    'Writer',
    'Composer',
]

In [9]:
# Creating a new file by opening it in write mode and writing the header row.
# newline='' is required by the csv module (otherwise Windows gets a blank
# line after every row). utf-8 is pinned because the rows appended later
# contain lyrics with non-ASCII characters.
with open(r'lyricsia_phase2.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(tableheaders)

In [5]:
%%time

# Each row item contains the link to a song page
for i in range(len(df)):
    # Extracting the song page url from the row item
    songurl = df['Songpage_URL'][i]
    try:      
        
        # Getting the song page and parsing the html
        source = requests.get(songurl)
        webpage = bs.BeautifulSoup(source.content, features='html.parser') 

        # Finding various information about the song from the html 
        song = webpage.find('h1').text.split('Lyrics')[0]
        
        lyrics = webpage.find('pre').text
        
        metacontainer = webpage.find('div', attrs={'class':'col-sm-4 col-md-4'})
        
        movie = metacontainer.select('p')[1].text.split(':')[1].lstrip()
        singer = metacontainer.select('p')[2].text.split(':')[1].lstrip()
        writer = metacontainer.select('p')[3].text.split(':')[1].lstrip()
        composer = metacontainer.select('p')[4].text.split(':')[1].lstrip()
        year = metacontainer.select('p')[7].text.split(':')[1].lstrip()
        
        songdata = [song, lyrics, movie, year, singer, writer, composer]
        
        # Writing each row of song with its lyrics to csv file
        # Note that the file was opened in append mode
        with open(r'lyricsia_phase2.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerow(songdata)
            
        # Printing progress
        print(f'Added song {i+1}/{len(df)}: {song}')
        
    except Exception as e:
        print(e)

Added song 7045/7363: Piya Reprise 
Added song 7046/7363: Talli Hogiya 
Added song 7047/7363: Tu Chale Toh 
Added song 7048/7363: Yeh mera man 
Added song 7049/7363: Aaj Mile Hain 
Added song 7050/7363: Badass Babuaa 
Added song 7051/7363: Body Language 
Added song 7052/7363: Chete Karda 2 
Added song 7053/7363: I’ll Miss You 
Added song 7054/7363: Kudi Baeymaan 
Added song 7055/7363: Majaa Ni Life 
Added song 7056/7363: Nagada Nagada 
Added song 7057/7363: Party Nonstop 
Added song 7058/7363: Saajan Saajan 
Added song 7059/7363: Sardi Ki Raat 
Added song 7060/7363: Tere Jaane Se 
Added song 7061/7363: Trippy Trippy 
Added song 7062/7363: Tum Mere Baad 
Added song 7063/7363: Abhagi Piya Ki 
Added song 7064/7363: Ae Kash Ke Hum 
Added song 7065/7363: Dekhte Hi Fida 
Added song 7066/7363: Ik Yaad Purani 
Added song 7067/7363: Jahan Bhi Yaad 
Added song 7068/7363: Jatt Te Jawani 
Added song 7069/7363: Main Faraar Sa 
Added song 7070/7363: Naag The Third 
Added song 7071/7363: Pop The Bott