# Webscraping 40k Hindi songs

We'll be scraping http://giitaayan.com/

### Phase 1

In Phase 1, we will only scrape the category pages to get the song page URLs for all the songs on the website.

In [1]:
from selenium import webdriver
import re
import pandas as pd
import csv
import time

In [2]:
Chrome = webdriver.Chrome
chromedriver = './chromedriver'
browser = Chrome(chromedriver)

In [3]:
# Table headers for the csv file
table_headers = ['Song', 'Film', 'Year', 'Music Director', 'Lyricist', 'Singers']

In [4]:
# Opening the file in write mode and hence creating a new file with just the headers
with open(r'hindi_lyrics_phase1.csv', 'w') as file:
    writer = csv.writer(file)
    writer.writerow(table_headers)

In [5]:
search_url = 'http://giitaayan.com/search.asp'
category_page_url = 'http://giitaayan.com/search.asp?fi=y&browse=Song&s='

# The website has following 28 categories
listofcategories = [
    '%23', '0-9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', \
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
]

In [6]:
scraped_data = []

In [7]:
%%time 
# Iterating over one category at a time
for category_item in listofcategories:
    
    url = f'http://giitaayan.com/search.asp?fi=y&browse=Song&s={category_item}'
    
    # Opening the category's first page
    browser.get(url)
    
    try:
        # Find the total number of pages for that category
        total_items = int(browser.find_element_by_xpath('//table[1]/tbody/tr/td/b[2]').text)
    except Exception as e:
        total_items = -1

    print(total_items)
    
    # Each page for the category has 50 rows except the last page
    for page_number in range(1, int(total_items // 50 + 1) + 1):
        
        # To reduce the load on the server, we induce a 2 second delay for every page request
        time.sleep(2)
        
        url = f'{search_url}?browse=Song&s={category_item}&PageNo={page_number}'
        browser.get(url)
        
        # Initializing the page data list
        page_data = []

        # Each page has 51 rows of which 1st row is a header row
        # We need to iterate from 2nd to 51st rows to get the information about each song
        for row_item_index in range(2, 52):
            try:
                # Extracting various information about the song from the loaded page
                lyrics_url = browser.find_element_by_xpath(f'//table[2]/tbody/tr[{row_item_index}]/td[1]/span/a').get_attribute('href')
                movie_name = browser.find_element_by_xpath(f'//table[2]/tbody/tr[{row_item_index}]/td[2]/a').text
                year = browser.find_element_by_xpath(f'//table[2]/tbody/tr[{row_item_index}]/td[2]').text
                year = int(re.findall('\d+', year)[0])
                music_director = browser.find_element_by_xpath(f'//table[2]/tbody/tr[{row_item_index}]/td[3]/a').text
                lyricist = browser.find_element_by_xpath(f'//table[2]/tbody/tr[{row_item_index}]/td[4]/a').text
                singers = browser.find_element_by_xpath(f'//table[2]/tbody/tr[{row_item_index}]/td[5]/a').text

                row_item = [lyrics_url, movie_name, year, music_director, lyricist, singers]
                
                # Adding the data for one song to the list of 
                page_data.append(row_item)
            except Exception as e:
                # For the last page of each category, this exception will be encountered atleast once
                # since it contains less than 50 rows
                pass

        # Printing the progress of the scraping
        print(f'Writing {len(page_data)} lines for Category {category_item}, Page Number: {page_number}')
        
        # Writing the data for each page to the csv file
        # Notice that this time the file was opened in append mode
        with open(r'hindi_lyrics_phase1.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerows(page_data)

102
Writing 49 lines for Category %23, Page Number: 1
Writing 50 lines for Category %23, Page Number: 2
Writing 2 lines for Category %23, Page Number: 3
3
Writing 3 lines for Category 0-9, Page Number: 1
1117
Writing 46 lines for Category A, Page Number: 1
Writing 49 lines for Category A, Page Number: 2
Writing 44 lines for Category A, Page Number: 3
Writing 43 lines for Category A, Page Number: 4
Writing 50 lines for Category A, Page Number: 5
Writing 44 lines for Category A, Page Number: 6
Writing 47 lines for Category A, Page Number: 7
Writing 46 lines for Category A, Page Number: 8
Writing 48 lines for Category A, Page Number: 9
Writing 47 lines for Category A, Page Number: 10
Writing 47 lines for Category A, Page Number: 11
Writing 47 lines for Category A, Page Number: 12
Writing 44 lines for Category A, Page Number: 13
Writing 46 lines for Category A, Page Number: 14
Writing 47 lines for Category A, Page Number: 15
Writing 46 lines for Category A, Page Number: 16
Writing 48 lines