In [1]:
# -> Google Colab setup: run the commands below in Colab for this script to work
#    (comment them out when running outside Colab).
#    They install the chromium-chromedriver package, copy the chromedriver binary
#    into /usr/bin, and install the selenium and pandas Python packages.
#
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
!pip install pandas
# -> Mount Google Drive at /content/drive for retaining scraped data.
#
from google.colab import drive
drive.mount('/content/drive')


# -> Run the commands below in a terminal for this script to work on an Ubuntu system.
# -> Requires python3 and pip3 to be preinstalled on the system.
#
# sudo apt-get install chromium-chromedriver
# pip3 install selenium
# pip3 install pandas


# -> Function def run() is the starting point. Set ENV variable to start scraping.

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  chromium-browser chromium-browser-l10n chromium-codecs-ffmpeg-extra
Suggested packages:
  webaccounts-chromium-extension unity-chromium-extension adobe-flashplugin
The following NEW packages will be installed:
  chromium-browser chromium-browser-l10n chromium-chromedriver
  chromium-codecs-ffmpeg-extra
0 upgraded, 4 newly installed, 0 to remove and 59 not upgraded.
Need to get 75.5 MB of archives.
After this operation, 256 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 chromium-codecs-ffmpeg-extra amd64 83.0.4103.61-0ubuntu0.18.04.1 [1,119 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 chromium-browser amd64 83.0.410

In [2]:
from collections import OrderedDict
from selenium import webdriver
import pandas as pd
import time
import os

In [3]:
# Module-level state shared by the scraper functions.
# The environment value (ENV) needs to be set to 'colab' or 'ubuntu' before scraping.

# Results page that the driver loads before the form is filled in.
url = 'http://www.wbsec.gov.in/results/panchayat_election_detailed_result?election_year=2013'

# Number of CSV files written so far; save_to_csv() increments it and uses it
# as the numeric prefix of each saved file name.
saved_file_counter = 0

# ENV: environment name; path: output directory (set by setup_path());
# driver: Selenium driver (set by the setup_driver_for_* functions);
# radio_dict_g: cached result of get_radio_button_options().
ENV = path = driver = radio_dict_g = None

In [4]:
def setup_driver_for_colab():
    """Create the global Chrome driver with the flags used for Google Colab.

    The browser is started headless, without sandbox, and with
    /dev/shm usage disabled; the result is stored in the global `driver`.
    """
    global driver
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    driver = webdriver.Chrome('chromedriver', options=chrome_options)
    print('Driver setup complete.....')

def setup_driver_for_ubuntu():
    """Create the global Chrome driver from the chromedriver at /usr/bin/chromedriver."""
    global driver
    # The driver binary location matches the apt install instructions at the top of the notebook.
    driver = webdriver.Chrome('/usr/bin/chromedriver')
    print('Driver setup complete.....')

In [5]:
def wait(seconds: float = 6):
    """Pause so the page and its dynamically filled elements can load.

    Args:
        seconds: How long to sleep. Defaults to 6, the delay previously
            hard-coded here, so existing `wait()` calls behave the same.
    """
    # Announce the pause before sleeping so the log reflects what is happening
    # while the cell appears idle, rather than reporting it after the fact.
    print('Waiting for page and elements to load.....')
    time.sleep(seconds)

In [6]:
def get_radio_button_options() -> dict:
    """Return the radio button options of the form, scraping them at most once.

    The result maps a running index (0, 1, 2, ...) to a one-entry dict of
    {label text: value attribute of the <input> inside that label}.
    A non-empty result is cached in the global `radio_dict_g` and reused
    on later calls.
    """
    global driver
    global radio_dict_g
    if not radio_dict_g:
        labels = driver.find_elements_by_xpath('//label[@class="form_label"]')
        options = OrderedDict()
        for position, label in enumerate(labels):
            options[position] = {label.text: label.find_element_by_tag_name('input').get_attribute('value')}
        radio_dict_g = options
        print('Radio dictionary = {}'.format(options))
        return options
    # Cached copy from an earlier call.
    print('Global radio dictionary = {}'.format(radio_dict_g))
    return radio_dict_g

In [7]:
def select_preferred_radio_button(radio_dict: dict):
    """Click the radio button stored at index 2 of `radio_dict`, then wait.

    Args:
        radio_dict: Mapping as returned by get_radio_button_options();
            entry 2 is a one-entry dict whose value is the radio input's
            `value` attribute.
    """
    global driver
    preferred_option = radio_dict[2]
    value = next(iter(preferred_option.values()))
    selector = 'input[type="radio"][value="{}"]'.format(value)
    radio_input = driver.find_element_by_css_selector(selector)
    radio_input.click()
    print('Selecting radio button with name {}'.format(value))
    wait()

In [8]:
def get_word_from_string(element: str) -> list:
    """Split `element` on newline characters and return the pieces as a list.

    Empty pieces are kept: an empty string yields [''] and a trailing
    newline yields a trailing '' entry, exactly as the previous
    character-by-character implementation did.

    Args:
        element: Text whose lines are separated by '\\n'.

    Returns:
        List of the substrings between newlines, in order.
    """
    # str.split with an explicit separator keeps empty fields, and avoids
    # rebuilding the current word one character at a time.
    return element.split('\n')

In [9]:
def get_district_data() -> dict:
    """Read the district dropdown (<select name="zilla_parishad">).

    Returns:
        OrderedDict mapping each option's `value` attribute to its visible
        text, in page order, leaving out the '--Select--' placeholder.
    """
    global driver
    select_box = driver.find_element_by_xpath('//select[@name="zilla_parishad"]')
    districts = OrderedDict()
    for option in select_box.find_elements_by_tag_name('option'):
        option_value = option.get_attribute('value')
        label = option.text
        if label == '--Select--':
            continue
        districts[option_value] = label
    print('District dictionary = {}'.format(districts))
    return districts

In [10]:
def select_district(key: str):
    """Click the district option whose `value` attribute equals `key`, then wait."""
    global driver
    option_xpath = '//select[@name="zilla_parishad"]/option[@value="{}"]'.format(key)
    driver.find_element_by_xpath(option_xpath).click()
    print('Selecting district with key {}'.format(key))
    wait()

In [11]:
def get_block_data() -> dict:
    """Read the block dropdown (<select name="panchayat_samity">).

    Returns:
        OrderedDict mapping each option's `value` attribute to its visible
        text, in page order, leaving out the '--Select--' placeholder.
    """
    global driver
    select_box = driver.find_element_by_xpath('//select[@name="panchayat_samity"]')
    blocks = OrderedDict()
    for option in select_box.find_elements_by_tag_name('option'):
        option_value = option.get_attribute('value')
        label = option.text
        if label == '--Select--':
            continue
        blocks[option_value] = label
    print('Block dictionary = {}'.format(blocks))
    return blocks

In [12]:
def select_block(key: str):
    """Click the block option whose `value` attribute equals `key`, then wait."""
    global driver
    option_xpath = '//select[@name="panchayat_samity"]/option[@value="{}"]'.format(key)
    driver.find_element_by_xpath(option_xpath).click()
    print('Selecting block with key {}'.format(key))
    wait()

In [13]:
def get_gp_data() -> dict:
    """Read the gram panchayat (GP) dropdown (<select name="gram_panchayat">).

    Returns:
        OrderedDict mapping each option's `value` attribute to its visible
        text, in page order, leaving out the '--Select--' placeholder.
    """
    global driver
    select_box = driver.find_element_by_xpath('//select[@name="gram_panchayat"]')
    gram_panchayats = OrderedDict()
    for option in select_box.find_elements_by_tag_name('option'):
        option_value = option.get_attribute('value')
        label = option.text
        if label == '--Select--':
            continue
        gram_panchayats[option_value] = label
    print('GP dictionary = {}'.format(gram_panchayats))
    return gram_panchayats

In [14]:
def select_gp(key: str):
    """Click the GP option whose `value` attribute equals `key`, then wait."""
    global driver
    option_xpath = '//select[@name="gram_panchayat"]/option[@value="{}"]'.format(key)
    driver.find_element_by_xpath(option_xpath).click()
    print('Selecting GP with key {}'.format(key))
    wait()

In [15]:
def get_polling_data() -> dict:
    """Read the polling date dropdown (<select name="election_date">).

    Returns:
        OrderedDict mapping each option's `value` attribute to its visible
        text, in page order, leaving out the '--Select--' placeholder.
    """
    global driver
    select_box = driver.find_element_by_xpath('//select[@name="election_date"]')
    polling_dates = OrderedDict()
    for option in select_box.find_elements_by_tag_name('option'):
        option_value = option.get_attribute('value')
        label = option.text
        if label == '--Select--':
            continue
        polling_dates[option_value] = label
    print('Polling date dictionary = {}'.format(polling_dates))
    return polling_dates

In [16]:
def select_polling_date(dt: str):
    """Click the polling date option whose `value` attribute equals `dt`, then wait."""
    global driver
    option_xpath = '//select[@name="election_date"]/option[@value="{}"]'.format(dt)
    driver.find_element_by_xpath(option_xpath).click()
    print('Selecting date {}'.format(dt))
    wait()

In [17]:
def submit_form():
    """Click the form's submit input (<input name="submit">), then wait."""
    global driver
    submit_button = driver.find_element_by_xpath('//input[@name="submit"]')
    submit_button.click()
    print('Submitting form data.....')
    wait()

In [18]:
def get_table_data() -> dict:
    """Collect every result table (id "doc_table10") on the current page.

    Returns:
        Dict mapping a running table index (0, 1, ...) to a list of rows.
        The first row is the header: the <th> texts of the first <thead>
        row, joined with newlines and split again by get_word_from_string().
        Each following row is the list of <td> texts of one <tbody> row,
        with '-' substituted for empty cells.
    """
    global driver
    table_data_dict = {}
    result_tables = driver.find_elements_by_xpath('//table[@id="doc_table10"]')
    for index, table in enumerate(result_tables):
        header_row = table.find_element_by_tag_name('thead').find_element_by_tag_name('tr')
        header_text = '\n'.join(cell.text for cell in header_row.find_elements_by_tag_name('th'))
        body_rows = table.find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')
        rows = [get_word_from_string(element=header_text)]
        for body_row in body_rows:
            # Empty cells are stored as '-' so every row keeps its full width.
            rows.append([
                cell.text if cell.text else '-'
                for cell in body_row.find_elements_by_tag_name('td')
            ])
        table_data_dict[index] = rows
    print('Fetched data from table = {}'.format(table_data_dict))
    return table_data_dict

In [19]:
def save_page_to_dataframe(from_dict: dict):
    """Combine the tables scraped from one page into a single DataFrame.

    Args:
        from_dict: Mapping as returned by get_table_data(): each value is a
            list of rows whose first row holds the column names. Every
            table must contain a 'Seat Name' column.

    Returns:
        One DataFrame with the rows of all tables stacked in order and a
        fresh 0..n-1 index. An empty DataFrame is returned when
        `from_dict` has no tables.

    Raises:
        KeyError: If a table has no 'Seat Name' column.
    """
    frames = []
    for rows in from_dict.values():
        header, body = rows[0], rows[1:]
        df = pd.DataFrame(body, columns=header)
        # Seat names can span several lines on the page; flatten them.
        # regex=False makes the newline a literal match on every pandas version.
        df['Seat Name'] = df['Seat Name'].str.replace('\n', ' ', regex=False)
        print('Formatting Seat Name column for garbage data.....')
        frames.append(df)
    if frames:
        # Concatenate once instead of growing a frame inside the loop; this
        # also avoids concatenating with an empty seed DataFrame.
        print('Concatenating dataframes.....')
        main_df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list, so return the empty frame directly.
        main_df = pd.DataFrame()
    print('Returning main dataframe.....')
    return main_df

In [20]:
def save_to_csv(dataframe, filename: str) -> bool:
    """Write `dataframe` as a numbered CSV file inside the global `path`.

    The global `saved_file_counter` is incremented on every call and used as
    the numeric prefix of the file name ('<counter>. <filename>'). The file
    is written even when the dataframe is empty.

    Args:
        dataframe: pandas DataFrame to save.
        filename: Base file name; path separators in it are replaced by '-'.

    Returns:
        False when `dataframe` is empty (callers treat this as a failed
        page), True otherwise.
    """
    global saved_file_counter
    global path
    saved_file_counter += 1
    if not os.path.isdir(path):
        # makedirs also creates missing parent directories and tolerates the
        # directory appearing between the check and the call.
        os.makedirs(path, exist_ok=True)
        print('Creating a new directory at path {}.....'.format(path))
    # The name is built from scraped place names; a path separator inside it
    # would otherwise be interpreted as a (nonexistent) sub-directory.
    safe_filename = filename.replace('/', '-').replace(os.sep, '-')
    complete_filename = '{}. {}'.format(saved_file_counter, safe_filename)
    dataframe.to_csv(os.path.join(path, complete_filename), index=False, encoding='utf-8')
    print('Saving file with name {} to path {}'.format(complete_filename, path))
    if dataframe.empty:
        print('Found empty dataframe.....')
        print('Restarting from point of failure.....')
        return False
    return True

In [21]:
def run():
    """Scrape every district / block / GP / polling date from the beginning.

    This is the starting point of the notebook; ENV must be set first.
    A full run is the same traversal as restart_from_fail_point() with
    checkpoint skipping disabled, so this delegates to it instead of
    keeping a second copy of the nested loops. With check=False the `data`
    tuple is never unpacked, and a failed (empty) page still triggers the
    usual restart from the current checkpoint.
    """
    return restart_from_fail_point(data=tuple(), check=False)

In [22]:
def restart_from_fail_point(data: tuple, check: bool):
    """Scrape all pages, optionally resuming from a checkpoint.

    A new driver is set up, the results page is loaded, and the nested
    district -> block -> GP -> polling date dropdowns are walked; each
    submitted page is converted to a DataFrame and saved as a CSV.

    Args:
        data: Checkpoint as (district key, block key, GP key). Only unpacked
            when `check` is True.
        check: When True, entries before the checkpoint are dropped from the
            dropdown dictionaries via remove_from(). For the GP level
            remove_from() is called with gp=True, which also drops the
            checkpoint GP itself, so scraping resumes with the GP after it.

    Returns:
        None. When save_to_csv() reports an empty page, the function calls
        itself again with the keys of that page as the new checkpoint and
        returns that call's result.
    """
    setup_driver_for_env()
    if check:
        # a, b, c = district key, block key, GP key of the checkpoint.
        (a, b, c) = data
    global driver
    driver.get(url)
    wait()
    # print(driver.page_source)
    setup_path()
    select_preferred_radio_button(radio_dict=get_radio_button_options())
    district_dict = get_district_data()
    if check:
        # Keep the checkpoint district and everything after it.
        district_dict = remove_from(dict_value=district_dict, till=a, gp=False)
    for key1 in district_dict:
        name1 = district_dict[key1]
        select_district(key=key1)
        block_dict = get_block_data()
        if check:
            # Keep the checkpoint block and everything after it.
            block_dict = remove_from(dict_value=block_dict, till=b, gp=False)
        for key2 in block_dict:
            name2 = block_dict[key2]
            select_block(key=key2)
            gp_dict = get_gp_data()
            if check:
                # Drop every GP up to and including the checkpoint GP.
                gp_dict = remove_from(dict_value=gp_dict, till=c, gp=True)
                if not gp_dict:
                    # Nothing left in this block: stop skipping and move on
                    # to the next block.
                    check = False
                    continue
            for key3 in gp_dict:
                name3 = gp_dict[key3]
                select_gp(key=key3)
                polling_date_dict = get_polling_data()
                for key4 in polling_date_dict:
                    dt = polling_date_dict[key4]
                    select_polling_date(dt=key4)
                    submit_form()
                    table_data_dict = get_table_data()
                    df = save_page_to_dataframe(from_dict=table_data_dict)
                    # Keys of the page being saved; printed so they can be
                    # used as the checkpoint for a later manual resume.
                    datas = (key1, key2, key3)
                    print('Checkpoint values {}'.format(datas))
                    status = save_to_csv(dataframe=df, 
                                         filename='{}_{}_{}_{}.csv'.format(name1, name2, name3, dt))
                    if status == False:
                        # Empty page: start over with a fresh driver, using
                        # the current keys as the checkpoint.
                        return restart_from_fail_point(data=datas, check=True)
                    else: 
                        # First successful save ends the checkpoint skipping.
                        check = False
    print('Ending scraping.....')

In [23]:
def remove_from(dict_value: dict, till: str, gp: bool = False) -> dict:
    """Drop the entries of `dict_value` that come before the key `till`.

    The dictionary is modified in place and also returned.

    Args:
        dict_value: Ordered mapping of dropdown keys to names.
        till: Checkpoint key. Entries before it are removed.
        gp: When true, the `till` entry itself is removed as well.

    Returns:
        `dict_value` after trimming. When `till` is not a key of
        `dict_value` there is nothing to skip, so the mapping is returned
        unchanged (previously every entry was removed and None was
        returned, which made callers fail when iterating the result).
    """
    if till not in dict_value:
        print('CHECKPOINT KEY {} NOT FOUND, KEEPING ALL DATA {}'.format(till, dict_value))
        return dict_value
    # Iterate over a snapshot of the keys because entries are popped in the loop.
    for key in list(dict_value):
        if key == till:
            if gp:
                dict_value.pop(key, '')
            break
        dict_value.pop(key, '')
    print('LEFT OVER DATA AFTER REMOVING {}'.format(dict_value))
    return dict_value

In [24]:
def setup_driver_for_env():
    """Create the global Selenium driver for the environment named by ENV.

    Any driver left over from a previous run or restart is quit first, so
    repeated restarts do not accumulate browser processes.

    Raises:
        ValueError: If ENV is neither 'colab' nor 'ubuntu'. Previously the
            function only printed a message and returned, and the caller
            then failed on the missing driver.
    """
    global ENV
    global driver
    if ENV not in ('colab', 'ubuntu'):
        print('Please set the ENV variable to a valid environment. Allowed values are colab and ubuntu')
        print('Please provide a valid ENV value')
        raise ValueError('Invalid ENV value {!r}. Allowed values are colab and ubuntu'.format(ENV))
    if driver is not None:
        # restart_from_fail_point() calls this on every restart; release the
        # old browser before replacing the global reference to it.
        try:
            driver.quit()
        except Exception as error:
            # A dead session must not block the creation of a new one.
            print('Could not quit previous driver: {}'.format(error))
        driver = None
    if ENV == 'colab':
        # Driver setup for Google Colab.
        setup_driver_for_colab()
    else:
        # Driver setup for Ubuntu system.
        setup_driver_for_ubuntu()
    print('Setting environment for {}'.format(ENV))

In [25]:
def setup_path():
    """Set the global output `path` for the current ENV.

    The directory is named after the label of the radio option at index 2
    (the option selected by select_preferred_radio_button()). For 'colab'
    it lives under 'drive/My Drive' inside the current working directory;
    for 'ubuntu' directly inside the current working directory. For any
    other ENV value `path` is left unchanged.
    """
    global path
    global ENV
    if ENV in ('colab', 'ubuntu'):
        dir_name = next(iter(get_radio_button_options()[2].keys()))
        if ENV == 'colab':
            # Path variable for Google Colab.
            path = os.path.join(os.getcwd(), 'drive/My Drive', dir_name)
        else:
            # Path variable for Ubuntu System.
            path = os.path.join(os.getcwd(), dir_name)
    print('Selecting path {} for environment {}'.format(path, ENV))

In [None]:
# First-time execution: run this cell when starting the scrape from scratch.
# To resume existing progress, use the code in the cell below instead.
# ENV must be set to a valid option. Possible values are: 'colab', 'ubuntu'.
#
# (No `global` statement is needed here: a cell's top-level assignment
# already binds the module-level name.)
ENV = 'colab'

run()

In [26]:
# Execute this block only in the case of a runtime deallocation or to start scraping from a checkpoint value.
# To execute this block, uncomment the code below. The data tuple holds the (district key, block key, GP key) as strings, in the format ('0', '0', '0').
# To resume progress, take the last 'Checkpoint values' tuple printed in the log and put it in the data tuple.
# Also set the global saved_file_counter to the number of the last file saved (check the numeric prefix of the files saved to Google Drive).
# ENV variable should be set to a valid option. Possible values are: 'colab', 'ubuntu'.
#
# global ENV
# ENV = 'colab'
# 
# global saved_file_counter
# saved_file_counter = 0
# 
# data = ('0', '0', '0')
# 
# restart_from_fail_point(data=data, check=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Formatting Seat Name column for garbage data.....
Concatenating dataframes.....
Formatting Seat Name column for garbage data.....
Concatenating dataframes.....
Formatting Seat Name column for garbage data.....
Concatenating dataframes.....
Formatting Seat Name column for garbage data.....
Concatenating dataframes.....
Formatting Seat Name column for garbage data.....
Concatenating dataframes.....
Returning main dataframe.....
Checkpoint values ('16', '310', '1240')
Saving file with name 2916. South 24-Parganas_SONARPUR_BANGOOGHLY-I_19-07-2013.csv to path /content/drive/My Drive/Gram Panchayat Wise
Selecting GP with key 1241
Waiting for page and elements to load.....
Polling date dictionary = OrderedDict([('2013-07-19', '19-07-2013')])
Selecting date 2013-07-19
Waiting for page and elements to load.....
Submitting form data.....
Waiting for page and elements to load.....
Fetched data from table = {0: [['Seat Name', 'Total 