In [1]:
import os
from time import sleep
import pandas as pd

import ssl
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

## User Input

In [9]:
waitTime = 3  # Time in seconds to wait between web requests

# Every state/UT result page known to this notebook.
#   'url': constituency-wise result page for that state
#   'id' : id of the <input> element whose value lists the state's constituencies
all_state_id = {
    'Assam': {'url': 'https://results.eci.gov.in/Result2021/ConstituencywiseS0334.htm',
              'id': 'S03'},
    'Kerala': {'url': 'https://results.eci.gov.in/Result2021/ConstituencywiseS11115.htm',
               'id': 'S11'},
    'Puducherry': {'url': 'https://results.eci.gov.in/Result2021/ConstituencywiseU0719.htm',
                   'id': 'U07'},
    'Tamil Nadu': {'url': 'https://results.eci.gov.in/Result2021/ConstituencywiseS2228.htm',
                   'id': 'S22'},
    'West Bengal': {'url': 'https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm',
                    'id': 'S25'},
}

# States scraped in this run. Edit this list instead of re-typing the
# dictionary; the order here is the order in which states are processed.
states_to_scrape = ['Tamil Nadu', 'West Bengal']
state_id = {state: all_state_id[state] for state in states_to_scrape}

## Functions for scraping

In [3]:
def download_page(url, timeout=30):
    """Download ``url`` and return its content parsed by BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds a blocking network operation may take before ``urlopen``
        gives up. Defaults to 30 so a stalled connection cannot hang the
        whole scraping run.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``html.parser`` backend, decoded
        as UTF-8.

    Raises
    ------
    urllib.error.URLError
        If the request fails (``HTTPError`` for HTTP error statuses).
    """
    # Send a browser-like User-Agent with the request.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)

    # NOTE(security): certificate and hostname verification are switched off
    # here. This is the same behaviour the protocol-less ``ssl.SSLContext()``
    # gave implicitly (that form is deprecated since Python 3.10); it is now
    # spelled out so the trade-off is visible. Use
    # ``ssl.create_default_context()`` instead if the server presents a
    # certificate chain that validates.
    gcontext = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    gcontext.check_hostname = False  # must be disabled before verify_mode
    gcontext.verify_mode = ssl.CERT_NONE

    # Parse while the response is still open, then let the context manager
    # close the connection instead of leaking it.
    with urlopen(req, timeout=timeout, context=gcontext) as page:
        soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
    return soup

In [4]:
def build_constituencyid(state_id):
    """Build the constituency id -> name mapping for every requested state.

    One result page is downloaded (the page of the first state in
    ``state_id``); for each state the ``<input>`` element whose id equals the
    state's ``'id'`` is looked up in that page and its ``value`` attribute,
    a ``;``-separated list of ``"<cid>,<name>"`` entries, is parsed.

    Parameters
    ----------
    state_id : dict
        Maps state name to a dict with the keys ``'url'`` and ``'id'``.

    Returns
    -------
    dict
        ``{state: {cid (int): constituency name (str)}}``. Empty when
        ``state_id`` is empty.

    Raises
    ------
    ValueError
        If the page has no ``<input>`` (or no ``value``) for a state's id, or
        an entry is not of the form ``"<cid>,<name>"`` with an integer cid.
    """
    constituencyid = {}
    if not state_id:
        # Nothing requested: there is no page to download.
        return constituencyid

    # Download any one page to get all ids
    first_state = next(iter(state_id))
    soup = download_page(state_id[first_state]['url'])

    for state in state_id:
        element_id = state_id[state]['id']
        element = soup.find("input", {"id": element_id})
        raw_value = element.get('value') if element is not None else None
        if raw_value is None:
            raise ValueError(
                "No <input id={!r}> with a value found for state {!r}".format(
                    element_id, state))

        constituencyid[state] = {}
        for id_name in raw_value.strip().split(';'):
            id_name = id_name.strip()
            # The list ends with ';', so the last element is empty; skip it
            # (and any other blank entry).
            if not id_name:
                continue
            # Split on the first comma only so a name containing a comma
            # stays intact.
            cid, name = id_name.split(',', 1)
            constituencyid[state][int(cid)] = name.strip()
    return constituencyid

In [5]:
def generate_url(base_url, cid):
    """Return ``base_url`` with ``cid`` appended as the ``ac`` query parameter."""
    return f"{base_url}?ac={cid}"

In [6]:
def parse_content(page_content, cid, cname):
    """Extract the candidate rows of the result table from a parsed page.

    Parameters
    ----------
    page_content : BeautifulSoup
        Parsed page, as returned by ``download_page``.
    cid
        Constituency id stored in the ``cid`` column of every row.
    cname
        Constituency name stored in the ``cname`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per matching table row with the columns ``cid``, ``cname``,
        ``O.S.N.``, ``Candidate``, ``Party``, ``EVM Votes``, ``Postal Votes``,
        ``Total Votes`` and ``PctVotes``. Cell values are the raw text of the
        cells. The frame is empty when the page has no result table or the
        table has no matching rows.

    Raises
    ------
    IndexError
        If a matching row has fewer than seven child nodes.
    """
    # Get the main result table using its style
    table_style = 'margin: auto; width: 100%; font-family: Verdana; border: solid 1px black;font-weight:lighter'
    tables = page_content.find_all('table', attrs={'style': table_style})
    if not tables:
        # No result table on this page: report "nothing parsed" instead of
        # failing, so the caller's emptiness check can skip the page.
        return pd.DataFrame([])

    # Several tables may match; the result table is the first one
    rows = tables[0].find_all('tr', attrs={'style': 'font-size:12px;'})

    # Output column for each child position of a row, in page order
    field_names = ('O.S.N.', 'Candidate', 'Party', 'EVM Votes',
                   'Postal Votes', 'Total Votes', 'PctVotes')
    result = []
    for row in rows:
        cols = list(row.children)
        record = {'cid': cid, 'cname': cname}
        for position, field in enumerate(field_names):
            record[field] = cols[position].text
        result.append(record)
    return pd.DataFrame(result)

In [10]:
# Reusable scraper: extracts the data for a single state.
def scraper(base_url, constituencyid, output_path, known_cid):
    """Scrape every not-yet-known constituency of one state into a CSV file.

    Constituencies are visited in ascending id order. For each one the page
    is downloaded and parsed; non-empty results are appended to
    ``output_path`` (the header is written only while the file is still
    empty). After every request the function pauses for the module-level
    ``waitTime`` seconds.

    Parameters
    ----------
    base_url : str
        Result page of the state; the constituency id is appended to it by
        ``generate_url``.
    constituencyid : dict
        Maps constituency id to constituency name for this state.
    output_path : str
        CSV file the results are appended to. Its parent directory is
        created if it does not exist.
    known_cid : collection
        Constituency ids to skip (checked with ``in``).

    Returns
    -------
    None
    """
    # Create the destination folder up front so the first append cannot fail
    # with FileNotFoundError.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for cid in sorted(constituencyid):
        if cid in known_cid:
            continue
        url = generate_url(base_url, cid)
        print("URL: ", url, end=' ')
        page_content = download_page(url)
        print("Downloaded", end=' ')
        result = parse_content(page_content, cid, constituencyid[cid])
        print("Parsed", end=' ')
        if len(result) > 0:
            # newline='' lets the CSV writer control line endings (otherwise
            # rows are separated by blank lines on Windows); UTF-8 matches
            # the default encoding pandas.read_csv uses when reading it back.
            with open(output_path, 'a', newline='', encoding='utf-8') as f:
                # Position 0 in append mode means the file is empty, so this
                # is the first chunk and needs the header row.
                result.to_csv(f, header=f.tell() == 0, index=False)
            print("Dumped", end=' ')

        # Take time between requests
        print('Waiting')
        sleep(waitTime)

## Main Function

In [None]:
# Look up the constituency ids of all configured states once, then scrape
# the states one after another into Data/<state>.csv.
constituencyid = build_constituencyid(state_id)
for state, state_info in state_id.items():
    filename = "{}.csv".format(state)
    output_path = os.path.join('Data', filename)
    print("Processing state: {}, output_file: {}".format(state, output_path))
    # Resume support: constituencies already present in an existing output
    # file are passed to the scraper as known, so they are not fetched again.
    if os.path.exists(output_path):
        data = pd.read_csv(output_path)
        known_cid = set(data['cid'].unique())
    else:
        known_cid = set()
    scraper(state_info['url'], constituencyid[state], output_path, known_cid)

Processing state: Tamil Nadu, output_file: Data/Tamil Nadu.csv
Processing state: West Bengal, output_file: Data/West Bengal.csv
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=73 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=74 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=75 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=76 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=77 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=78 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=79 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS2512.htm?ac=80 Downloaded Parsed Dumped
URL:  https://results.eci.gov.in/Result2021/ConstituencywiseS251