# Python Web Scraping Notebook


This notebook scrapes administrative data from an Indian government website, specifically data on state-level water and sanitation schemes.
    

In [None]:

# Importing the necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
    

In [None]:

# Start the browser, open the dashboard, pick the financial year and open the
# "swachhta karmi" report. Defines: driver, homepage_url, wait, source_code.

# Launch Chrome using a driver binary resolved by webdriver_manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
homepage_url = "http://52.172.141.50/slwm/Dashboard.aspx"
driver.get(homepage_url)

# Shared explicit wait (up to 100 seconds) reused by the later cells.
wait = WebDriverWait(driver, 100)

# Wait for the financial-year dropdown instead of assuming it is already in
# the DOM, then select the year by its visible label.
cmb_fy_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'cmb_fy')))
select = Select(cmb_fy_dropdown)
select.select_by_visible_text("2022-2023")

# The page may still be updating after the selection, so wait until the
# report link is actually clickable before clicking it.
link_element = wait.until(
    EC.element_to_be_clickable((By.ID, 'lnk_tot_swachhta_karmi'))
)
link_element.click()

# Later cells parse the first <table> of this HTML, so make sure a table is
# present before taking the snapshot of the whole document.
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
elem = driver.find_element(By.XPATH, "//*")
source_code = elem.get_attribute("outerHTML")
    

In [None]:

# Parse the captured HTML and collect the district names from the links in
# the first table on the page. Defines: soup, table, district, districts.
soup = BeautifulSoup(source_code, 'lxml')
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("No <table> found in the scraped page; cannot list districts.")
district = table.find_all('a')
# The names are later used with By.LINK_TEXT and in output file names, so
# collapse runs of whitespace/newlines to match the rendered link text and
# drop anchors that have no text at all.
districts = [
    name
    for name in (" ".join(link.get_text().split()) for link in district)
    if name
]
    

In [None]:

# Visit every district page and collect the block names listed there.
# block_name is one flat list covering all districts, in visiting order.
block_name = []
for dis in districts:
    district_link = driver.find_element(By.LINK_TEXT, dis)
    district_link.click()

    try:
        elem = driver.find_element(By.XPATH, "//body")
        source_code = elem.get_attribute("outerHTML")
        soup = BeautifulSoup(source_code, 'lxml')
        table = soup.find('table')
        if table is None:
            print(f"No table found for district {dis}; skipping.")
            continue
        block_links = table.find_all('a')
        for link in block_links:
            # Normalise whitespace so the name matches the rendered link text
            # used for By.LINK_TEXT lookups later; skip text-less anchors.
            name = " ".join(link.get_text().split())
            if name:
                block_name.append(name)
    finally:
        # Always return to the district list, even if parsing failed, so the
        # next district link can be found.
        driver.back()
    

In [None]:

# For every district, open each of its blocks, read the first table on the
# block page and write it to one sheet of '<district>_karmiii_data2223.xlsx'.


def _excel_sheet_name(name):
    """Return *name* adjusted to Excel's worksheet-name rules.

    Excel rejects the characters [ ] : * ? / \\ and names longer than
    31 characters, so those characters are replaced by '_' and the result
    is truncated.
    """
    cleaned = "".join("_" if ch in "[]:*?/\\" else ch for ch in name)
    return cleaned[:31]


# block_name is a flat list over all districts and may repeat a name that
# occurs in several districts; keep the first occurrence of each so a block
# is not processed twice within the same district.
unique_blocks = list(dict.fromkeys(block_name))

for dis in districts:
    district_link = driver.find_element(By.LINK_TEXT, dis)
    district_link.click()

    with pd.ExcelWriter(f'{dis}_karmiii_data2223.xlsx', engine='xlsxwriter') as writer:
        for block in unique_blocks:
            # Blocks belonging to other districts have no link on this page;
            # skip them instead of relying on an exception.
            if not driver.find_elements(By.LINK_TEXT, block):
                continue

            try:
                block_link = wait.until(
                    EC.element_to_be_clickable((By.LINK_TEXT, block))
                )
                block_link.click()
            except Exception as e:
                # Nothing was opened, so there is nothing to navigate back from.
                print(f"Error processing {dis} - {block}: {str(e)}")
                continue

            try:
                elem = driver.find_element(By.XPATH, "//body")
                source_code = elem.get_attribute("outerHTML")
                soup = BeautifulSoup(source_code, 'lxml')

                table = soup.find('table')
                table_rows = table.find_all('tr')
                world_titles = table.find_all('th')
                world_table_titles = [title.text for title in world_titles]

                data_rows = []
                for row in table_rows:
                    row_data = [data.text.strip() for data in row.find_all('td')]
                    # Header rows contain only <th> cells; keeping them would
                    # add an all-empty row to the sheet.
                    if row_data:
                        data_rows.append(row_data)

                # Fall back to default column labels when the table has no
                # <th> cells at all.
                df = pd.DataFrame(data_rows, columns=world_table_titles or None)
                df.to_excel(writer, sheet_name=_excel_sheet_name(block), index=False)
            except Exception as e:
                print(f"Error processing {dis} - {block}: {str(e)}")
            finally:
                # Return to the district page even after a failure so the
                # remaining blocks can still be found. driver.back() is used
                # rather than running history.go(-1) via execute_script so
                # the next lookup does not race the navigation.
                driver.back()

    # Back from the district page to the district list.
    driver.back()
    

In [None]:

# Visit every block of every district and collect the panchayat names, i.e.
# the link texts in the first table of each block page.
panchayat_list = []

# block_name is a flat list over all districts and may repeat a name; keep
# one of each so a block's panchayats are not collected twice per district.
unique_blocks = list(dict.fromkeys(block_name))

for dis in districts:
    district_link = driver.find_element(By.LINK_TEXT, dis)
    district_link.click()

    for block in unique_blocks:
        # Blocks belonging to other districts have no link on this page;
        # skip them instead of relying on an exception.
        if not driver.find_elements(By.LINK_TEXT, block):
            continue

        try:
            block_link = wait.until(
                EC.element_to_be_clickable((By.LINK_TEXT, block))
            )
            block_link.click()
        except Exception as e:
            # Nothing was opened, so there is nothing to navigate back from.
            print(f"Error processing {dis} - {block}: {str(e)}")
            continue

        try:
            elem = driver.find_element(By.XPATH, "//body")
            source_code = elem.get_attribute("outerHTML")
            soup = BeautifulSoup(source_code, 'lxml')

            table = soup.find('table')
            world_titles = table.find_all('a')
            world_table_titles = [title.text for title in world_titles]

            panchayat_list.extend(world_table_titles)
        except Exception as e:
            print(f"Error processing {dis} - {block}: {str(e)}")
        finally:
            # Return to the district page even after a failure so the
            # remaining blocks can still be found.
            driver.back()

    # Back from the district page to the district list.
    driver.back()
    