In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import logging

In [2]:
# Configure logging in a single call. `force=True` replaces any handlers that
# are already attached to the root logger, so the level and the format are
# always applied together (a basicConfig call without `force` is a no-op when
# the root logger already has handlers).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)

In [3]:
# Show up to 500 characters per cell so long scraped text is not truncated.
pd.set_option("display.max_colwidth", 500)

In [4]:
# URL prefix of a toll plaza information page; the TollPlazaID is appended to it.
base_url = (
    "https://tis.nhai.gov.in/TollInformation.aspx"
    "?TollPlazaID="
)

In [5]:
# Load the list of toll plazas to enrich (the file is cp1252-encoded).
toll_plazas_df = pd.read_csv(
    'tool_plazas.csv',
    encoding='cp1252',
)

In [6]:
# Start one headless Chrome session; `driver` is reused for every page load
# performed by get_html_page_objects.
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode
driver = webdriver.Chrome(options=options)
# Element lookups wait up to 10 seconds for the element to appear before failing.
driver.implicitly_wait(10)

In [7]:
# Preview the first five input rows.
toll_plazas_df.head(5)

Unnamed: 0,Sr No.,State,NH-No.,Toll Plaza Name,Toll Plaza Location,Section / Stretch,TollPlazaID,latitude,longitude,place_id,address_partial_match
0,1,Andhra Pradesh,16,Aganampudi,Km 728.055,Vishakhapatnam - Ankapalli [Km 2.837 to &Km 395.870 to Km358.00(New Chainage From Km 700.544 to Km 740.255)],236,17.685417,83.149951,ChIJyaj65ttuOToRH26fq-li1fY,False
1,2,Andhra Pradesh,7 (new 44),Amakathadu,Km 250.700,Hyderabad Bangalore (km 211.000 to km 462.164),258,15.486477,77.900943,ChIJG6ZfoqN_tjsR6hw94uU86QQ,False
2,3,Andhra Pradesh,NH-216,Annampalli,Annampalli,Gurajanapalli To pasarlapudi,5977,16.672228,82.14739,ChIJcw8uBaz1NzoRMva9Ii5taRE,False
3,4,Andhra Pradesh,221,Badava,35.8,Imbrahimpatnam to AP Telangana Border,4486,16.850475,80.633584,ChIJr0zTpNLaNToReLaGbihNfUU,False
4,5,Andhra Pradesh,NH40,Bandaplli,119.945 Bandaplli,Rayachoty Kadapa Section,5697,14.131199,78.756682,ChIJ2T3B-NwFszsRs_e_vnu63yw,True


In [8]:
# Summarise column dtypes and non-null counts of the input table.
toll_plazas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1053 entries, 0 to 1052
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Sr No.                 1053 non-null   int64  
 1   State                  1053 non-null   object 
 2   NH-No.                 1053 non-null   object 
 3   Toll Plaza Name        1053 non-null   object 
 4   Toll Plaza Location    1053 non-null   object 
 5   Section / Stretch      1051 non-null   object 
 6   TollPlazaID            1053 non-null   int64  
 7   latitude               1053 non-null   float64
 8   longitude              1053 non-null   float64
 9   place_id               1053 non-null   object 
 10  address_partial_match  1053 non-null   bool   
dtypes: bool(1), float64(2), int64(2), object(6)
memory usage: 83.4+ KB


In [9]:
# Number of toll plazas in the input table.
len(toll_plazas_df)

1053

# The data below needs to be collected for each toll plaza

In [10]:
# def launch_toll_info_page(toll_plaza_id):
#     url = f"{base_url}{toll_plaza_id}"
#     # print(url)
#     return url
#     # driver.get(url)
    

### Construct all toll plaza urls

## 1) Toll info: Stretch, Tollable Length, Fee Effective Date, Due Date of Toll Revision

In [11]:
def get_toll_info(soup):
    """Extract stretch, tollable length and fee dates from the page's <p> tags.

    Each <p> element is flattened to text and searched for the labels
    "Stretch :", "Tollable Length :", "Fee Effective Date :" and
    "Due date of toll revision :".

    Parameters
    ----------
    soup : object
        Parsed page fragment exposing ``find_all("p")``; each returned element
        must provide ``get_text(sep, strip=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns "Stretch", "Tollable Length", "Fee Effective Date" and
        "Due Date of Toll Revision". One row per matching paragraph; a value
        that was not found on the page is left as ``None``.
    """
    stretch_label = "Stretch :"
    length_label = "Tollable Length :"
    effective_label = "Fee Effective Date :"
    due_label = "Due date of toll revision :"

    data = {"Stretch": [], "Tollable Length": [], "Fee Effective Date": [], "Due Date of Toll Revision": []}
    for p in soup.find_all("p"):
        text = p.get_text(" ", strip=True)
        if stretch_label in text:
            data["Stretch"].append(text.split(stretch_label)[1].split(length_label)[0].strip())
        if length_label in text:
            data["Tollable Length"].append(text.split(length_label)[1].strip())
        if effective_label in text:
            # Split on the due-date label rather than on "/", so dates that are
            # themselves written with slashes are kept intact and a missing
            # due date does not raise.
            after_effective = text.split(effective_label)[1]
            effective_part, found_due, due_part = after_effective.partition(due_label)
            # Drop the " / " separator that sits between the two dates.
            data["Fee Effective Date"].append(effective_part.strip().rstrip("/").strip())
            data["Due Date of Toll Revision"].append(due_part.strip() if found_due else None)

    # pandas requires equally long columns; pad the short ones so a page that
    # lacks one field still yields the fields that were found.
    longest = max(len(values) for values in data.values())
    for values in data.values():
        values.extend([None] * (longest - len(values)))
    return pd.DataFrame(data)

## 2) Concessions

In [12]:
def get_concession_table1(soup):
    """Return the text of the first <span> that follows the "Concessions" heading.

    Parameters
    ----------
    soup : object
        Parsed page fragment exposing ``find("h2", string=...)``.

    Returns
    -------
    pandas.DataFrame
        A single "Concessions" column. It holds one row with the span text, or
        no rows when the heading or the span is absent (previously this case
        raised because the result variable was never assigned).
    """
    concession_texts = []
    concession_section = soup.find("h2", string="Concessions")
    if concession_section is not None:
        concession_span = concession_section.find_next("span")
        if concession_span is not None:
            concession_texts.append(concession_span.text)
    return pd.DataFrame({'Concessions': concession_texts})

In [13]:
def get_concession_table(soup):
    """Extract the concession text that follows the "Concessions" heading.

    The first <span> after the heading is preferred. When there is no such
    span, the <li> items of the next <ul> are stripped and joined with ".".

    Parameters
    ----------
    soup : object
        Parsed page fragment exposing ``find("h2", string=...)``.

    Returns
    -------
    pandas.DataFrame
        A single "Concessions" column with one row, or no rows when the
        heading (or both the span and the list) is absent. Previously a
        missing heading raised because the result was never assigned.
    """
    concession_section = soup.find("h2", string="Concessions")
    if concession_section is None:
        return pd.DataFrame(columns=["Concessions"])

    concession_span = concession_section.find_next("span")
    if concession_span is not None:
        return pd.DataFrame({'Concessions': [concession_span.text]})

    # Fallback layout: the concessions are given as a bullet list.
    concession_list = concession_section.find_next("ul")
    if concession_list is None:
        return pd.DataFrame(columns=["Concessions"])
    concessions = ".".join(item.text.strip() for item in concession_list.find_all("li"))
    return pd.DataFrame([concessions], columns=["Concessions"])

## 3) Facilities available near Toll Plaza

In [14]:
def get_facilities_near_toll(soup):
    """Collect the "Facilities available near Toll Plaza" key/value rows.

    Every two-cell table row inside the <div> that follows the heading becomes
    one (facility, availability) pair; colons are removed from the facility
    name. The pairs are then pivoted so that each facility is a column.

    Returns
    -------
    pandas.DataFrame
        One row whose column labels are the facility names and whose values
        are the corresponding availability texts.
    """
    heading = soup.find("h2", string="Facilities available near Toll Plaza")
    pairs = []
    if heading:
        for table in heading.find_next("div").find_all("table"):
            for tr in table.find_all("tr"):
                cells = tr.find_all("td")
                if len(cells) != 2:
                    continue  # only label/value rows are of interest
                label = cells[0].text.replace(":", "").strip()
                pairs.append((label, cells[1].text.strip()))

    # Transpose so the first row holds the facility names, promote that row to
    # the header, and keep the remaining (values) row.
    wide = pd.DataFrame(pairs, columns=["Facility", "Availability"]).T
    wide = wide.reset_index(drop=True)
    wide.columns = wide.iloc[0]
    return wide[1:].reset_index(drop=True)

## 4) Announcement

In [15]:
def get_announcement_info(soup):
    """List the bullet items that follow the "Announcement" heading.

    Returns
    -------
    pandas.DataFrame
        A single "Announcement" column with one row per <li> item of the first
        <ul> after the heading (no rows when the heading is absent).

    Note: a page with several announcements yields several rows here.
    """
    heading = soup.find("h2", string="Announcement")
    if heading:
        bullet_items = heading.find_next("ul").find_all("li")
        notices = [li.text.strip() for li in bullet_items]
    else:
        notices = []
    return pd.DataFrame(notices, columns=["Announcement"])

## 5) Important Information

In [16]:
def get_important_info(soup):
    """Collect the key/value rows of the "Important Information" table.

    Every two-cell row of the first <table> after the heading becomes one
    (category, details) pair; colons are removed from the category name. The
    pairs are then pivoted so that each category is a column.

    Returns
    -------
    pandas.DataFrame
        One row whose column labels are the category names and whose values
        are the corresponding details.
    """
    heading = soup.find("h2", string="Important Information")
    entries = []
    if heading:
        table_rows = heading.find_next("table").find_all("tr")
        entries = [
            (cells[0].text.replace(":", "").strip(), cells[1].text.strip())
            for cells in (tr.find_all("td") for tr in table_rows)
            if len(cells) == 2
        ]

    # Transpose, promote the category row to the header, keep the details row.
    wide = pd.DataFrame(entries, columns=["Category", "Details"]).T
    wide = wide.reset_index(drop=True)
    wide.columns = wide.iloc[0]
    return wide.iloc[1:].reset_index(drop=True)

## 6) About toll construction

In [17]:
def get_toll_construction_info(table):
    """Turn a two-column parameter/value table into a single wide row.

    Parameters
    ----------
    table : object
        Parsed table element exposing ``find_all("tr")``; rows that do not
        have exactly two <td> cells are ignored.

    Returns
    -------
    pandas.DataFrame
        One row whose column labels are the stripped first-cell texts and
        whose values are the stripped second-cell texts.
    """
    cell_pairs = (tr.find_all("td") for tr in table.find_all("tr"))
    parameters = [
        (pair[0].text.strip(), pair[1].text.strip())
        for pair in cell_pairs
        if len(pair) == 2
    ]

    # Transpose, promote the parameter row to the header, keep the value row.
    wide = pd.DataFrame(parameters, columns=["Parameter", "Value"]).T
    wide = wide.reset_index(drop=True)
    wide.columns = wide.iloc[0]
    return wide[1:].reset_index(drop=True)

## 7) Toll Fees for vehicles

In [18]:
def get_toll_fees(tables):
    """Flatten the vehicle fee table into one wide row.

    The first five <th> texts are used as column headers. Every row after the
    first is read cell by cell (<th> and <td>), and each fee cell ends up in a
    column named "<Type of vehicle>_<fee header>".

    Parameters
    ----------
    tables : object
        Parsed fee table element exposing ``find_all``. The header set must
        contain a "Type of vehicle" column.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per (vehicle type, fee header) pair.
    """
    header_labels = [th.text.strip() for th in tables.find_all("th")][:5]
    body_rows = [
        [cell.text.strip() for cell in cells]
        # the first <tr> is the header row and is skipped
        for cells in (tr.find_all(["th", "td"]) for tr in tables.find_all("tr")[1:])
        if len(cells) > 0
    ]
    fee_grid = pd.DataFrame(body_rows, columns=header_labels)

    flat_labels = []
    flat_values = []
    for vehicle_idx in fee_grid.index:
        # every column except the first one carries a fee
        for fee_kind in list(fee_grid.columns)[1:]:
            flat_labels.append(f"{fee_grid.at[vehicle_idx, 'Type of vehicle']}_{fee_kind}")
            flat_values.append(fee_grid.at[vehicle_idx, fee_kind])
    return pd.DataFrame([flat_values], columns=flat_labels)

# Get HTML page objects for the given toll plaza, e.g. tables and textual information

In [38]:
def get_html_page_objects(plaza_url):
    """Load one toll plaza page and return all scraped sections as one frame.

    The page is opened with the notebook-level selenium ``driver``; the outer
    HTML of the ``div.PA15`` element is parsed and handed to the section
    extractors. The first ``tollinfotbl`` table is read as the fee table and
    the second one as the project information table.

    Parameters
    ----------
    plaza_url : str
        Full URL of the toll plaza information page.

    Returns
    -------
    pandas.DataFrame
        The section frames concatenated side by side (``axis=1``). An empty
        DataFrame is returned when loading or parsing fails; the reason is
        logged as a warning instead of being discarded silently.
    """
    try:
        driver.get(plaza_url)
        html_content = driver.find_element(By.CSS_SELECTOR, 'div.PA15').get_attribute("outerHTML")
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        add_info_df = get_toll_info(soup)
        df_concession = get_concession_table(soup)
        df_facilities = get_facilities_near_toll(soup)
        df_announcements = get_announcement_info(soup)
        df_important_info = get_important_info(soup)

        # Find all tables: index 0 holds the fees, index 1 the project details.
        tables = soup.find_all("table", class_="tollinfotbl")
        df_project_info = get_toll_construction_info(tables[1])
        toll_price_df = get_toll_fees(tables[0])

        frames = [add_info_df, df_concession, df_facilities, df_announcements,
                  df_important_info, df_project_info, toll_price_df]
        derived_df = pd.concat(frames, axis=1)
        logging.debug("Derived data frame shape = %s", derived_df.shape)
    except Exception as e:
        # Best effort: the caller treats an empty frame as a failed plaza, but
        # record why it failed so failures can be diagnosed afterwards.
        logging.warning("Could not extract %s (%s: %s)", plaza_url, type(e).__name__, e)
        return pd.DataFrame()
    return derived_df

# Main starts here

In [35]:
# Report how many plazas will be processed.
print(f"Total number of toll plazas = {len(toll_plazas_df)}")

Total number of toll plazas = 1053


In [36]:
# toll_plaza_urls[0:2]

In [37]:
# Scrape every toll plaza page, log a PASS/FAIL line per plaza (to stdout and
# to enriched_data_log.txt), and assemble the results in `enriched_df`.
header_row = "SR.No|Toll Plaza Name|Toll Plaza ID|URL|Status"
print(header_row)

# Collect the per-plaza frames and concatenate once at the end: growing a
# DataFrame inside the loop is quadratic and relied on the private
# DataFrame._append method.
plaza_frames = []
try:
    with open("enriched_data_log.txt", 'a') as file:
        file.write(header_row.replace("|", ",") + "\n")
        plaza_columns = zip(toll_plazas_df['Toll Plaza Name'], toll_plazas_df['TollPlazaID'])
        for idx, (toll_plaza_name, toll_plaza_id) in enumerate(plaza_columns):
            plaza_url = f"{base_url}{toll_plaza_id}"

            df = get_html_page_objects(plaza_url)
            if not df.empty:
                df.insert(0, 'toll_plaza_id', toll_plaza_id)
                status = "PASS"
            else:
                # Keep a placeholder row so failed plazas remain visible.
                df = pd.DataFrame(data={'toll_plaza_id': [toll_plaza_id]})
                status = "FAIL"
            plaza_frames.append(df)

            # Build both renderings from the fields themselves, so a comma
            # inside a value is not turned into a "|" on screen.
            fields = [str(idx), str(toll_plaza_name), str(toll_plaza_id), plaza_url, status]
            log = ",".join(fields)
            print("|".join(fields))
            file.write(log + "\n")
            file.flush()  # keep the log current during the long-running scrape
finally:
    # Also runs when the loop is interrupted, so partial results stay usable.
    enriched_df = pd.concat(plaza_frames, ignore_index=True) if plaza_frames else pd.DataFrame()

SR.No|Toll Plaza Name|Toll Plaza ID|URL|Status
0|Aganampudi|236|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=236|PASS
1|Amakathadu|258|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=258|PASS
2|Annampalli|5977|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5977|PASS
3|Badava|4486|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=4486|PASS
4|Bandaplli|5697|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5697|PASS
5|Bandlapalli|5952|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5952|PASS
6|Basapuram|4542|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=4542|PASS
7|Bathalapalli|5753|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5753|PASS
8|Bellupada|233|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=233|PASS
9|Bollapalli|252|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=252|PASS
10|Brahmanapalli|4495|https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=4495|PASS
11|Buchireddypalem|4557|ht

In [95]:
# (rows, columns) of the scraped data.
enriched_df.shape

(1065, 58)

In [96]:
# (rows, columns) of the input table, for comparison with the scraped data.
toll_plazas_df.shape

(1053, 11)

In [101]:
# Attach the scraped columns to every input plaza; plazas without a scraped
# match are kept (left join).
merged_df = pd.merge(
    toll_plazas_df,
    enriched_df,
    how='left',
    left_on='TollPlazaID',
    right_on='toll_plaza_id',
)

In [120]:
# merged_df.iloc[319:321][['Toll Plaza Name','Concessions']]

In [118]:
# Spot-check the 'Concessions' value of the row at position 319.
merged_df.iloc[319][['Concessions']].values

array([''], dtype=object)

In [102]:
# Keep one row per input plaza ('Sr No.'), retaining the first occurrence.
merged_df = merged_df.drop_duplicates(subset=['Sr No.'], keep='first')

In [103]:
# (rows, columns) of the merged table.
merged_df.shape

(1053, 69)

In [105]:
import numpy as np

In [119]:
# Mark missing values and empty strings as 'NA'.
# `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
merged_df = merged_df.replace(np.nan, 'NA').replace('', 'NA')

In [122]:
# Write the enriched table to disk without the index column.
merged_df.to_csv(
    'toll_plaza_data.csv',
    index=False,
)