In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests

In [2]:
pd.options.display.max_colwidth=500

In [3]:
base_url="https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID="

In [4]:
toll_plazas_df=pd.read_csv('tool_plazas.csv',encoding='cp1252')

In [5]:
# Start one headless Chrome session; the scraping cells below all reuse `driver`.
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode
driver = webdriver.Chrome(options=options)
# Implicit wait: element lookups keep polling for up to 10 seconds before failing.
driver.implicitly_wait(10)

In [6]:
toll_plazas_df.head()

Unnamed: 0,Sr No.,State,NH-No.,Toll Plaza Name,Toll Plaza Location,Section / Stretch,TollPlazaID,latitude,longitude,place_id,address_partial_match
0,1,Andhra Pradesh,16,Aganampudi,Km 728.055,Vishakhapatnam - Ankapalli [Km 2.837 to &Km 395.870 to Km358.00(New Chainage From Km 700.544 to Km 740.255)],236,17.685417,83.149951,ChIJyaj65ttuOToRH26fq-li1fY,False
1,2,Andhra Pradesh,7 (new 44),Amakathadu,Km 250.700,Hyderabad Bangalore (km 211.000 to km 462.164),258,15.486477,77.900943,ChIJG6ZfoqN_tjsR6hw94uU86QQ,False
2,3,Andhra Pradesh,NH-216,Annampalli,Annampalli,Gurajanapalli To pasarlapudi,5977,16.672228,82.14739,ChIJcw8uBaz1NzoRMva9Ii5taRE,False
3,4,Andhra Pradesh,221,Badava,35.8,Imbrahimpatnam to AP Telangana Border,4486,16.850475,80.633584,ChIJr0zTpNLaNToReLaGbihNfUU,False
4,5,Andhra Pradesh,NH40,Bandaplli,119.945 Bandaplli,Rayachoty Kadapa Section,5697,14.131199,78.756682,ChIJ2T3B-NwFszsRs_e_vnu63yw,True


In [7]:
toll_plazas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1053 entries, 0 to 1052
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Sr No.                 1053 non-null   int64  
 1   State                  1053 non-null   object 
 2   NH-No.                 1053 non-null   object 
 3   Toll Plaza Name        1053 non-null   object 
 4   Toll Plaza Location    1053 non-null   object 
 5   Section / Stretch      1051 non-null   object 
 6   TollPlazaID            1053 non-null   int64  
 7   latitude               1053 non-null   float64
 8   longitude              1053 non-null   float64
 9   place_id               1053 non-null   object 
 10  address_partial_match  1053 non-null   bool   
dtypes: bool(1), float64(2), int64(2), object(6)
memory usage: 83.4+ KB


In [8]:
toll_plazas_df.shape[0]

1053

# The data below needs to be collected for each toll plaza

In [9]:
# def launch_toll_info_page(toll_plaza_id):
#     url = f"{base_url}{toll_plaza_id}"
#     # print(url)
#     return url
#     # driver.get(url)
    

### Construct all toll plaza urls

## 1) Toll info: Stretch, Tollable Length, Fee Effective Date, Due Date of Toll Revision

In [10]:
def get_toll_info(soup):
    """Extract the stretch, tollable length and fee dates from the page's <p> tags.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stretch``, ``Tollable Length``, ``Fee Effective Date`` and
        ``Due Date of Toll Revision``, one row per match. A field that is
        missing from the page is filled with ``None`` so that all columns
        have the same length. The frame has no rows when nothing matches.
    """
    stretch_label = "Stretch :"
    length_label = "Tollable Length :"
    fee_label = "Fee Effective Date :"
    due_label = "Due date of toll revision :"

    data = {"Stretch": [], "Tollable Length": [], "Fee Effective Date": [], "Due Date of Toll Revision": []}
    for p in soup.find_all("p"):
        text = p.get_text(" ", strip=True)
        if stretch_label in text:
            # The stretch and the tollable length share one paragraph.
            data["Stretch"].append(text.split(stretch_label)[1].split(length_label)[0].strip())
        if length_label in text:
            data["Tollable Length"].append(text.split(length_label)[1].strip())
        if fee_label in text:
            # Split on the labels rather than on "/": a date written with
            # slashes, or a missing revision label, must not break parsing.
            after_fee = text.split(fee_label, 1)[1]
            fee_text, due_found, due_text = after_fee.partition(due_label)
            data["Fee Effective Date"].append(fee_text.strip().rstrip("/").strip())
            data["Due Date of Toll Revision"].append(due_text.strip() if due_found else None)

    # pd.DataFrame rejects columns of unequal length, so pad the short ones.
    n_rows = max(len(values) for values in data.values())
    for values in data.values():
        values.extend([None] * (n_rows - len(values)))
    return pd.DataFrame(data)

## 2) Concessions

In [11]:
def get_concession_table1(soup):
    """Return the concession text found in the <span> after the "Concessions" heading.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        A single ``Concessions`` column holding the span text, or a frame
        with that column and no rows when the heading or the span is absent
        (previously these cases raised UnboundLocalError / AttributeError).
    """
    concession_section = soup.find("h2", string="Concessions")
    concession_span = concession_section.find_next("span") if concession_section else None
    if concession_span is None:
        return pd.DataFrame(columns=["Concessions"])
    return pd.DataFrame({'Concessions': [concession_span.text]})

In [12]:
def get_concession_table(soup):
    """Extract the concession text that follows the "Concessions" heading.

    The text is read from the next <span>; when there is none, the <li>
    items of the next <ul> are joined with "." instead.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        A single ``Concessions`` column with one row, or a frame with that
        column and no rows when the heading, span and list are all absent
        (previously these cases raised UnboundLocalError / AttributeError).
    """
    concession_section = soup.find("h2", string="Concessions")
    if not concession_section:
        return pd.DataFrame(columns=["Concessions"])

    # NOTE(review): find_next searches the rest of the document, not just this
    # section, so a <span> belonging to a later section could be picked up
    # here when the concessions themselves are in a <ul> - confirm on the page.
    concession_span = concession_section.find_next("span")
    if concession_span is not None:
        return pd.DataFrame({'Concessions': [concession_span.text]})

    concession_list = concession_section.find_next("ul")
    if concession_list is None:
        return pd.DataFrame(columns=["Concessions"])
    concessions = ".".join(item.text.strip() for item in concession_list.find_all("li"))
    return pd.DataFrame([concessions], columns=["Concessions"])

## 3) Facilities available near Toll Plaza

In [13]:
def get_facilities_near_toll(soup):
    """Extract the "Facilities available near Toll Plaza" section as one wide row.

    Every two-cell table row in the <div> after the heading becomes a column:
    the first cell (colons removed) is the column name, the second the value.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        One row with one column per facility. When the heading, the <div> or
        the rows are missing, the frame has one row and no columns (the
        missing <div> case previously raised AttributeError).
    """
    facilities_section = soup.find("h2", string="Facilities available near Toll Plaza")
    container = facilities_section.find_next("div") if facilities_section else None

    pairs = []
    if container is not None:
        for table in container.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) == 2:
                    pairs.append((cells[0].text.replace(":", "").strip(), cells[1].text.strip()))

    if not pairs:
        # Keep the historical "one row, no columns" shape for empty sections.
        return pd.DataFrame(index=[0])
    facility_names, availability = zip(*pairs)
    return pd.DataFrame([list(availability)], columns=list(facility_names))

## 4) Announcement

In [14]:
def get_announcement_info(soup):
    """Extract the items of the list that follows the "Announcement" heading.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        A single ``Announcement`` column with one row per <li> item. The
        frame has no rows when the heading or the <ul> is missing (the
        missing <ul> case previously raised AttributeError).
    """
    announcement_section = soup.find("h2", string="Announcement")
    announcement_list = announcement_section.find_next("ul") if announcement_section else None

    announcements = []
    if announcement_list is not None:
        announcements = [item.text.strip() for item in announcement_list.find_all("li")]
    # NOTE(review): several announcements produce several rows, which also
    # yields several rows per plaza once concatenated column-wise by the caller.
    return pd.DataFrame(announcements, columns=["Announcement"])

## 5) Important Information

In [15]:
def get_important_info(soup):
    """Extract the "Important Information" table as one wide row.

    Every two-cell row of the table after the heading becomes a column:
    the first cell (colons removed) is the column name, the second the value.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        One row with one column per category. When the heading, the table or
        the rows are missing, the frame has one row and no columns (the
        missing table case previously raised AttributeError).
    """
    important_info_section = soup.find("h2", string="Important Information")
    info_table = important_info_section.find_next("table") if important_info_section else None

    pairs = []
    if info_table is not None:
        for row in info_table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) == 2:
                pairs.append((cells[0].text.replace(":", "").strip(), cells[1].text.strip()))

    if not pairs:
        # Keep the historical "one row, no columns" shape for empty sections.
        return pd.DataFrame(index=[0])
    categories, details = zip(*pairs)
    return pd.DataFrame([list(details)], columns=list(categories))

## 6) About toll construction

In [16]:
def get_toll_construction_info(table):
    """Turn the two-column project information table into one wide row.

    Rows that do not have exactly two <td> cells are ignored. The first cell
    of each kept row becomes a column name and the second cell its value.

    Parameters
    ----------
    table : bs4.element.Tag
        The project information <table> element.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are the parameter names found in the table.
    """
    pairs = [
        (cells[0].text.strip(), cells[1].text.strip())
        for cells in (tr.find_all("td") for tr in table.find_all("tr"))
        if len(cells) == 2
    ]
    long_form = pd.DataFrame(pairs, columns=["Parameter", "Value"])
    # Transpose so the parameter names form the first row, promote that row
    # to the header, then keep only the row of values.
    wide_form = long_form.T.reset_index(drop=True)
    wide_form.columns = wide_form.iloc[0]
    return wide_form.iloc[1:].reset_index(drop=True)

## 7) Toll Fees for vehicles

In [17]:
def get_toll_fees(tables):
    """Flatten the toll fee table into one wide row.

    The first <tr> is treated as the header row. For every following row the
    first cell is the vehicle type and each remaining header produces one
    output column named ``"<vehicle type>_<header>"``.

    The headers are read from the header row itself rather than from the
    first five <th> cells of the table, so tables with a different number of
    fee columns (or a differently named first column) are handled as well.

    Parameters
    ----------
    tables : bs4.element.Tag
        The toll fee <table> element (a single table, despite the name).

    Returns
    -------
    pandas.DataFrame
        One row of fee values. A row with fewer cells than the header gets
        ``None`` for the missing fees. An empty table yields a frame with one
        row and no columns.
    """
    rows = tables.find_all("tr")
    if not rows:
        return pd.DataFrame(index=[0])

    headers = [cell.text.strip() for cell in rows[0].find_all(["th", "td"])]
    fee_headers = headers[1:]  # the first header labels the vehicle type column

    column_names = []
    row_values = []
    for row in rows[1:]:  # Skip header row
        cells = [cell.text.strip() for cell in row.find_all(["th", "td"])]
        if not cells:
            continue
        vehicle_type = cells[0]
        for position, fee_header in enumerate(fee_headers, start=1):
            column_names.append(f"{vehicle_type}_{fee_header}")
            row_values.append(cells[position] if position < len(cells) else None)

    if not column_names:
        return pd.DataFrame(index=[0])
    return pd.DataFrame([row_values], columns=column_names)

# Main: Get HTML page objects for the given toll plaza, e.g. tables and textual information

In [61]:
def get_html_page_objects(plaza_url):
    """Scrape one toll plaza page and combine all of its sections column-wise.

    Uses the module-level Selenium ``driver`` to load the page, parses the
    ``div.PA15`` block with BeautifulSoup and runs every section extractor.
    Each extractor is isolated, so a failure in one section is reported and
    skipped instead of discarding the data of the whole plaza.

    Parameters
    ----------
    plaza_url : str
        URL of the toll plaza information page.

    Returns
    -------
    pandas.DataFrame
        The section frames concatenated with ``axis=1``, or an empty frame
        when the page block cannot be loaded or no section yields data.
        The function does not raise.
    """
    try:
        driver.get(plaza_url)
        html_content = driver.find_element(By.CSS_SELECTOR, 'div.PA15').get_attribute("outerHTML")
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")
        # Find all tables
        tables = soup.find_all("table", class_="tollinfotbl")
    except Exception as e:
        print(type(e).__name__)
        return pd.DataFrame()

    # (label, extractor, input) in the order the columns should appear.
    # tables[0] holds the toll fees and tables[1] the project information.
    sections = [
        ("toll info", get_toll_info, soup),
        ("concessions", get_concession_table, soup),
        ("facilities", get_facilities_near_toll, soup),
        ("announcements", get_announcement_info, soup),
        ("important information", get_important_info, soup),
        ("project information", get_toll_construction_info, tables[1] if len(tables) > 1 else None),
        ("toll fees", get_toll_fees, tables[0] if len(tables) > 0 else None),
    ]

    frames = []
    for label, extractor, source in sections:
        if source is None:
            print(f"{label}: table not found on {plaza_url}")
            continue
        try:
            frames.append(extractor(source))
        except Exception as e:
            print(f"{label}: {type(e).__name__}: {e}")

    if not frames:
        return pd.DataFrame()
    try:
        derived_df = pd.concat(frames, axis=1)
    except Exception as e:
        print(type(e).__name__)
        return pd.DataFrame()
    return derived_df

In [19]:
print(f"Total number of toll plazas = {toll_plazas_df.shape[0]}")

Total number of toll plazas = 1053


In [87]:
# Number of plazas to build URLs for while testing; set to None to use every row.
MAX_PLAZAS = 2

toll_plaza_urls = []
print("Construct url for Toll Plaza Name:Toll Plaza ID:URL")
for idx in toll_plazas_df.index[:MAX_PLAZAS]:
    # .loc avoids chained indexing (column first, then row).
    toll_plaza_name = toll_plazas_df.loc[idx, 'Toll Plaza Name']
    toll_plaza_id = toll_plazas_df.loc[idx, 'TollPlazaID']
    plaza_url = f"{base_url}{toll_plaza_id}"
    print(f"{idx}:{toll_plaza_name}:{toll_plaza_id}:{plaza_url}")
    toll_plaza_urls.append(plaza_url)

Construct url for Toll Plaza Name:Toll Plaza ID:URL
0:Aganampudi:236:https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=236
1:Amakathadu:258:https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=258


In [88]:
print(f"Total number of toll plazas urls constructed = {len(toll_plaza_urls)}")

Total number of toll plazas urls constructed = 2


In [90]:
toll_plaza_urls.extend(['https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311','https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5939'])

In [91]:
toll_plaza_urls

['https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=236',
 'https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=258',
 'https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311',
 'https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5939']

In [92]:
# Scrape every URL and stack the per-plaza rows. A plaza that yields no data
# still gets a placeholder row holding only its id, so it can be retried later.
plaza_frames = []
for url in toll_plaza_urls:
    print(url)
    toll_plaza_id_derived = int(url.split("=")[1])
    df = get_html_page_objects(url)
    if not df.empty:
        df.insert(0, 'toll_plaza_id', toll_plaza_id_derived)
    else:
        # Report the URL of this iteration (the previous code printed
        # `test_url`, a variable left over from another cell).
        print(f"No data received for {url}")
        df = pd.DataFrame(data={'toll_plaza_id': [toll_plaza_id_derived]})
    plaza_frames.append(df)

# Concatenate once at the end instead of growing the frame inside the loop
# with the private DataFrame._append.
enriched_df = pd.concat(plaza_frames, ignore_index=True) if plaza_frames else pd.DataFrame()

https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=236
https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=258
https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311
NoSuchElementException
No data received for https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311
https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=5939


In [93]:
enriched_df

Unnamed: 0,toll_plaza_id,Stretch,Tollable Length,Fee Effective Date,Due Date of Toll Revision,Concessions,Rest Areas,Truck Lay byes,Static Weigh Bridge,Announcement,...,4 to 6 Axle_Monthly Pass,4 to 6 Axle_Commercial Vehicle Registered within the district of plaza,HCM/EME_Single Journey,HCM/EME_Return Journey,HCM/EME_Monthly Pass,HCM/EME_Commercial Vehicle Registered within the district of plaza,7 or more Axle_Single Journey,7 or more Axle_Return Journey,7 or more Axle_Monthly Pass,7 or more Axle_Commercial Vehicle Registered within the district of plaza
0,236,Vishakhapatnam - Ankapalli [Km 2.837 to &Km 395.870 to Km358.00(New Chainage From Km 700.544 to Km 740.255)],Km 40.707 Km(s),03-Jun-2024,31-Mar-2025,"The rates for monthly pass applicable for local non-commercial vehicle residing within a distance of 20 km from the toll plaza for the year 2024-25 shall be Rs.340/-, subject to revision every year as per the provisions of the 2008 Amended Fee Rules.",0,0.0,0,Toll rates due for revision w.e.f : 31 Mar 2025,...,11450.0,170.0,345.0,515.0,11450.0,170.0,420.0,625.0,13935.0,210.0
1,258,Hyderabad Bangalore (km 211.000 to km 462.164),84.000 Km(s),03-Jun-2024,31-Mar-2025,"Monthly pass applicable local non-commercial vehicles Residing with in radius of 20 km :Rs.340/-.The Concessionaire shall issue 50 or more one way tickets at a discounted rate equivalent to 2/3rd of the fee payable during the period of one month form the date of payment of fee. The amount indicated is for monthly pass for 50 one way trips. However, if any person asks more than 50 trips during a period of one month, the Concessionaire shall be required to issue such tickets at a discounted...",,,02 Nos.,Toll rates due for revision w.e.f : 31 Mar 2025,...,23835.0,355.0,715.0,1070.0,23835.0,355.0,870.0,1305.0,29015.0,435.0
2,311,,,,,,,,,,...,,,,,,,,,,
3,5939,Delhi to Agra,KM 63.220 Km(s),03-Jun-2024,31-Mar-2025,Toll rates due for revision w.e.f : 31 Mar 2025,Yes,,Yes,Toll rates due for revision w.e.f : 31 Mar 2025,...,16405.0,,490.0,740.0,16405.0,,600.0,900.0,19970.0,


In [83]:
toll_plaza_urls[168:170]

[]

In [82]:
# Single-URL trial of the enrichment step, using plaza 311 (the page that
# raises NoSuchElementException in the recorded output below).
test_url='https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311'
# The plaza id is whatever follows "=" in the URL.
toll_plaza_id_derived=int(test_url.split("=")[1])
df=get_html_page_objects(test_url)
if not df.empty:
    # Put the id in the first column of the scraped row.
    df.insert(0,'toll_plaza_id',toll_plaza_id_derived)
else:
    print(f"No data received for {test_url}")
    # Nothing scraped: keep a placeholder row holding only the id.
    df = pd.DataFrame(data = {'toll_plaza_id':[toll_plaza_id_derived]})

NoSuchElementException
No data received for https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311


In [78]:
df

Unnamed: 0,toll_plaza_id
0,311


In [79]:
df.T

Unnamed: 0,0
toll_plaza_id,311


In [48]:
# Load plaza 311 directly to see which step fails for it.
driver.get('https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=311')
try:
    # The toll information block is looked up by the CSS selector div.PA15.
    html_content=driver.find_element(By.CSS_SELECTOR,'div.PA15').get_attribute("outerHTML")
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
       # Only the exception class name is printed; html_content and soup keep
       # whatever value an earlier cell assigned to them.
       print(type(e).__name__)

NoSuchElementException


# Toll price

In [334]:
driver.get("https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=236")

In [335]:
# Grab the outer HTML of the div.PA15 block from the page currently loaded in
# `driver`; on failure print the exception class and its message.
try:
    html_content=driver.find_element(By.CSS_SELECTOR,'div.PA15').get_attribute("outerHTML")
except Exception as e:
    print(type(e).__name__)
    print(str(e))

In [336]:
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
# Find all tables
tables = soup.find_all("table", class_="tollinfotbl")

In [337]:
# tables

# Extract Toll Information Table

In [338]:
toll_data = []
headers = [th.text.strip() for th in tables[0].find_all("th")]

In [339]:
headers=headers[:5]

In [340]:
# Collect the stripped cell texts of every data row of the first table.
# `toll_data` is created in a separate cell, so re-running only this cell
# appends the same rows again.
for row in tables[0].find_all("tr")[1:]:  # Skip header row
    # Data rows may mix <th> and <td> cells, so both are collected.
    cols = row.find_all(["th", "td"])
    if len(cols) >0:
        toll_data.append([col.text.strip() for col in cols])

In [341]:
# toll_data

In [344]:
df_toll_price = pd.DataFrame(toll_data, columns=headers)

In [345]:
df_toll_price

Unnamed: 0,Type of vehicle,Single Journey,Return Journey,Monthly Pass,Commercial Vehicle Registered within the district of plaza
0,Car/Jeep/Van,135.0,200.0,4490.0,65.0
1,LCV,220.0,325.0,7255.0,110.0
2,Bus/Truck,455.0,685.0,15200.0,230.0
3,Upto 3 Axle Vehicle,495.0,745.0,16580.0,250.0
4,4 to 6 Axle,715.0,1070.0,23835.0,355.0
5,HCM/EME,715.0,1070.0,23835.0,355.0
6,7 or more Axle,870.0,1305.0,29015.0,435.0


In [252]:
# df.at[4, 'B'] = 10
# df.loc[5].at['B']

In [264]:
rows=list(df_toll_price.index)
rows

[0, 1, 2, 3, 4, 5, 6]

In [270]:
cols=list(df_toll_price.columns)[1:]
cols

['Single Journey',
 'Return Journey',
 'Monthly Pass',
 'Commercial Vehicle Registered within the district of plaza']

In [277]:
# Flatten the fee table: one "<vehicle type>_<fee column>" name and one value
# for every (row, fee column) pair, in row-major order.
column_names=[]
row_values=[]
for row_idx in rows:
    for col_idx in cols:
        # `col_idx` is a column label taken from `cols`, not a position.
        col_name = f"{df_toll_price.at[row_idx,'Type of vehicle']}_{col_idx}"
        column_names.append(col_name)
        row_values.append(df_toll_price.at[row_idx,col_idx])
        # print(col_name,df_toll_price.at[row_idx,col_idx])
    # print("")    

In [293]:
toll_price_df=pd.DataFrame([row_values],columns=column_names)

In [294]:
toll_price_df

Unnamed: 0,Car/Jeep/Van_Single Journey,Car/Jeep/Van_Return Journey,Car/Jeep/Van_Monthly Pass,Car/Jeep/Van_Commercial Vehicle Registered within the district of plaza,LCV_Single Journey,LCV_Return Journey,LCV_Monthly Pass,LCV_Commercial Vehicle Registered within the district of plaza,Bus/Truck_Single Journey,Bus/Truck_Return Journey,...,4 to 6 Axle_Monthly Pass,4 to 6 Axle_Commercial Vehicle Registered within the district of plaza,HCM/EME_Single Journey,HCM/EME_Return Journey,HCM/EME_Monthly Pass,HCM/EME_Commercial Vehicle Registered within the district of plaza,7 or more Axle_Single Journey,7 or more Axle_Return Journey,7 or more Axle_Monthly Pass,7 or more Axle_Commercial Vehicle Registered within the district of plaza
0,65.0,95.0,2155.0,30.0,105.0,155.0,3485.0,50.0,220.0,330.0,...,11450.0,170.0,345.0,515.0,11450.0,170.0,420.0,625.0,13935.0,210.0


In [295]:
toll_price_df.T

Unnamed: 0,0
Car/Jeep/Van_Single Journey,65.0
Car/Jeep/Van_Return Journey,95.0
Car/Jeep/Van_Monthly Pass,2155.0
Car/Jeep/Van_Commercial Vehicle Registered within the district of plaza,30.0
LCV_Single Journey,105.0
LCV_Return Journey,155.0
LCV_Monthly Pass,3485.0
LCV_Commercial Vehicle Registered within the district of plaza,50.0
Bus/Truck_Single Journey,220.0
Bus/Truck_Return Journey,330.0


# Extract Project Information Table

In [233]:
# Read the second "tollinfotbl" table as (parameter, value) pairs, keeping
# only rows that have exactly two <td> cells.
project_data = []
for row in tables[1].find_all("tr"):
    cols = row.find_all("td")
    if len(cols) == 2:
        key = cols[0].text.strip()
        value = cols[1].text.strip()
        project_data.append((key, value))
df_project_info = pd.DataFrame(project_data, columns=["Parameter", "Value"])
# Pivot to a single wide row: transpose, promote the row of parameter names
# to the header, then drop that row.
df_project_info=df_project_info.T.reset_index(drop=True)
df_project_info.columns=df_project_info.iloc[0]
df_project_info=df_project_info[1:].reset_index(drop=True)


In [234]:
df_project_info

Unnamed: 0,Date of fee notification,Commercial Operation Date,Fee Rule,Capital Cost of Project (in Rs. Cr.),Cumulative Toll Revenue (in Rs. Cr.),Concessions Period,Design Capacity (PCU),Traffic (PCU/day),Target Traffic (PCU/day),Name of Concessionaire / OMT Contractor,Name / Contact Details of Incharge
0,26-Jul-2012 (Sr No. - S.O.1709(E)),01-Dec-2001,2008 as amended with transition,63.54,176.66 (With Discounting) (For Entire Project Stretch) As on :31-Dec-2016,01.11.16 - 31.01.17,40000,52033 As on : 26-May-2017,48811 As on : 31-Dec-2017,Uday Kiran Chitturi,Srinu / 8130006277


In [235]:
# df_project_info=df_project_info.T.reset_index(drop=True)
# df_project_info.columns=df_project_info.iloc[0]
# df_project_info=df_project_info[1:].reset_index(drop=True)
# df_project_info

In [22]:
df_project_info = pd.read_csv('project_information.csv')

In [23]:
df_project_info

Unnamed: 0,Parameter,Value
0,Date of fee notification,20-Jun-2011 (Sr No. - S.O.1423(E))
1,Commercial Operation Date,22-Nov-11
2,Fee Rule,2008
3,Capital Cost of Project (in Rs. Cr.),2550
4,Cumulative Toll Revenue (in Rs. Cr.),318.86 (For Entire Project Stretch) As on :31-Dec-2016
5,Concessions Period,18.11.2011-20.11.2041
6,Design Capacity (PCU),120000
7,Traffic (PCU/day),24034 As on : 31-Mar-2017
8,Target Traffic (PCU/day),24769 As on : 31-Mar-2014
9,Name of Concessionaire / OMT Contractor,Simhapuri Expressway Limited


In [32]:
toll_plazas_df.head()

Unnamed: 0,Sr No.,State,NH-No.,Toll Plaza Name,Toll Plaza Location,Section / Stretch,TollPlazaID
0,1,Andhra Pradesh,16,Aganampudi,Km 728.055,Vishakhapatnam - Ankapalli [Km 2.837 to &Km 395.870 to Km358.00(New Chainage From Km 700.544 to Km 740.255)],236
1,2,Andhra Pradesh,7 (new 44),Amakathadu,Km 250.700,Hyderabad Bangalore (km 211.000 to km 462.164),258
2,3,Andhra Pradesh,NH-216,Annampalli,Annampalli,Gurajanapalli To pasarlapudi,5977
3,4,Andhra Pradesh,221,Badava,35.800,Imbrahimpatnam to AP Telangana Border,4486
4,5,Andhra Pradesh,NH40,Bandaplli,119.945 Bandaplli,Rayachoty Kadapa Section,5697


In [33]:
toll_plaza_id

258

In [36]:
df_project_info =df_project_info.T

In [37]:
df_project_info.columns = df_project_info.iloc[0]  # Set first row as column names
df_project_info = df_project_info[1:].reset_index(drop=True)  # Drop the first row and reset index
df_project_info

Parameter,Date of fee notification,Commercial Operation Date,Fee Rule,Capital Cost of Project (in Rs. Cr.),Cumulative Toll Revenue (in Rs. Cr.),Concessions Period,Design Capacity (PCU),Traffic (PCU/day),Target Traffic (PCU/day),Name of Concessionaire / OMT Contractor,Name / Contact Details of Incharge
0,20-Jun-2011 (Sr No. - S.O.1423(E)),22-Nov-11,2008,2550,318.86 (For Entire Project Stretch) As on :31-Dec-2016,18.11.2011-20.11.2041,120000,24034 As on : 31-Mar-2017,24769 As on : 31-Mar-2014,Simhapuri Expressway Limited,TRILOK SINGH / 8130006335


In [38]:
df_project_info['TollPlazaID']= toll_plaza_id
df_project_info

Parameter,Date of fee notification,Commercial Operation Date,Fee Rule,Capital Cost of Project (in Rs. Cr.),Cumulative Toll Revenue (in Rs. Cr.),Concessions Period,Design Capacity (PCU),Traffic (PCU/day),Target Traffic (PCU/day),Name of Concessionaire / OMT Contractor,Name / Contact Details of Incharge,TollPlazaID
0,20-Jun-2011 (Sr No. - S.O.1423(E)),22-Nov-11,2008,2550,318.86 (For Entire Project Stretch) As on :31-Dec-2016,18.11.2011-20.11.2041,120000,24034 As on : 31-Mar-2017,24769 As on : 31-Mar-2014,Simhapuri Expressway Limited,TRILOK SINGH / 8130006335,258


## Concessions

In [410]:
concession_section = soup.find("h2", string="Concessions")

In [413]:
concession_section.findParent()

<div class="cant-notifi" style="border:1px solid #e4e4e4; width :350px;"><h2>Concessions</h2><ul><li>Monthly pass applicable local non-commercial vehicles  Residing with in radius of 20 km  :Rs.340/-</li><li>The Concessionaire shall issue  50 or more one way tickets at a discounted rate equivalent to 2/3rd of the fee payable during the period of one month form the date of payment of fee. The amount indicated is for monthly pass for 50 one way trips. However, if any person asks more than 50 trips during a period of one month, the Concessionaire shall be required to issue such tickets at a discounted rate as above.</li></ul> </div>

In [None]:
concession_section.find_next_siblings()

In [414]:
concession_section.find_next_siblings()

[<ul><li>Monthly pass applicable local non-commercial vehicles  Residing with in radius of 20 km  :Rs.340/-</li><li>The Concessionaire shall issue  50 or more one way tickets at a discounted rate equivalent to 2/3rd of the fee payable during the period of one month form the date of payment of fee. The amount indicated is for monthly pass for 50 one way trips. However, if any person asks more than 50 trips during a period of one month, the Concessionaire shall be required to issue such tickets at a discounted rate as above.</li></ul>]

In [403]:
# List-based variant: read the <li> items of the <ul> after the "Concessions"
# heading and join them into one string.
concession_section = soup.find("h2", string="Concessions")
concessions = []
if concession_section:
    # find_next("ul") returns None when no list follows, which would make
    # the chained find_all call raise AttributeError.
    concession_list = concession_section.find_next("ul").find_all("li")
    for item in concession_list:
        concessions.append(item.text.strip())
# The items are joined with "." and no space.
concessions=".".join(concessions)     
df_concessions = pd.DataFrame([concessions], columns=["Concessions"])
df_concessions

Unnamed: 0,Concessions
0,"Monthly pass applicable local non-commercial vehicles Residing with in radius of 20 km :Rs.340/-.The Concessionaire shall issue 50 or more one way tickets at a discounted rate equivalent to 2/3rd of the fee payable during the period of one month form the date of payment of fee. The amount indicated is for monthly pass for 50 one way trips. However, if any person asks more than 50 trips during a period of one month, the Concessionaire shall be required to issue such tickets at a discounted..."


In [415]:
def get_concession_table(soup):
    """Extract the concession text that follows the "Concessions" heading.

    The text is read from the next <span>; when there is none, the <li>
    items of the next <ul> are joined with "." instead.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        A single ``Concessions`` column with one row, or a frame with that
        column and no rows when the heading, span and list are all absent
        (previously these cases raised UnboundLocalError / AttributeError).
    """
    concession_section = soup.find("h2", string="Concessions")
    if not concession_section:
        return pd.DataFrame(columns=["Concessions"])

    # NOTE(review): find_next searches the rest of the document, not just this
    # section, so a <span> belonging to a later section could be picked up
    # here when the concessions themselves are in a <ul> - confirm on the page.
    concession_span = concession_section.find_next("span")
    if concession_span is not None:
        return pd.DataFrame({'Concessions': [concession_span.text]})

    concession_list = concession_section.find_next("ul")
    if concession_list is None:
        return pd.DataFrame(columns=["Concessions"])
    concessions = ".".join(item.text.strip() for item in concession_list.find_all("li"))
    return pd.DataFrame([concessions], columns=["Concessions"])

In [350]:
# Span-based lookup of the concession text for the page held in `soup`.
concession_section = soup.find("h2", string="Concessions")
try:
    concession_table = concession_section.find_next("span")
    concession_data = {'Concessions':[concession_table.text]}
except AttributeError as e:
    # Raised when the heading is missing or no <span> follows it. The handler
    # had no body before, which made the whole cell a SyntaxError.
    print(f"AttributeError: {e}")
    concession_data = {'Concessions': []}

AttributeError: 'NoneType' object has no attribute 'text'

In [63]:
pd.DataFrame(concession_data)

Unnamed: 0,Concessions
0,"The rates for monthly pass applicable for local non-commercial vehicle residing within a distance of 20 km from the toll plaza for the year 2024-25 shall be Rs.340/-, subject to revision every year as per the provisions of the 2008 Amended Fee Rules."


In [None]:
def get_concession_table(soup):
    """Extract the concession text that follows the "Concessions" heading.

    The text is read from the next <span>; when there is none, the <li>
    items of the next <ul> are joined with "." instead. This is the last
    definition of the name in the notebook, so it is the one in effect after
    a top-to-bottom run and therefore has to handle both page layouts.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the toll plaza information block.

    Returns
    -------
    pandas.DataFrame
        A single ``Concessions`` column with one row, or a frame with that
        column and no rows when the heading, span and list are all absent
        (previously these cases raised UnboundLocalError / AttributeError).
    """
    concession_section = soup.find("h2", string="Concessions")
    if not concession_section:
        return pd.DataFrame(columns=["Concessions"])

    concession_span = concession_section.find_next("span")
    if concession_span is not None:
        return pd.DataFrame({'Concessions': [concession_span.text]})

    concession_list = concession_section.find_next("ul")
    if concession_list is None:
        return pd.DataFrame(columns=["Concessions"])
    concessions = ".".join(item.text.strip() for item in concession_list.find_all("li"))
    return pd.DataFrame([concessions], columns=["Concessions"])

# Extract Facilities Available Near Toll Plaza

In [188]:
# Scratch version of the facilities extraction for the page held in `soup`.
facilities_section = soup.find("h2", string="Facilities available near Toll Plaza")
facilities_data = []
if facilities_section:
    # Every table inside the <div> after the heading is scanned.
    facilities_table = facilities_section.find_next("div").find_all("table")
    for table in facilities_table:
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            # Only rows with exactly two cells (name, value) are kept.
            if len(cols) == 2:
                # key = cols[0].text.strip()
                key=cols[0].text.replace(":","").strip()
                value = cols[1].text.strip()
                facilities_data.append((key, value))

df_facilities = pd.DataFrame(facilities_data, columns=["Facility", "Availability"])
# Pivot to one wide row: transpose, promote the row of facility names to the
# header, then drop that row.
df_facilities=df_facilities.T
df_facilities.reset_index(drop=True, inplace=True)
df_facilities.columns = df_facilities.iloc[0] 
df_facilities=df_facilities[1:].reset_index(drop=True)
df_facilities


Unnamed: 0,Rest Areas,Truck Lay byes,Static Weigh Bridge
0,0,0,0


# Extract Announcements

In [195]:
# Scratch version of the announcement extraction: one row per <li> item of
# the <ul> that follows the "Announcement" heading.
announcement_section = soup.find("h2", string="Announcement")
announcements = []
if announcement_section:
    # find_next("ul") returns None when no list follows, which would make
    # the chained find_all call raise AttributeError.
    announcement_list = announcement_section.find_next("ul").find_all("li")
    for item in announcement_list:
        announcements.append(item.text.strip())
df_announcements = pd.DataFrame(announcements, columns=["Announcement"])        

In [196]:
df_announcements

Unnamed: 0,Announcement
0,Toll rates due for revision w.e.f : 31 Mar 2025


# Extract Important Information

In [209]:
# Scratch version of the "Important Information" extraction for `soup`.
important_info_section = soup.find("h2", string="Important Information")
important_info_data = []
if important_info_section:
    info_table = important_info_section.find_next("table")
    for row in info_table.find_all("tr"):
        cols = row.find_all("td")
        # Only rows with exactly two cells (category, details) are kept.
        if len(cols) == 2:
            # Colons are removed from the label before it becomes a column name.
            key=cols[0].text.replace(":","").strip()
            value = cols[1].text.strip()
            important_info_data.append((key, value))
df_important_info = pd.DataFrame(important_info_data, columns=["Category", "Details"])
# Pivot to one wide row: transpose, promote the row of category names to the
# header, then drop that row.
df_important_info=df_important_info.T.reset_index(drop=True)
df_important_info.columns=df_important_info.iloc[0]
df_important_info=df_important_info.iloc[1:]
df_important_info.reset_index(drop=True,inplace=True)
df_important_info

Unnamed: 0,Helpline No.,Emergency Services,Nearest Police Station,Highway Administrator (Project Director),Project Implementation Unit(PIU),Regional Office(RO),Representative of Consultant,Representative of Concessionaire,Nearest Hospital(s)
0,"Crane-7993084799, Ambulance -7416511921, Route Patrol-9391188526",9,/,Sh. Prabhat Ranjan / 8130006265,PIU Vishakapatnam,RO Vijaywada,,Mr Ravindra / 7659859650,"1)Sri Satya Sai Hospital Beside SBI Aganampudi Visakhapatnam, Andhra Pradesh Ph.: 098486 39329, 2)RK Hospital Gajuwaka Visakhapatnam, Andhra Pradesh 530044"


# Additional information 

In [14]:
p_tags = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
data = {"Stretch": [], "Tollable Length": [], "Fee Effective Date": [], "Due Date of Toll Revision": []}

NameError: name 'soup' is not defined

In [200]:
# Extract required details
for text in p_tags:
    if "Stretch :" in text:
        data["Stretch"].append(text.split("Stretch :")[1].split("Tollable Length :")[0].strip())
    if "Tollable Length :" in text:
        data["Tollable Length"].append(text.split("Tollable Length :")[1].strip())
    if "Fee Effective Date :" in text:
        parts = text.split("/")
        data["Fee Effective Date"].append(parts[0].split("Fee Effective Date :")[1].strip())
        data["Due Date of Toll Revision"].append(parts[1].split("Due date of toll revision :")[1].strip())
# Convert to DataFrame
add_info_df = pd.DataFrame(data)

In [201]:
add_info_df

Unnamed: 0,Stretch,Tollable Length,Fee Effective Date,Due Date of Toll Revision
0,Chilkaluripet - Nellore (Km 1182.802 - Km 1366.547),70.945 Km(s),03-Jun-2024,31-Mar-2025
