In [1]:
import csv
import datetime
import logging
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Path of the CSV file that the scraped rows and the geocoding results are written to.
# NOTE(review): 'tool' looks like a typo for 'toll'; left unchanged because this
# string is the on-disk output path.
OUTPUT_FILE_NAME = 'tool_plazas.csv'

#  Set the logger 

In [3]:
# Configure the root logger in one call.
# basicConfig() is a no-op when the root logger already has handlers unless
# force=True is given, so a separate level-only call can be silently ignored.
# Passing level, format and datefmt together with force=True guarantees that
# all three settings are applied.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)
# logging.info("This is an info message.")

In [4]:
# Record the wall-clock start so the elapsed time can be reported at the end of the run.
start_time = datetime.datetime.now()

# Set up Selenium WebDriver 

In [5]:
# Build a Chrome WebDriver session; `driver` is the browser handle used by the
# scraping cells that follow.
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode (no visible browser window)
driver = webdriver.Chrome(options=options)

# Open the NHAI website for web scraping

In [6]:
# Load the "toll plazas at a glance" page in the headless browser.
url = "https://tis.nhai.gov.in/tollplazasataglance.aspx?language=en#"
logging.info("Launching the webpage")
driver.get(url)
# Implicit wait applies to the element lookups made after this point: each
# lookup may poll for up to 10 seconds before giving up.
driver.implicitly_wait(10)

2025-04-24 16:39:39 - INFO - Launching the webpage


# Locate the table containing the list of toll plazas on the webpage

In [7]:
# Container element with id 'tollList'; the row lookup below expects the table inside it.
table_element=driver.find_element(By.ID, 'tollList')

# Get all rows of the table containing data

In [8]:
# Collect every <tr> of the toll plaza table (header row first, then data rows).
# NOTE(review): the XPath starts with `//`, so it is evaluated against the whole
# document rather than relative to table_element; it still selects the intended
# rows because the path itself names the div by id.
rows = table_element.find_elements(By.XPATH,"//div[@id='tollList']/table/tbody/tr")

# Split the table into the header row and the data rows

In [9]:
# Slicing (rather than indexing) keeps both names as lists, so an empty `rows`
# yields two empty lists instead of raising IndexError.
table_header = rows[0:1] # header: list holding only the first row
table_data = rows[1:] # data: every row after the first

# Function to write data to the file line by line

In [10]:
# Start from a clean slate: rows are appended to the output file, so a file
# left over from a previous run is removed first.
output_already_present = os.path.exists(OUTPUT_FILE_NAME)
if not output_already_present:
    logging.info(f"The file does not exist.Creating {OUTPUT_FILE_NAME}")
else:
    logging.info(f"The file already exist.Deleting {OUTPUT_FILE_NAME}")
    os.remove(OUTPUT_FILE_NAME)

2025-04-24 16:39:51 - INFO - The file already exist.Deleting tool_plazas.csv


In [11]:
def write_to_file(data, file_name=None):
    """Append one record to the CSV output file.

    The row is written with ``csv.writer`` so that fields containing commas,
    double quotes or line breaks are quoted/escaped instead of corrupting the
    column layout, which a plain ``','.join`` cannot guarantee.

    Args:
        data: Iterable of field values that make up a single row.
        file_name: Destination path. When omitted, the module-level
            ``OUTPUT_FILE_NAME`` is used.
    """
    target = OUTPUT_FILE_NAME if file_name is None else file_name
    # newline="" is what the csv module expects for files it writes to; the
    # file is opened in append mode with the platform default text encoding.
    with open(target, "a", newline="") as file:
        csv.writer(file).writerow(data)

# Get the header of the table

In [12]:
logging.info("Getting the table header")

2025-04-24 16:39:56 - INFO - Getting the table header


In [13]:
# table_header holds at most one row: collect the caption of each <th> cell,
# add an extra 'TollPlazaID' column name, and write the result as one line.
for header_row in table_header:
    header_labels = [th.text for th in header_row.find_elements(By.TAG_NAME,'th')]
    header_labels.append('TollPlazaID')
    logging.debug(header_labels)
    write_to_file(header_labels)

# Extract each row of the table one at a time

In [14]:
logging.info("Getting the table rows and writing to the file.")

2025-04-24 16:40:06 - INFO - Getting the table rows and writing to the file.


In [15]:
# Turn every data row into a list of strings plus the plaza id.
# The id is parsed out of the row link's onclick attribute: the text between
# the first "(" and the following ")".
extracted_rows = []
try:
    for row in table_data:
        cells = row.find_elements(By.TAG_NAME, 'td')
        onclick = row.find_element(By.XPATH, "td/a").get_dom_attribute('onclick')
        toll_plaza_id = onclick.split("(")[1].split(")")[0]
        # Commas are swapped for ';' so the text cannot collide with the CSV
        # delimiter; a later cell converts them back.
        temp_rows = [cell.text.replace(',', ';') for cell in cells]
        temp_rows.append(toll_plaza_id)
        logging.info(temp_rows)
        extracted_rows.append(temp_rows)
finally:
    # This is the last step that needs the browser. Quit it even when the
    # extraction fails so the Chrome session is not left running.
    # To scrape again, re-run the notebook from the cell that creates `driver`.
    driver.quit()

2025-04-24 16:40:06 - INFO - ['1', 'Andhra Pradesh', '16', 'Aganampudi', 'Km 728.055', 'Vishakhapatnam - Ankapalli [Km 2.837 to &Km 395.870 to Km358.00(New Chainage From Km 700.544 to Km 740.255)]', '236']
2025-04-24 16:40:06 - INFO - ['2', 'Andhra Pradesh', '7 (new 44)', 'Amakathadu', 'Km 250.700', 'Hyderabad Bangalore (km 211.000 to km 462.164)', '258']
2025-04-24 16:40:06 - INFO - ['3', 'Andhra Pradesh', 'NH-216', 'Annampalli', 'Annampalli', 'Gurajanapalli To pasarlapudi', '5977']
2025-04-24 16:40:07 - INFO - ['4', 'Andhra Pradesh', '221', 'Badava', '35.800', 'Imbrahimpatnam to AP Telangana Border', '4486']
2025-04-24 16:40:07 - INFO - ['5', 'Andhra Pradesh', 'NH40', 'Bandaplli', '119.945 Bandaplli', 'Rayachoty Kadapa Section', '5697']
2025-04-24 16:40:07 - INFO - ['6', 'Andhra Pradesh', 'NH42', 'Bandlapalli', 'Bandlapalli village', 'Madanapalli to punganur to palmner', '5952']
2025-04-24 16:40:07 - INFO - ['7', 'Andhra Pradesh', 'NH 67', 'Basapuram', 'Km 604.450', 'Km 589.000 to Km

# Write rows to the output file

In [16]:
# Append each extracted record to the output file, one line per record.
for record in extracted_rows:
    write_to_file(record)

In [17]:
logging.info("Collecting list of all toll plazas complete")

2025-04-24 16:41:45 - INFO - Collecting list of all toll plazas complete


# Replace all occurrences of `;` with `,`: the `;` placeholders were introduced so the text would not clash with the comma-separated output format

In [18]:
# Reload the scraped file, turn every ';' in the cell text back into ',' and
# save the result over the same file (without the index column).
# NOTE(review): this also rewrites any ';' that was present in the page text
# itself, not only the placeholders introduced during extraction.
df = pd.read_csv(OUTPUT_FILE_NAME, encoding='cp1252').replace(";", ",", regex=True)
df.to_csv(OUTPUT_FILE_NAME, index=False)

In [19]:
# df.head()

# Add latitude and longitude for each toll plaza using the Google Maps Geocoding API

In [20]:
import requests

In [21]:
# Read the Google Maps API key from the GOOGLE_API_KEY environment variable so
# that no credential is stored in the notebook.
# NOTE(review): a literal key was previously committed in this cell; treat it
# as exposed and rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    logging.warning("GOOGLE_API_KEY is not set; geocoding requests are expected to fail.")

In [22]:
def get_coordinates(address):
    """Geocode an address with the Google Maps Geocoding API.

    Args:
        address: Free-form address text to look up.

    Returns:
        A ``(latitude, longitude, place_id, address_partial_match)`` tuple built
        from the first result when the response status is ``'OK'``.
        ``address_partial_match`` is ``False`` when that result has no
        ``partial_match`` field. ``(None, None, None, None)`` is returned when
        the request fails or the status is anything other than ``'OK'``.
    """
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    try:
        # Passing `params` lets requests percent-encode the address, so names
        # containing '&', '#', '+' or spaces cannot break the query string.
        # The timeout stops one stalled call from hanging the whole run.
        response = requests.get(
            endpoint,
            params={"address": address, "key": GOOGLE_API_KEY},
            timeout=10,
        ).json()
    except (requests.RequestException, ValueError) as error:
        # Log only the exception type: the exception message can contain the
        # request URL, which includes the API key.
        logging.warning("Geocoding request failed for %r (%s)", address, type(error).__name__)
        return None, None, None, None

    if response.get('status') != 'OK':
        logging.warning(
            "Geocoding returned status %s for %r: %s",
            response.get('status'),
            address,
            response.get('error_message', ''),
        )
        logging.debug(response)
        return None, None, None, None

    best_match = response['results'][0]
    location = best_match['geometry']['location']
    latitude = location['lat']
    longitude = location['lng']
    place_id = best_match['place_id']
    # Only read when present; default to False otherwise.
    address_partial_match = best_match.get('partial_match', False)
    return latitude, longitude, place_id, address_partial_match

In [23]:
# get_coordinates("Aganampudi toll plaza,Andhra Pradesh") #response['results'][0]['geometry']['location']['lat']

In [24]:
# get_coordinates("Aganampudi toll plaza,Andhra Pradesh")  
#{'error_message': 'This API key is not authorized to use this service or API.', 'results': [], 'status': 'REQUEST_DENIED'}

In [25]:
# {'results': [], 'status': 'ZERO_RESULTS'}

In [26]:
df.shape[0]

1051

In [27]:
# Geocode every plaza and store the result on its row.
# The lookup text is "<plaza name>, <state>"; " toll plaza" is appended to the
# name first unless it already mentions "toll" (case-insensitive).
for idx in range(len(df)):
    plaza_row = df.iloc[idx]
    state_name = plaza_row['State']
    toll_name = plaza_row['Toll Plaza Name']
    if "toll" not in toll_name.lower():
        toll_name = toll_name + " toll plaza"
    address = toll_name + ", "+state_name
    geocoded = get_coordinates(address)
    latitude, longitude, place_id, address_partial_match = geocoded
    print(idx, address,latitude, longitude,place_id,address_partial_match)
    result_columns = ('latitude', 'longitude', 'place_id', 'address_partial_match')
    for column_name, value in zip(result_columns, geocoded):
        df.loc[idx, column_name] = value

0 Aganampudi toll plaza, Andhra Pradesh 17.6854173 83.14995069999999 ChIJyaj65ttuOToRH26fq-li1fY False
1 Amakathadu toll plaza, Andhra Pradesh 15.4864765 77.9009433 ChIJG6ZfoqN_tjsR6hw94uU86QQ False
2 Annampalli toll plaza, Andhra Pradesh 16.6722283 82.1473895 ChIJcw8uBaz1NzoRMva9Ii5taRE False
3 Badava toll plaza, Andhra Pradesh 16.8504745 80.63358439999999 ChIJr0zTpNLaNToReLaGbihNfUU False
4 Bandaplli toll plaza, Andhra Pradesh 14.1311987 78.75668209999999 ChIJ2T3B-NwFszsRs_e_vnu63yw True
5 Bandlapalli toll plaza, Andhra Pradesh 14.1311987 78.75668209999999 ChIJ2T3B-NwFszsRs_e_vnu63yw False
6 Basapuram toll plaza, Andhra Pradesh 14.7083358 78.8723771 ChIJ4_R8zQRjszsRkEOmZw1aqqk False
7 Bathalapalli toll plaza, Andhra Pradesh 14.5001074 77.7966071 ChIJid-qWD5bsTsRNqTOBrQzN6Q False
8 Bellupada toll plaza, Andhra Pradesh 19.1113097 84.6992649 ChIJVyVbW_1pPToRAqCixwnTQPg False
9 Bollapalli toll plaza, Andhra Pradesh 15.8865647 80.07081560000002 ChIJ823aWrD1SjoRnBgUBxXInl4 False
10 Brahman

In [28]:
# get_coordinates("Surjapur toll plaza, West Bengal")

In [29]:
df.shape

(1051, 11)

In [30]:
# Save the frame, now including the geocoding columns, over the output file.
# No encoding is passed, so pandas' default text encoding is used for the file.
df.to_csv(OUTPUT_FILE_NAME,index=False)

In [31]:
# Reload the saved file as a round-trip check.
# The file was just written by DataFrame.to_csv, whose default encoding is
# UTF-8, so it must be decoded as UTF-8; reading it as cp1252 would garble any
# non-ASCII character.
df = pd.read_csv(OUTPUT_FILE_NAME, encoding='utf-8')

In [32]:
# Report when the run started and ended and how long it took in total.
end_time = datetime.datetime.now()
time_taken = end_time - start_time
# Named `time_format` rather than `format` so the built-in format() is not
# shadowed for the rest of the kernel session.
time_format = "%y-%m-%d %H:%M:%S"
print(f"Start Time = {start_time.strftime(time_format)}")
print(f"End Time = {end_time.strftime(time_format)}")
print(f"Total Execution time = {round(time_taken.total_seconds(),2)} sec")

Start Time = 25-04-24 16:39:24
End Time = 25-04-24 16:46:16
Total Execution time = 411.87 sec


In [36]:
# df.head()

In [35]:
# df.columns