In [4]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

def first_page():
    elements = driver.find_elements(By.CSS_SELECTOR, "a[class='D113_link']")
    return [(e.text, e.get_attribute('href')) for e in elements]

def second_page(link):
    driver.get(link)
    driver.maximize_window()
    time.sleep(2)  # Ensure page is fully loaded
    routes = [(r.text, r.get_attribute('href')) for r in driver.find_elements(By.CSS_SELECTOR, "a[class='route']")]

    # Go through all available pages if present
    pages = driver.find_elements(By.CSS_SELECTOR, "div[class='DC_117_pageTabs ']")
    for p in pages:
        ActionChains(driver).click(p).perform()
        time.sleep(3)
        new_routes = driver.find_elements(By.CSS_SELECTOR, "a[class='route']")
        routes.extend((r.text, r.get_attribute('href')) for r in new_routes)
    return routes

def final_page_scraping(name, link):
    driver.get(link)
    driver.maximize_window()
    time.sleep(1)
    
    # Click on buttons if necessary
    buttons = driver.find_elements(By.CSS_SELECTOR, "div[class='button']")
    for button in reversed(buttons):
        button.click()

    # Scroll to ensure all elements are loaded
    for _ in range(10):  # Adjust the range for longer scrolling
        driver.execute_script('window.scrollBy(0, 1000)')
        time.sleep(0.5)
    
    # Extract bus information
    bus_data = []
    bus_name = driver.find_elements(By.CSS_SELECTOR, "div[class='travels lh-24 f-bold d-color']")
    bus_type = driver.find_elements(By.CSS_SELECTOR, "div[class='bus-type f-12 m-top-16 l-color evBus']")
    bus_dept = driver.find_elements(By.CSS_SELECTOR, "div[class='dp-time f-19 d-color f-bold']")
    bus_dur = driver.find_elements(By.CSS_SELECTOR, "div[class='dur l-color lh-24']")
    bus_reach = driver.find_elements(By.CSS_SELECTOR, "div[class='column-five p-right-10 w-10 fl']")
    bus_star = driver.find_elements(By.CSS_SELECTOR, "div[class='column-six p-right-10 w-10 fl']")
    bus_price = driver.find_elements(By.CSS_SELECTOR, "div[class='fare d-block']")
    bus_seats = driver.find_elements(By.CSS_SELECTOR, "div[class='column-eight w-15 fl']")

    for i in range(len(bus_name)):
        bus_data.append([
            name, link, bus_name[i].text, bus_type[i].text, bus_dept[i].text,
            bus_dur[i].text, bus_reach[i].text.split('\n')[0], bus_star[i].text.split('\n')[0],
            bus_price[i].text, bus_seats[i].text.split('\n')[0]
        ])
    
    return bus_data

# Initialize WebDriver
driver = webdriver.Chrome()
driver.get('https://www.redbus.in/online-booking/rtc-directory')
driver.maximize_window()
time.sleep(5)

result = []
f_page = first_page()
for name, link in f_page:
    second_page_data = second_page(link)
    for route_name, route_link in second_page_data:
        bus_info = final_page_scraping(route_name, route_link)
        result.extend(bus_info)

# DataFrame creation and saving to Excel
columns = ['Bus Route Name', 'Bus route link', 'Bus Name', 'Bus Type', 'Departing time', 'Duration', 'Reaching Time', 'Rating', 'Price', 'Seats available']
df = pd.DataFrame(result, columns=columns).drop_duplicates()

# Clean and format the DataFrame
df['Departing time'] = pd.to_datetime(df['Departing time'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S')
df['Reaching Time'] = pd.to_datetime(df['Reaching Time'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S')
df['Price'] = df['Price'].str.replace('INR ', '', regex=False).astype(float)
df['Seats available'] = df['Seats available'].str.extract('(\d+)').astype(float)
df['Rating'] = pd.to_numeric(df['Rating'].str.replace('New', '').str.strip(), errors='coerce').fillna(0)

# Save to Excel
df.to_excel("output_redbus.xlsx", index=False)
print("Data extracted and saved to output_redbus.xlsx")

# Close the browser
driver.quit()


  df['Seats available'] = df['Seats available'].str.extract('(\d+)').astype(float)
  df['Departing time'] = pd.to_datetime(df['Departing time'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S')
  df['Reaching Time'] = pd.to_datetime(df['Reaching Time'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S')


Data extracted and saved to output_redbus.xlsx


In [5]:
df

Unnamed: 0,Bus Route Name,Bus route link,Bus Name,Bus Type,Departing time,Duration,Reaching Time,Rating,Price,Seats available
0,Udaipur to Jodhpur,https://www.redbus.in/bus-tickets/udaipur-to-j...,RSRTC - 193271,Express Non AC Seater 2+3,2024-10-04 19:30:00,06h 30m,2024-10-04 02:00:00,4.2,294.0,34.0
1,Udaipur to Jodhpur,https://www.redbus.in/bus-tickets/udaipur-to-j...,RSRTC - 189622,Express Non AC Seater 2+3,2024-10-04 22:00:00,07h 00m,2024-10-04 05:00:00,3.5,294.0,38.0
2,Udaipur to Jodhpur,https://www.redbus.in/bus-tickets/udaipur-to-j...,Shree Devnarayan Travels,Bharat Benz A/C Seater (2+1),2024-10-04 17:00:00,04h 30m,2024-10-04 21:30:00,4.5,550.0,3.0
3,Udaipur to Jodhpur,https://www.redbus.in/bus-tickets/udaipur-to-j...,Raj Travel,A/C Sleeper (2+1),2024-10-04 22:30:00,07h 00m,2024-10-04 05:30:00,4.7,560.0,12.0
4,Udaipur to Jodhpur,https://www.redbus.in/bus-tickets/udaipur-to-j...,M R Travels,Volvo Multi-Axle I-Shift B11R Semi Sleeper (2+2),2024-10-04 17:10:00,04h 45m,2024-10-04 21:55:00,4.5,612.0,25.0
...,...,...,...,...,...,...,...,...,...,...
9005,North Lakhimpur to Guwahati,https://www.redbus.in/bus-tickets/north-lakhim...,Yatra Travels,NON AC Seater/ Sleeper (2+1),2024-10-04 19:45:00,10h 30m,2024-10-04 06:15:00,4.2,500.0,26.0
9013,North Lakhimpur to Guwahati,https://www.redbus.in/bus-tickets/north-lakhim...,Yatra Travels,NON AC Seater/ Sleeper (2+1),2024-10-04 20:30:00,09h 30m,2024-10-04 06:00:00,3.9,550.0,19.0
9038,Guwahati to Golaghat,https://www.redbus.in/bus-tickets/guwahati-to-...,NAMBOR TRANSPORT,NON A/C Seater (2+1),2024-10-04 20:30:00,06h 45m,2024-10-04 03:15:00,3.5,500.0,13.0
9039,Guwahati to Golaghat,https://www.redbus.in/bus-tickets/guwahati-to-...,ZAMZAM TRAVELS,NON A/C Seater (2+1),2024-10-04 20:15:00,07h 15m,2024-10-04 03:30:00,3.9,500.0,14.0
