In [1]:
import pandas as pd

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)  # Increased timeout

driver.get("https://www.redbus.in/online-booking/tsrtc/")

TSRTC = []

def scrape_page():
    # Locate elements  (container)
    routes = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "route_link")))

    # Loop through each route to extract details
    for route in routes:
        try:
            # Extract route name and link from the route element
            route_name_element = route.find_element(By.XPATH, ".//div[@class='route_details']")
            route_link_element = route.find_element(By.XPATH, ".//a")

            route_name = route_name_element.text
            route_link = route_link_element.get_attribute('href')

            # Append extracted data to list
            TSRTC.append({
                'route_name': route_name,
                'route_link': route_link
            })

        except Exception as e:
            print(f"An error occurred: {e}")
            continue

# Scrape data from the first 5 pages
for page_number in range(1, 4):
    scrape_page()
    if page_number < 3:  # Don't try to click next on the last page
        try:
            # Locate the pagination container
            pagination_container = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
            ))

            # Locate the next page button within the container
            next_page_button = pagination_container.find_element(
                By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{page_number + 1}"]'
            )

            # Ensure the next page button is in view
            actions = ActionChains(driver)
            actions.move_to_element(next_page_button).perform()
            time.sleep(1)  # Wait for a bit after scrolling

            # Log the action
            print(f"Clicking on page {page_number + 1}")

            # Click the next page button
            next_page_button.click()

            # Wait for the page number to update to the next page
            wait.until(EC.text_to_be_present_in_element(
                (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(page_number + 1)))

            # Log the successful page navigation
            print(f"Successfully navigated to page {page_number + 1}")

            # Wait for a short duration to ensure the next page loads completely
            time.sleep(3)
        except Exception as e:
            print(f"An error occurred while navigating to page {page_number + 1}: {e}")
            break

# Print the scraped data
for entry in TSRTC:
    print(entry)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(TSRTC)
df.to_csv("tsrtc_details.csv", index=False)

# Close the driver
driver.quit()

df.head()  # Display the first few rows of the DataFrame


Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
{'route_name': 'Khammam to Hyderabad\nFrom INR 281', 'route_link': 'https://www.redbus.in/bus-tickets/khammam-to-hyderabad'}
{'route_name': 'Hyderabad to Vijayawada\nFrom INR 396', 'route_link': 'https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada'}
{'route_name': 'Hyderabad to Khammam\nFrom INR 281', 'route_link': 'https://www.redbus.in/bus-tickets/hyderabad-to-khammam'}
{'route_name': 'Hyderabad to Srisailam\nFrom INR 390', 'route_link': 'https://www.redbus.in/bus-tickets/hyderabad-to-srisailam'}
{'route_name': 'Karimnagar to Hyderabad\nFrom INR 283', 'route_link': 'https://www.redbus.in/bus-tickets/karimnagar-to-hyderabad'}
{'route_name': 'Hyderabad to Nirmal\nFrom INR 376', 'route_link': 'https://www.redbus.in/bus-tickets/hyderabad-to-nirmal'}
{'route_name': 'Hyderabad to Mancherial\nFrom INR 429', 'route_link': 'https://www.redbus.in/bus-tickets/hyderabad-to-mancherial'}
{

Unnamed: 0,route_name,route_link
0,Khammam to Hyderabad\nFrom INR 281,https://www.redbus.in/bus-tickets/khammam-to-h...
1,Hyderabad to Vijayawada\nFrom INR 396,https://www.redbus.in/bus-tickets/hyderabad-to...
2,Hyderabad to Khammam\nFrom INR 281,https://www.redbus.in/bus-tickets/hyderabad-to...
3,Hyderabad to Srisailam\nFrom INR 390,https://www.redbus.in/bus-tickets/hyderabad-to...
4,Karimnagar to Hyderabad\nFrom INR 283,https://www.redbus.in/bus-tickets/karimnagar-t...


In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# Initialize the webdriver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 90)

# Define the CSV file path and read URLs
csv_file_path = r"C:\Users\sonur\OneDrive\Documents\vs code\Details\tsrtc_details.csv"
try:
    url_df = pd.read_csv(csv_file_path, on_bad_lines='skip')  # Skip bad lines to avoid errors
    urls = url_df['route_link'].tolist()  # Column name in the CSV is 'route_link'
    print(f"URLs found: {len(urls)}")
except Exception as e:
    print(f"Error reading CSV file: {e}")
    urls = []

# Define a list to store all bus details data
bus_details = []

# Function to scrape bus details from the URL
def scrape_bus_details(url):
    print(f"Scraping data from: {url}")
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    try:
        bus_containers = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".clearfix.bus-item-details")))
        for container in bus_containers:
            try:
                # Extract bus details
                bus_name = container.find_element(By.CSS_SELECTOR, '.travels.lh-24.f-bold.d-color').text.strip() or 'N/A'
                bus_type = container.find_element(By.CSS_SELECTOR, '.bus-type.f-12.m-top-16.l-color.evBus').text.strip() or 'N/A'
                departing_time = container.find_element(By.CSS_SELECTOR, '.dp-time.f-19.d-color.f-bold').text.strip() or 'N/A'
                duration = container.find_element(By.CSS_SELECTOR, '.dur.l-color.lh-24').text.strip() or 'N/A'
                reaching_time = container.find_element(By.CSS_SELECTOR, '.bp-time.f-19.d-color.disp-Inline').text.strip() or 'N/A'
                price = container.find_element(By.CSS_SELECTOR, '.fare.d-block span.f-19.f-bold').text.strip() or 'N/A'
                
                # Attempt to find the star rating element
                try:
                    star_rating = container.find_element(By.CSS_SELECTOR, '.rating-sec .rating span').text.strip() or 'N/A'
                except NoSuchElementException:
                    star_rating = 'N/A'
                
                seat_availability = container.find_element(By.CSS_SELECTOR, '.seat-left').get_attribute('innerText').strip() or 'N/A'
               
                bus_details.append({
                    "bus_name": bus_name,
                    "bus_type": bus_type,
                    "departure_time": departing_time,
                    "duration": duration,
                    "arrival_time": reaching_time,
                    "price": price,
                    "star_rating": star_rating,
                    "seat_availability": seat_availability
                })
            except NoSuchElementException as e:
                print(f"Element not found: {e}")
            except Exception as e:
                print(f"An unexpected error occurred while processing a bus container: {e}")
    except TimeoutException:
        print(f"Timed out waiting for bus containers on page: {url}")
    except Exception as e:
        print(f"An error occurred while scraping {url}: {e}")

# Iterate through each URL and scrape data
for url in urls:
    scrape_bus_details(url)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(bus_details)
df.to_csv("tsrtc_alldetails.csv", index=False)

# Print the scraped bus details
print(df.head() if not df.empty else "No data to display")

# Close the driver
driver.quit()


URLs found: 26
Scraping data from: https://www.redbus.in/bus-tickets/khammam-to-hyderabad
Timed out waiting for bus containers on page: https://www.redbus.in/bus-tickets/khammam-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-khammam
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-srisailam
Scraping data from: https://www.redbus.in/bus-tickets/karimnagar-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-nirmal
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-mancherial
Timed out waiting for bus containers on page: https://www.redbus.in/bus-tickets/hyderabad-to-mancherial
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-adilabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-karimnagar
Scraping data from: https://www.redbus.in/bus-tickets/kothagudem-to-hyderabad
Scraping data