In [1]:
import pandas as pd

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)  # Increased timeout

driver.get("https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile")

APSRTC = []

def scrape_page():
    # Locate elements  (container)
    routes = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "route_link")))

    # Loop through each route to extract details
    for route in routes:
        try:
            # Extract route name and link from the route element
            route_name_element = route.find_element(By.XPATH, ".//div[@class='route_details']")
            route_link_element = route.find_element(By.XPATH, ".//a")

            route_name = route_name_element.text
            route_link = route_link_element.get_attribute('href')

            # Append extracted data to list
            APSRTC.append({
                'route_name': route_name,
                'route_link': route_link
            })

        except Exception as e:
            print(f"An error occurred: {e}")
            continue

# Scrape data from the first 5 pages
for page_number in range(1, 6):
    scrape_page()
    if page_number < 5:  # Don't try to click next on the last page
        try:
            # Locate the pagination container
            pagination_container = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
            ))

            # Locate the next page button within the container
            next_page_button = pagination_container.find_element(
                By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{page_number + 1}"]'
            )

            # Ensure the next page button is in view
            actions = ActionChains(driver)
            actions.move_to_element(next_page_button).perform()
            time.sleep(1)  # Wait for a bit after scrolling

            # Log the action
            print(f"Clicking on page {page_number + 1}")

            # Click the next page button
            next_page_button.click()

            # Wait for the page number to update to the next page
            wait.until(EC.text_to_be_present_in_element(
                (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(page_number + 1)))

            # Log the successful page navigation
            print(f"Successfully navigated to page {page_number + 1}")

            # Wait for a short duration to ensure the next page loads completely
            time.sleep(3)
        except Exception as e:
            print(f"An error occurred while navigating to page {page_number + 1}: {e}")
            break

# Print the scraped data
for entry in APSRTC:
    print(entry)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(APSRTC)
df.to_csv("apsrtc_details.csv", index=False)

# Close the driver
driver.quit()

df.head()  # Display the first few rows of the DataFrame


Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
Clicking on page 4
Successfully navigated to page 4
Clicking on page 5
Successfully navigated to page 5
{'route_name': 'Vijayawada to Hyderabad\nFrom INR 414', 'route_link': 'https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad'}
{'route_name': 'Hyderabad to Vijayawada\nFrom INR 367', 'route_link': 'https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada'}
{'route_name': 'Kakinada to Visakhapatnam\nFrom INR 239', 'route_link': 'https://www.redbus.in/bus-tickets/kakinada-to-visakhapatnam'}
{'route_name': 'Visakhapatnam to Kakinada\nFrom INR 239', 'route_link': 'https://www.redbus.in/bus-tickets/visakhapatnam-to-kakinada'}
{'route_name': 'Chittoor (Andhra Pradesh) to Bangalore\nFrom INR 186', 'route_link': 'https://www.redbus.in/bus-tickets/chittoor-andhra-pradesh-to-bangalore'}
{'route_name': 'Kadapa to Bangalore\nFrom INR 412', 'route_link': 'https://www.redbus.in/bus-tickets/

Unnamed: 0,route_name,route_link
0,Vijayawada to Hyderabad\nFrom INR 414,https://www.redbus.in/bus-tickets/vijayawada-t...
1,Hyderabad to Vijayawada\nFrom INR 367,https://www.redbus.in/bus-tickets/hyderabad-to...
2,Kakinada to Visakhapatnam\nFrom INR 239,https://www.redbus.in/bus-tickets/kakinada-to-...
3,Visakhapatnam to Kakinada\nFrom INR 239,https://www.redbus.in/bus-tickets/visakhapatna...
4,Chittoor (Andhra Pradesh) to Bangalore\nFrom I...,https://www.redbus.in/bus-tickets/chittoor-and...


In [8]:
import pandas as pd

# Load the CSV file containing route links
route_data = pd.read_csv('apsrtc_details.csv')

# Extract the route links into a list
route_links = route_data['route_link'].tolist()

# Check the first few links to confirm they are correct
print(route_links[:10])


['https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad', 'https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada', 'https://www.redbus.in/bus-tickets/kakinada-to-visakhapatnam', 'https://www.redbus.in/bus-tickets/visakhapatnam-to-kakinada', 'https://www.redbus.in/bus-tickets/chittoor-andhra-pradesh-to-bangalore', 'https://www.redbus.in/bus-tickets/kadapa-to-bangalore', 'https://www.redbus.in/bus-tickets/ananthapur-to-bangalore', 'https://www.redbus.in/bus-tickets/tirupathi-to-bangalore', 'https://www.redbus.in/bus-tickets/visakhapatnam-to-vijayawada', 'https://www.redbus.in/bus-tickets/ongole-to-hyderabad']


In [1]:
import pandas as pd

In [4]:
pip install --upgrade selenium

Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.4 MB 330.3 kB/s eta 0:00:29
   ---------------------------------------- 0.1/9.4 MB 825.8 kB/s eta 0:00:12
   - -------------------------------------- 0.3/9.4 MB 2.4 MB/s eta 0:00:04
   -- ------------------------------------- 0.6/9.4 MB 4.2 MB/s eta 0:00:03
   --- ------------------------------------ 0.8/9.4 MB 4.5 MB/s eta 0:00:02
   ----- ---------------------------------- 1.3/9.4 MB 6.1 MB/s eta 0:00:02
   ------- -------------------------------- 1.7/9.4 MB 6.3 MB/s eta 0:00:02
   -------- ------------------------------- 2.0/9.4 MB 6.4 MB/s eta 0:00:02
   --------- ------------------------------ 2.3/9.4 MB 6.4 MB/s eta 0:00:02
   ----------- -----------


[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# Initialize the webdriver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)

# Define the URL to scrape
url = "https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad"

# Define a list to store all bus details data
bus_details = []

# Function to scrape bus details from the URL
def scrape_bus_details(url):
    print(f"Scraping data from: {url}")
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    try:
        bus_containers = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".clearfix.bus-item-details")))
        for container in bus_containers:
            try:
                # Extract bus details
                bus_name = container.find_element(By.CSS_SELECTOR, '.travels.lh-24.f-bold.d-color').text.strip() or 'N/A'
                bus_type = container.find_element(By.CSS_SELECTOR, '.bus-type.f-12.m-top-16.l-color.evBus').text.strip() or 'N/A'
                departing_time = container.find_element(By.CSS_SELECTOR, '.dp-time.f-19.d-color.f-bold').text.strip() or 'N/A'
                duration = container.find_element(By.CSS_SELECTOR, '.dur.l-color.lh-24').text.strip() or 'N/A'
                reaching_time = container.find_element(By.CSS_SELECTOR, '.bp-time.f-19.d-color.disp-Inline').text.strip() or 'N/A'
                price = container.find_element(By.CSS_SELECTOR, '.fare.d-block span.f-19.f-bold').text.strip() or 'N/A'
                
                # Attempt to find the star rating element
                try:
                    star_rating = container.find_element(By.CSS_SELECTOR, '.rating-sec .rating span').text.strip() or 'N/A'
                except NoSuchElementException:
                    star_rating = 'N/A'
                
                seat_availability = container.find_element(By.CSS_SELECTOR, '.seat-left').get_attribute('innerText').strip() or 'N/A'
               
                bus_details.append({
                    "bus_name": bus_name,
                    "bus_type": bus_type,
                    "departure_time": departing_time,
                    "duration": duration,
                    "arrival_time": reaching_time,
                    "price": price,
                    "star_rating": star_rating,
                    "seat_availability": seat_availability
                })
            except NoSuchElementException as e:
                print(f"Element not found: {e}")
            except Exception as e:
                print(f"An unexpected error occurred while processing a bus container: {e}")
    except TimeoutException:
        print(f"Timed out waiting for bus containers on page: {url}")
    except Exception as e:
        print(f"An error occurred while scraping {url}: {e}")

# Scrape bus details from the URL
scrape_bus_details(url)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(bus_details)
df.to_csv("apsrtc_alldetails.csv", index=False)

# Print the scraped bus details
for entry in bus_details:
    print(entry)

# Close the driver
driver.quit()

df.head()  # Display the first few rows of the DataFrame


Scraping data from: https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad
{'bus_name': 'IntrCity SmartBus', 'bus_type': 'A/C Seater / Sleeper (2+1)', 'departure_time': '23:45', 'duration': '06h 15m', 'arrival_time': '06:00', 'price': '520', 'star_rating': '4.6', 'seat_availability': '33 Seats available'}
{'bus_name': 'FRESHBUS', 'bus_type': 'Electric A/C Seater (2+2)', 'departure_time': '05:00', 'duration': '07h 05m', 'arrival_time': '12:05', 'price': '450', 'star_rating': '4.7', 'seat_availability': '34 Seats available'}
{'bus_name': 'FRESHBUS', 'bus_type': 'Electric A/C Seater (2+2)', 'departure_time': '22:30', 'duration': '06h 50m', 'arrival_time': '05:20', 'price': '450', 'star_rating': '4.6', 'seat_availability': '25 Seats available'}
{'bus_name': 'N/A', 'bus_type': 'N/A', 'departure_time': 'N/A', 'duration': 'N/A', 'arrival_time': 'N/A', 'price': 'N/A', 'star_rating': 'N/A', 'seat_availability': '27 Seats available'}
{'bus_name': 'N/A', 'bus_type': 'N/A', 'departure_time': 'N

Unnamed: 0,bus_name,bus_type,departure_time,duration,arrival_time,price,star_rating,seat_availability
0,IntrCity SmartBus,A/C Seater / Sleeper (2+1),23:45,06h 15m,06:00,520.0,4.6,33 Seats available
1,FRESHBUS,Electric A/C Seater (2+2),05:00,07h 05m,12:05,450.0,4.7,34 Seats available
2,FRESHBUS,Electric A/C Seater (2+2),22:30,06h 50m,05:20,450.0,4.6,25 Seats available
3,,,,,,,,27 Seats available
4,,,,,,,,41 Seats available


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# Initialize the webdriver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)

# Define the CSV file path and read URLs
csv_file_path = r"C:\Users\sonur\OneDrive\Documents\vs code\Details\apsrtc_details.csv"
try:
    url_df = pd.read_csv(csv_file_path, on_bad_lines='skip')  # Skip bad lines to avoid errors
    urls = url_df['route_link'].tolist()  # Column name in the CSV is 'route_link'
    print(f"URLs found: {len(urls)}")
except Exception as e:
    print(f"Error reading CSV file: {e}")
    urls = []

# Define a list to store all bus details data
bus_details = []

# Function to scrape bus details from the URL
def scrape_bus_details(url):
    print(f"Scraping data from: {url}")
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    try:
        bus_containers = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".clearfix.bus-item-details")))
        for container in bus_containers:
            try:
                # Extract bus details
                bus_name = container.find_element(By.CSS_SELECTOR, '.travels.lh-24.f-bold.d-color').text.strip() or 'N/A'
                bus_type = container.find_element(By.CSS_SELECTOR, '.bus-type.f-12.m-top-16.l-color.evBus').text.strip() or 'N/A'
                departing_time = container.find_element(By.CSS_SELECTOR, '.dp-time.f-19.d-color.f-bold').text.strip() or 'N/A'
                duration = container.find_element(By.CSS_SELECTOR, '.dur.l-color.lh-24').text.strip() or 'N/A'
                reaching_time = container.find_element(By.CSS_SELECTOR, '.bp-time.f-19.d-color.disp-Inline').text.strip() or 'N/A'
                price = container.find_element(By.CSS_SELECTOR, '.fare.d-block span.f-19.f-bold').text.strip() or 'N/A'
                
                # Attempt to find the star rating element
                try:
                    star_rating = container.find_element(By.CSS_SELECTOR, '.rating-sec .rating span').text.strip() or 'N/A'
                except NoSuchElementException:
                    star_rating = 'N/A'
                
                seat_availability = container.find_element(By.CSS_SELECTOR, '.seat-left').get_attribute('innerText').strip() or 'N/A'
               
                bus_details.append({
                    "bus_name": bus_name,
                    "bus_type": bus_type,
                    "departure_time": departing_time,
                    "duration": duration,
                    "arrival_time": reaching_time,
                    "price": price,
                    "star_rating": star_rating,
                    "seat_availability": seat_availability
                })
            except NoSuchElementException as e:
                print(f"Element not found: {e}")
            except Exception as e:
                print(f"An unexpected error occurred while processing a bus container: {e}")
    except TimeoutException:
        print(f"Timed out waiting for bus containers on page: {url}")
    except Exception as e:
        print(f"An error occurred while scraping {url}: {e}")

# Iterate through each URL and scrape data
for url in urls:
    scrape_bus_details(url)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(bus_details)
df.to_csv("apsrtc_alldetails.csv", index=False)

# Print the scraped bus details
print(df.head() if not df.empty else "No data to display")

# Close the driver
driver.quit()


URLs found: 49
Scraping data from: https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada
Scraping data from: https://www.redbus.in/bus-tickets/kakinada-to-visakhapatnam
Scraping data from: https://www.redbus.in/bus-tickets/visakhapatnam-to-kakinada
Scraping data from: https://www.redbus.in/bus-tickets/chittoor-andhra-pradesh-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/kadapa-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/ananthapur-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/tirupathi-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/visakhapatnam-to-vijayawada
Scraping data from: https://www.redbus.in/bus-tickets/ongole-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/bangalore-to-tirupathi
Scraping data from: https://www.redbus.in/bus-tickets/macherla-to-hyderabad
Scraping data from: https://www.redbus.i