In [2]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException

# --- Configuration -----------------------------------------------------------
# NOTE(review): absolute, machine-specific path kept unchanged so the notebook
# still runs where it was written; point INPUT_CSV at a project-relative file
# before sharing.
INPUT_CSV = r'C:\Users\srip1\OneDrive\Desktop\guvi\project_1\redbus_dataset.csv'
OUTPUT_CSV = 'redbus_all_red.csv'

PAGE_LOAD_WAIT_S = 5     # fixed pause after driver.get() before scraping
SCROLL_PAUSE_S = 2       # pause after each scroll so lazy-loaded items can render
MAX_SCROLL_ROUNDS = 50   # safety cap so a page that keeps growing cannot hang the cell

RESULT_COLUMNS = ["Bus_name", "Bus Type", "Departure Time", "Arrival Time", "Duration",
                  "Fare", "Rating", "Seat_Availability", "Bus Link", "Bus Route"]

# Locator for one bus card on a route page.
BUS_ITEM_XPATH = '//div[@class="clearfix bus-item"]'

# Per-field locators, evaluated relative to a single bus card.
# Order matches the first eight entries of RESULT_COLUMNS.
FIELD_LOCATORS = [
    (By.XPATH, './/div[@class="travels lh-24 f-bold d-color"]'),            # Bus_name
    (By.XPATH, './/div[@class="bus-type f-12 m-top-16 l-color evBus"]'),    # Bus Type
    (By.XPATH, './/div[@class="dp-time f-19 d-color f-bold"]'),             # Departure Time
    (By.XPATH, './/div[@class="bp-time f-19 d-color disp-Inline"]'),        # Arrival Time
    (By.XPATH, './/div[@class="dur l-color lh-24"]'),                       # Duration
    (By.CSS_SELECTOR, 'span.f-19.f-bold'),                                  # Fare
    (By.XPATH, './/div[contains(@class, "rating-sec") and contains(@class, "lh-24")]'),  # Rating
    (By.XPATH, './/div[contains(@class, "seat-left") and contains(@class, "m-top-30")]'),  # Seat_Availability (raw text)
]


def _text_or_none(parent, by, selector):
    """Return the stripped text of the first matching child of `parent`, or None.

    Only Selenium errors (missing element, stale element, ...) are treated as
    "field not present"; anything else, including KeyboardInterrupt, propagates
    so the cell can still be interrupted.
    """
    try:
        return parent.find_element(by, selector).text.strip()
    except WebDriverException:
        return None


def _scroll_to_bottom(driver, pause_s=SCROLL_PAUSE_S, max_rounds=MAX_SCROLL_ROUNDS):
    """Scroll to the end of the page until the document height stops growing.

    Jumping straight to the bottom each round is what triggers lazy loading.
    Sending a single PAGE_DOWN and comparing heights stops after the first key
    press whenever that one step does not itself load new content.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_s)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _scrape_route(driver, url, bus_route):
    """Load one route page and return a DataFrame with one row per bus card.

    Missing fields are stored as None. `url` and `bus_route` are copied onto
    every row. Raises WebDriverException if navigation or scrolling fails.
    """
    driver.get(url)
    print(f"Scraping data from: {url}")

    # Wait for the page to load (adjust as needed)
    time.sleep(PAGE_LOAD_WAIT_S)
    _scroll_to_bottom(driver)

    rows = []
    for card in driver.find_elements(By.XPATH, BUS_ITEM_XPATH):
        values = [_text_or_none(card, by, selector) for by, selector in FIELD_LOCATORS]

        # Seat text: keep only the leading token; empty or missing text becomes None.
        seat_tokens = values[-1].split() if values[-1] else []
        values[-1] = seat_tokens[0] if seat_tokens else None

        rows.append((*values, url, bus_route))

    return pd.DataFrame(rows, columns=RESULT_COLUMNS)


# --- Run the scrape ----------------------------------------------------------
# Read URLs from CSV into a DataFrame, including bus_link and bus_route.
# Done before launching Chrome so a bad path does not leave a browser open.
urls_df = pd.read_csv(INPUT_CSV, usecols=['bus_link', 'bus_route'])

# One DataFrame per successfully scraped URL
all_dfs = []

# Initialize Chrome WebDriver
driver = webdriver.Chrome()
try:
    driver.maximize_window()

    for i, row in urls_df.iterrows():
        url = str(row['bus_link']).strip()  # NaN becomes the string 'nan'
        bus_route = row['bus_route']

        # Skip missing/blank links instead of asking the browser to open them
        if not url or url.lower() == 'nan':
            print(f"Skipping invalid URL at index {i}: {url}")
            continue

        try:
            all_dfs.append(_scrape_route(driver, url, bus_route))
        except WebDriverException as e:
            # Best-effort: report the failing route and carry on with the next one
            print(f"Error navigating to URL: {url}")
            print(str(e))  # Print the exception details for debugging purposes
finally:
    # Always release the browser, even on an unexpected error or kernel interrupt
    driver.quit()

# Concatenate all per-URL DataFrames; pd.concat raises ValueError on an empty
# list, so fall back to an empty frame with the expected columns.
if all_dfs:
    total_results = pd.concat(all_dfs, ignore_index=True)
else:
    total_results = pd.DataFrame(columns=RESULT_COLUMNS)

# Save the concatenated DataFrame to a CSV file
total_results.to_csv(OUTPUT_CSV, index=False)

# Print the final concatenated DataFrame
print(total_results)


Scraping data from: https://www.redbus.in/bus-tickets/pune-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-pune
Scraping data from: https://www.redbus.in/bus-tickets/mumbai-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-mumbai
Scraping data from: https://www.redbus.in/bus-tickets/pandharpur-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/bangalore-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-pandharpur
Scraping data from: https://www.redbus.in/bus-tickets/belagavi-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/solapur-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-kolhapur-maharashtra
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-solapur
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-sangola
Scraping data from: https://www.redbus.in/bus-tickets/sangola-to-goa
Scraping data from

In [2]:
# Display the combined scrape results built by the scraping cell above
# (a bare last expression renders the DataFrame with the notebook's rich display).
total_results

Unnamed: 0,Bus_name,Bus Type,Departure Time,Arrival Time,Duration,Fare,Rating,Seat_Availability,Bus Link,Bus Route
0,Kadamba Transport Corporation Limited (KTCL) -...,Non AC Seater 2+2,19:00,05:00,10h 00m,600,4.2,25,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
1,Atmaram Gobus,VE A/C Sleeper (2+1),21:00,07:30,10h 30m,999,4.6,1,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
2,Ashray Travels,Bharat Benz A/C Sleeper (2+1),21:00,08:30,11h 30m,950,4.5,11,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
3,Ashray Amrutyog,A/C Sleeper (2+1),22:00,08:00,10h 00m,799,4.4,13,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
4,IntrCity SmartBus,AC Sleeper (2+1),23:50,10:00,10h 10m,676,4.5,14,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
...,...,...,...,...,...,...,...,...,...,...
835,City Land Travels,Volvo A/C Semi Sleeper (2+2),19:30,01:30,06h 00m,656,3.5,11,https://www.redbus.in/bus-tickets/palampur-to-...,Palampur to Chandigarh
836,New Himalaya Travels,Volvo A/C Semi Sleeper (2+2),19:35,01:30,05h 55m,720,4.8,8,https://www.redbus.in/bus-tickets/palampur-to-...,Palampur to Chandigarh
837,Laxmi holidays,,,,,,,,https://www.redbus.in/bus-tickets/palampur-to-...,Palampur to Chandigarh
838,,,,,,,,,https://www.redbus.in/bus-tickets/palampur-to-...,Palampur to Chandigarh
