In [14]:
pip install pandas selenium mysql-connector-python sqlalchemy streamlit





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.webdriver.support import expected_conditions as EC
import time

In [7]:
#read the csv file
df = pd.read_csv(r'D:\redbus -REDBUS\01_bus_routes.csv')

#store scraped data
all_bus_details = []

#XPATH
xpath_bus_details = '//div[@class="clearfix row-one"]' # this xpath targets the data needs to be scrapped

In [9]:
#create a chrome driver instance
def initialize_chromedriver():
    try:
        driver = webdriver.Chrome()
        return driver
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        sys.exit(1)

In [10]:
#Helper function to select view_buses
def view_buses(driver):
    xpath_view_buses = '//div[@class="button" and text()="View Buses"]'
    
    try:
        wait = WebDriverWait(driver,5)
        object_viewbuses = wait.until(EC.presence_of_all_elements_located((By.XPATH,xpath_view_buses)))
    except (NoSuchElementException,TimeoutException):
        return
    
    if object_viewbuses:
        for element in reversed(object_viewbuses): #reversed() to click button from bottom so, following buttons will be in sight
                    try:
                        time.sleep(2)
                        element.click()
                        time.sleep(2) # 2 sec wait to load dynamic content
                    except Exception:
                        continue

In [11]:
#helper function to scroll bottom of page dynamically
def scroll_to_bottom(driver):
    while True:
        # get initial page height
        current_page_height = driver.execute_script("return document.body.scrollHeight")

        #scroll to bottom of page
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        
        # Wait for the page to load new content...if any
        time.sleep(1)

        #get new height after scrolling page
        currentpage_new_height = driver.execute_script("return document.body.scrollHeight") 

        if currentpage_new_height == current_page_height:
            break
        
        current_page_height = currentpage_new_height

In [13]:
# for each iteration it scrapes current page data
def scrape_current_page(state, route, link, elements_data):
    bus_details = []
    for element in elements_data:
        try:
            bus_name = element.find_element(By.XPATH, './/div[@class="travels lh-24 f-bold d-color"]').text
        except NoSuchElementException:
            bus_name = "NA"

        try:
            bus_type = element.find_element(By.XPATH, './/div[@class="bus-type f-12 m-top-16 l-color evBus"]').text
        except NoSuchElementException:
            bus_type = "NA"

        try:
            departing_time = element.find_element(By.XPATH, './/div[@class="dp-time f-19 d-color f-bold"]').text
        except NoSuchElementException:
            departing_time = "NA"

        try:
            duration = element.find_element(By.XPATH, './/div[@class="dur l-color lh-24"]').text
        except NoSuchElementException:
            duration = "NA"

        try:
            reaching_time = element.find_element(By.XPATH, './/div[@class="bp-time f-19 d-color disp-Inline"]').text
        except NoSuchElementException:
            reaching_time = "NA"

        try:
            price = element.find_element(By.XPATH, './/span[contains(@class, "f-19 f-bold") or contains(@class, "f-bold f-19")]').text
        except NoSuchElementException:
            price = "NA"

        try:
            star_rating = element.find_element(By.XPATH, './/div[@class="rating-sec lh-24"]//span').text
        except NoSuchElementException:
            try:
                element.find_element(By.XPATH, './/span[contains(@class, "blue rating_badge")]')
                star_rating = '0'
            except NoSuchElementException:
                star_rating = '0'

        try:
            seat_available = element.find_element(By.XPATH, './/div[contains(@class, "seat-left") and (contains(@class, "m-top-16") or contains(@class, "m-top-30"))]').text
        except NoSuchElementException:
            seat_available = "NA"

        bus_details.append([state, route, link, bus_name, bus_type, departing_time, duration, reaching_time, price, star_rating, seat_available])
    
    return bus_details

In [14]:
#check whether buses found in page
def check_oops_message(driver):
    check_xpath = (
        "//div[(contains(@class, 'oops-wrapper') and //h3[text()='Oops! No buses found.']) or "
        "(@class='oops-wrapper new_oops_wrapper' and text()='Oops! No buses found.')]"
    )
    try:
        wait = WebDriverWait(driver,3)
        wait.until(EC.presence_of_element_located((By.XPATH, check_xpath)))
        return True
    except (NoSuchElementException,TimeoutException):
        return False#Main function to scrape all bus details
def scrape_all_bus_details(driver):
    for _,row in df.iterrows():
        state = row['state_names']
        route = row['routes']
        link = row['links']

        try:
            driver.get(link)
            
            if check_oops_message(driver):
                continue

            driver.maximize_window()
            view_buses(driver)  #click on button view_buses
            scroll_to_bottom(driver)  #scroll pages dynamically to bottom
            wait = WebDriverWait(driver,5)
            elements_data = wait.until(EC.presence_of_all_elements_located((By.XPATH,xpath_bus_details)))

            if elements_data:
                current_page_data = scrape_current_page(state, route, link, elements_data)
                all_bus_details.extend(current_page_data)
            else:
                print(f"No buses available for state: {state}, route: {route}, link: {link}")    
        except Exception as e:
            print(f"Error on line: state: {state}, route: {route}, link: {link}")
            print(f"Error_details: {e}")         

    driver.quit()

In [15]:
def main_function():
    driver = initialize_chromedriver()
    scrape_all_bus_details(driver)
    final_data = pd.DataFrame(all_bus_details,columns=['state','route','link','busname','bustype','departing_time','duration','reaching_time','price','star_rating','seats_available'])
    final_data.to_csv(f'{directory}/02_big_data.csv', index=False)

In [16]:
def scrape_all_bus_details(driver):
    # Your scraping logic here
    pass  # Replace with actual implementation

def main_function():
    driver = initialize_chromedriver()
    scrape_all_bus_details(driver)
    final_data = pd.DataFrame(all_bus_details, columns=[
        'state', 'route', 'link', 'busname', 'bustype',
        'departing_time', 'duration', 'reaching_time',
        'price', 'star_rating', 'seats_available'
    ])
    # Save final_data to CSV as needed

# Call the main function
redbus_data = main_function()

In [19]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
import time
import sys

# Global variable for storing bus details
all_bus_details = []

# Function to initialize the Chrome WebDriver
def initialize_chromedriver():
    try:
        driver = webdriver.Chrome()
        return driver
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        sys.exit(1)

# Helper function to check for the "View Buses" button and click it
def view_buses(driver):
    xpath_view_buses = '//div[@class="button" and text()="View Buses"]'
    try:
        wait = WebDriverWait(driver, 5)
        object_viewbuses = wait.until(EC.presence_of_all_elements_located((By.XPATH, xpath_view_buses)))
    except (NoSuchElementException, TimeoutException):
        return
    
    if object_viewbuses:
        for element in reversed(object_viewbuses):
            try:
                time.sleep(2)
                element.click()
                time.sleep(2)
            except Exception:
                continue

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    while True:
        current_page_height = driver.execute_script("return document.body.scrollHeight")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(1)
        currentpage_new_height = driver.execute_script("return document.body.scrollHeight")
        if currentpage_new_height == current_page_height:
            break

# Function to scrape the current page for bus details
def scrape_current_page(state, route, link, elements_data):
    bus_details = []
    for element in elements_data:
        # Extract various details using try-except to handle missing elements
        bus_name = element.find_element(By.XPATH, './/div[@class="travels lh-24 f-bold d-color"]').text
        bus_type = element.find_element(By.XPATH, './/div[@class="bus-type f-12 m-top-16 l-color evBus"]').text
        departing_time = element.find_element(By.XPATH, './/div[@class="dp-time f-19 d-color f-bold"]').text
        duration = element.find_element(By.XPATH, './/div[@class="dur l-color lh-24"]').text
        reaching_time = element.find_element(By.XPATH, './/div[@class="bp-time f-19 d-color disp-Inline"]').text
        price = element.find_element(By.XPATH, './/span[contains(@class, "f-19 f-bold") or contains(@class, "f-bold f-19")]').text
        star_rating = element.find_element(By.XPATH, './/div[@class="rating-sec lh-24"]//span').text
        seat_available = element.find_element(By.XPATH, './/div[contains(@class, "seat-left") and (contains(@class, "m-top-16") or contains(@class, "m-top-30"))]').text

        bus_details.append([state, route, link, bus_name, bus_type, departing_time, duration, reaching_time, price, star_rating, seat_available])
    
    return bus_details

# Function to check if there are no buses found
def check_oops_message(driver):
    check_xpath = (
        "//div[(contains(@class, 'oops-wrapper') and //h3[text()='Oops! No buses found.']) or "
        "(@class='oops-wrapper new_oops_wrapper' and text()='Oops! No buses found.')]"
    )
    try:
        wait = WebDriverWait(driver, 3)
        wait.until(EC.presence_of_element_located((By.XPATH, check_xpath)))
        return True
    except (NoSuchElementException, TimeoutException):
        return False

# Main function to scrape all bus details
def scrape_all_bus_details(driver):
    df = pd.read_csv(r'D:\redbus -REDBUS\01_bus_routes.csv')  # Read CSV file
    for _, row in df.iterrows():
        state = row['state_names']
        route = row['routes']
        link = row['links']

        try:
            driver.get(link)
            if check_oops_message(driver):
                continue

            driver.maximize_window()
            view_buses(driver)
            scroll_to_bottom(driver)
            wait = WebDriverWait(driver, 5)
            elements_data = wait.until(EC.presence_of_all_elements_located((By.XPATH, xpath_bus_details)))

            if elements_data:
                current_page_data = scrape_current_page(state, route, link, elements_data)
                all_bus_details.extend(current_page_data)
            else:
                print(f"No buses available for state: {state}, route: {route}, link: {link}")    
        except Exception as e:
            print(f"Error on line: state: {state}, route: {route}, link: {link}")
            print(f"Error_details: {e}")         

# Main function to execute the scraping and save to CSV
def main_function():
    global all_bus_details
    driver = initialize_chromedriver()
    scrape_all_bus_details(driver)
    
    final_data = pd.DataFrame(all_bus_details, columns=[
        'state', 'route', 'link', 'busname', 'bustype',
        'departing_time', 'duration', 'reaching_time',
        'price', 'star_rating', 'seats_available'
    ])
    
    directory = r'D:\redbus -REDBUS'  # Ensure this directory exists
    final_data.to_csv(f'{directory}/02_big_data.csv', index=False)
    return final_data

# Call the main function
if __name__ == "__main__":
    redbus_data = main_function()


Error on line: state: APSRTC, route: Vijayawada to Hyderabad, link: https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad
Error_details: Message: unknown error: failed to change window state to 'normal', current state is 'maximized'
  (Session info: chrome=119.0.6045.160)
Stacktrace:
	GetHandleVerifier [0x00007FF7014482B2+55298]
	(No symbol) [0x00007FF7013B5E02]
	(No symbol) [0x00007FF7012705AB]
	(No symbol) [0x00007FF701252B20]
	(No symbol) [0x00007FF70125139E]
	(No symbol) [0x00007FF7012509B6]
	(No symbol) [0x00007FF7012FE2E9]
	(No symbol) [0x00007FF7012D20AA]
	(No symbol) [0x00007FF7012EAAA4]
	(No symbol) [0x00007FF7012D1E83]
	(No symbol) [0x00007FF7012A670A]
	(No symbol) [0x00007FF7012A7964]
	GetHandleVerifier [0x00007FF7017C0AAB+3694587]
	GetHandleVerifier [0x00007FF70181728E+4048862]
	GetHandleVerifier [0x00007FF70180F173+4015811]
	GetHandleVerifier [0x00007FF7014E47D6+695590]
	(No symbol) [0x00007FF7013C0CE8]
	(No symbol) [0x00007FF7013BCF34]
	(No symbol) [0x00007FF7013BD062