# WDB - Web scraping

- Rami Tarabishi

In [82]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Routes scraped from flightsfrom.com: one row per (origin, destination) pair.
route_df = pd.DataFrame(columns=['outgoing_code', 'incoming_code'])

# Flight offers scraped from Google Flights. The columns mirror the keys of the
# dict returned by parse_flight_text, including the two date fields, so the
# empty frame already has the same schema as the rows concatenated onto it.
price_df = pd.DataFrame(
    columns=[
        'outgoing', 'incoming', 'stops', 'duration', 'price', 'airline',
        'departure_date', 'return_date',
    ]
)

In [3]:
# Collect the command-line switches Chrome is launched with.
chrome_options = Options()

# Uncomment to run without a visible browser window:
# chrome_options.add_argument("--headless")

# NOTE(review): "--disable-images" may not be a switch Chrome recognises;
# confirm that it actually has an effect.
for switch in ("--disable-notifications", "--disable-images"):
    chrome_options.add_argument(switch)

In [4]:
# Get the top N airports that I want to scrape
# Get all outgoing flight routes from: https://www.flightsfrom.com/
# Set dates for the flight
# Scrape the prices for the flights
# Save the data in a csv file
# EDA on the data

In [36]:
# Start a Chrome session with the configured options
driver = webdriver.Chrome(options=chrome_options)

# Builder for chained keyboard/mouse actions on this session
actions = ActionChains(driver)

# Implicit wait: element lookups keep retrying for up to 5 seconds.
# NOTE(review): the Selenium docs advise against combining implicit and
# explicit waits; confirm the mix below behaves as intended.
driver.implicitly_wait(5)

# Explicit wait helper: conditions are polled for at most 10 seconds
wait = WebDriverWait(driver, timeout=10)

## Getting air routes:

For testing/WDB purposes, I'm going to limit myself to routes from Zurich, because I believe there will be enough data (~200 non-stop routes from flightsfrom.com × multiple different flights per route with different airlines etc. × the date range for prices).

In [67]:
# Scrape top N airports and their connections
driver.get('https://www.flightsfrom.com/top-100-airports')

# Wait for table to be loaded
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'hometoplist')))

# Run through the airports
try:
    # Navigating away from this page makes the row elements stale, so collect
    # every airport URL up front. Rows without an href are skipped because
    # the airport code below is sliced out of the URL.
    rows = driver.find_elements(By.CLASS_NAME, 'hometoplist-item')
    urls = [href for href in (row.get_attribute('href') for row in rows) if href]

    # Go to each airport
    for url in urls:
        # The outgoing airport code is the last three characters of the URL
        outgoing_code = url[-3:]

        # Go to the airport page and wait for the table to be loaded
        driver.get(url)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'uk-list')))

        # Some rows are hidden behind a "show all" button. Expanding them is
        # best effort: if the button is missing or cannot be clicked, carry
        # on with the rows that are visible. Only Selenium errors are
        # ignored, so interrupts and programming errors still surface.
        try:
            show_all_button = driver.find_element(By.CLASS_NAME, 'ff-show-all')
            banner = driver.find_element(By.ID, 'header')
            # Top banner is in the way so account for the banner height
            scroll_y = show_all_button.location['y'] - banner.size['height']
            driver.execute_script(f'window.scroll(0, {scroll_y})')
            show_all_button.click()
        except WebDriverException:
            pass

        # Get the outbound routes; the destination code is the first three
        # characters of each row's name text
        routes = driver.find_elements(By.CLASS_NAME, 'ff-li-list')
        data_list = [
            {
                'outgoing_code': outgoing_code,
                'incoming_code': route.find_element(By.CLASS_NAME, 'ff-row-name').text[:3],
            }
            for route in routes
        ]

        data = pd.DataFrame.from_dict(data_list)
        route_df = pd.concat([route_df, data], ignore_index=True)
        # Wait for a bit for rate limits
        time.sleep(2)
except TimeoutException:
    # A page that never finishes loading stops the crawl; routes gathered so
    # far stay in route_df.
    print('TimeoutException')

TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF685701502+60802]
	(No symbol) [0x00007FF68567AC02]
	(No symbol) [0x00007FF685537CE4]
	(No symbol) [0x00007FF685586D4D]
	(No symbol) [0x00007FF685586E1C]
	(No symbol) [0x00007FF6855CCE37]
	(No symbol) [0x00007FF6855AABBF]
	(No symbol) [0x00007FF6855CA224]
	(No symbol) [0x00007FF6855AA923]
	(No symbol) [0x00007FF685578FEC]
	(No symbol) [0x00007FF685579C21]
	GetHandleVerifier [0x00007FF685A0411D+3217821]
	GetHandleVerifier [0x00007FF685A460B7+3488055]
	GetHandleVerifier [0x00007FF685A3F03F+3459263]
	GetHandleVerifier [0x00007FF6857BB846+823494]
	(No symbol) [0x00007FF685685F9F]
	(No symbol) [0x00007FF685680EC4]
	(No symbol) [0x00007FF685681052]
	(No symbol) [0x00007FF6856718A4]
	BaseThreadInitThunk [0x00007FFC6BFF257D+29]
	RtlUserThreadStart [0x00007FFC6D5CAA48+40]


## Getting flight details and prices:

In [None]:
def parse_flight_text(text: str, dates: tuple = (None, None)) -> dict:
    '''
    A function to parse the text of a flight element
    -----
    Input:
        text: The text of the flight element. It is split on newlines and read
            at fixed, zero-based line positions: airline (3), duration (4),
            route (5), stops (6) and price (9).
        dates (Optional): A tuple with the dates of the flight (departure, return)
    -----
    Returns:
        A dictionary with the parsed data. The keys are outgoing, incoming,
        stops, duration, price, airline, departure_date and return_date.
        The duration is the list of digit groups found on the duration line
        (as strings); stops and price are integers.
    -----
    Raises:
        IndexError: If the text has fewer than 10 lines, or the price line
            contains no digits.
        ValueError: If the stops line is not "Nonstop" and does not start
            with an integer.
    '''
    # Split the text into lines
    lines = text.split('\n')

    # The route line starts with the origin code and ends with the destination code
    route = lines[5]
    outgoing, incoming = route[:3], route[-3:]

    # "Nonstop" means zero stops; otherwise the line starts with the count
    stops_text = lines[6]
    stops = 0 if stops_text == 'Nonstop' else int(stops_text.split(' ')[0])

    # Get the duration as the raw digit groups on the duration line
    duration = re.findall(r'\d+', lines[4])

    # Get the price. The amount may contain thousands separators (e.g.
    # "1,234"), so capture the whole digit run including separators and then
    # drop everything that is not a digit. Taking only the first plain digit
    # group would turn "1,234" into 1.
    price_token = re.findall(r"\d[\d,'\u2019]*", lines[9])[0]
    price = int(re.sub(r'\D', '', price_token))

    # Get the airline
    airline = lines[3]

    return {
        'outgoing': outgoing,
        'incoming': incoming,
        'stops': stops,
        'duration': duration,
        'price': price,
        'airline': airline,
        'departure_date': dates[0],
        'return_date': dates[1]
    }

In [38]:
# Load the Google Flights search page in the running browser session
flights_url = 'https://www.google.com/travel/flights'
driver.get(flights_url)

In [27]:
inputs = driver.find_elements(By.TAG_NAME, 'input')

for input_field in inputs:
    # print(input_field.get_attribute('aria-label'))
    if input_field.get_attribute('aria-label') is not None:
        match input_field.get_attribute('aria-label'):
            case 'Where from?':
                input_field.clear()
                input_field.send_keys('ZRH')
                # Pick the first suggestion from the dropdown
                list_elements = driver.find_elements(By.TAG_NAME, 'li')
                for li in list_elements:
                    if li.get_attribute('role') == 'option':
                        time.sleep(1)
                        li.click()
                        break
                time.sleep(2)
            case 'Where to?':
                input_field.send_keys('LHR')
                # Pick the first suggestion from the dropdown
                list_elements = driver.find_elements(By.TAG_NAME, 'li')
                for li in list_elements:
                    if li.get_attribute('role') == 'option':
                        time.sleep(1)
                        li.click()
                        break
                time.sleep(2)
            case 'Departure':
                input_field.send_keys('2024-05-01')
                time.sleep(1)
                input_field.send_keys(Keys.RETURN)
                time.sleep(2)
            case 'Return':
                input_field.send_keys('2024-05-08')
                time.sleep(1)
                input_field.send_keys(Keys.RETURN)
                time.sleep(2)

# Get out of the date picker by tabbing twice, Conveniently this puts us on the search button so we can just press enter after
actions.send_keys(Keys.TAB).perform()
time.sleep(1)
actions.send_keys(Keys.TAB).perform()
time.sleep(1)
actions.send_keys(Keys.RETURN).perform()

# Wait for the page to load by waiting for an H2 that says "Search results"
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'h2')))

# Get flight data, all flights are listed in 2 ULs, both of them are conveniently the last 2 ULs
# Get all UL elements
ULs = driver.find_elements(By.TAG_NAME, 'ul')

best_departing = ULs[-2]

# Get the list of flights
flights = best_departing.find_elements(By.TAG_NAME, 'li')

# Go through each flight and save it to the dataframe
data_list = []
for flight in flights:
    data_list.append(parse_flight_text(flight.text))

data = pd.DataFrame.from_dict(data_list)
price_df = pd.concat([price_df, data], ignore_index=True)

other_flights = ULs[-1]

# Get the list of flights
flights = other_flights.find_elements(By.TAG_NAME, 'li')

# Go through each flight and save it to the dataframe
data_list = []
for i in range(10):
    flight = flights[i]
    print(flight.text)
    print("")

data = pd.DataFrame.from_dict(data_list)
price_df = pd.concat([price_df, data], ignore_index=True)

In [107]:
# Shut down the WebDriver session once scraping is finished
driver.quit()