# Scraping flight data from Kayak website.

### Importing

In [60]:
import re
import requests
from bs4 import BeautifulSoup
import csv

### User Input

Collect the list of source and destination cities from the user. The loop keeps prompting until the user enters -1.

In [61]:
# Collect routes interactively; each route is one (source, destination) pair
# stored at the same index of `sources` and `destinations`.
STOP_TOKEN = "-1"  # sentinel the user types to finish entering routes
sources = []
destinations = []
print("Please enter -1 when done.")
print("-"*10)
while True:
    # Strip surrounding whitespace so " -1" still ends the loop and stray
    # spaces never end up in the search URL built from these values.
    source = input("From which city?\n").strip()
    if source == STOP_TOKEN:
        break
    destination = input("Where to?\n").strip()
    if destination == STOP_TOKEN:
        # The route is incomplete, so the source just entered is discarded.
        break
    # Only store a route once both ends are known; the lists stay aligned.
    sources.append(source)
    destinations.append(destination)
    print("-"*10)

print("\nRoutes:")
for source, destination in zip(sources, destinations):
    print(f"{source} => {destination}")

Please enter -1 when done.
----------
From which city?
NYC
Where to?
DXB
----------
From which city?
-1

Routes:
NYC => DXB


This cell takes a start date and an end date from the user in "YYYY-MM-DD" format and parses them with NumPy's `np.datetime64`. It then calculates the number of days between the two dates and generates every date from the start date through the end date, inclusive. The resulting `date_list` contains those dates as strings in the same "YYYY-MM-DD" format.

In [70]:
import numpy as np

# Read the first and last travel dates from the user
start_date = np.datetime64(input('Start Date, Please use YYYY-MM-DD format only '))
end_date = np.datetime64(input('End Date, Please use YYYY-MM-DD format only '))

# Whole days separating the two dates
num_days = (end_date - start_date).item().days

# One date string per day from start_date through end_date inclusive;
# splitting on "T" drops any time-of-day part from the string form
date_list = [
    str(start_date + np.timedelta64(day_offset, 'D')).split("T")[0]
    for day_offset in np.arange(num_days + 1)
]


Start Date, Please use YYYY-MM-DD format only 2023-11-02
End Date, Please use YYYY-MM-DD format only 2023-11-04


### Scraping Methods

This function scrapes the flight operator (airline) information from the website. It also dismisses the pop-up if one is present.

In [63]:
def getAirline(flight_element):
    """Return the operator name(s) shown on one flight result element.

    Args:
        flight_element: Selenium WebElement for a single flight result.

    Returns:
        list[str]: one entry per ``div.J0g6-labels-grp`` found inside
        ``flight_element``. Each entry is the text of that group's
        ``J0g6-operator-text`` div up to the first "•", stripped of
        surrounding whitespace, or "" when the group has no such div.

    Note:
        Reads the module-level ``driver`` for the pop-up check.
    """
    # NOTE(review): 'XPATH_FOR_THE_POPUP_ELEMENT' is a placeholder, not a real
    # locator — replace it with the pop-up's actual XPath, otherwise this
    # check will presumably never fire.
    popup_present = driver.find_elements(By.XPATH, 'XPATH_FOR_THE_POPUP_ELEMENT')

    if popup_present:
        # If the pop-up is present, locate and click the SVG close button
        svg_button = driver.find_element(By.CSS_SELECTOR, 'svg.dDYU-closeIcon.dDYU-mod-theme-default')
        svg_button.click()

    airline = []
    for webelement in flight_element.find_elements(By.CSS_SELECTOR, 'div.J0g6-labels-grp'):
        ele_soup = BeautifulSoup(webelement.get_attribute('outerHTML'), 'html.parser')
        # e.g. <div class="J0g6-operator-text">Kuwait Airways</div>
        operator_div = ele_soup.find('div', class_='J0g6-operator-text')
        # A group without an operator label yields "" instead of raising
        # AttributeError on None, mirroring how getDuration treats missing data.
        operator_text = operator_div.text if operator_div is not None else ""
        # Keep only the part before the "•" separator.
        airline.append(operator_text.split("•")[0].strip())

    return airline


This function is used to scrape departure time, arrival time and number of stops from the website.

In [64]:
def getTimeAndStopsCount(flight_element):
    """Read departure time, arrival time and stop count from one flight element.

    Returns:
        tuple of three lists ``(departureTime, arrivalTime, stopsList)``.
        On success each list holds exactly one string: the two times are ""
        when the time text contains no "–", and the stop count is "0" when
        the stops text contains no digits. If anything raises, the error is
        printed and whatever was collected so far is returned.
    """
    departureTime, arrivalTime, stopsList = [], [], []

    try:
        # Both lookups happen before any list is touched, so a failed lookup
        # leaves all three lists empty.
        time_element = flight_element.find_element(By.CSS_SELECTOR, 'div.vmXl.vmXl-mod-variant-large')
        stops_element = flight_element.find_element(By.CSS_SELECTOR, 'span.JWEO-stops-text')

        time_info = time_element.text
        stops_info = stops_element.text

        # "<departure> – <arrival>[+N]": keep both sides, drop the "+N" suffix
        time_parts = time_info.split('–')
        has_both_times = len(time_parts) > 1
        departureTime.append(time_parts[0].strip() if has_both_times else "")
        arrivalTime.append(time_parts[1].split('+')[0].strip() if has_both_times else "")

        # First run of digits in the stops text, or "0" when there is none
        first_number = re.search(r'\d+', stops_info)
        stopsList.append(first_number.group() if first_number else "0")

    except Exception as error:
        print("Unable to extract time and stops count:", str(error))

    return departureTime, arrivalTime, stopsList


This function is used to scrape total flight time from the website.

In [65]:
def getDuration(flight_element):
    """Collect the flight-time text shown inside one flight result element.

    Returns:
        list[str]: one entry per ``div.xdW8.xdW8-mod-full-airport`` element
        found inside ``flight_element``; the entry is the text of its
        ``vmXl vmXl-mod-variant-default`` div, or "" when that div is absent.
    """
    duration = []

    for block_element in flight_element.find_elements(By.CSS_SELECTOR, 'div.xdW8.xdW8-mod-full-airport'):
        block_soup = BeautifulSoup(block_element.get_attribute('outerHTML'), 'html.parser')
        time_div = block_soup.find('div', class_='vmXl vmXl-mod-variant-default')
        # A missing div contributes an empty string so positions stay aligned
        duration.append("" if time_div is None else time_div.text)

    return duration


This function is used to scrape flight price from the website.

In [66]:
def getPrice(flight_element):
    """Return the price text(s) shown on one flight result element.

    Args:
        flight_element: Selenium WebElement for a single flight result.

    Returns:
        list[str]: one entry per ``div.f8F1-price-text-container`` found
        inside ``flight_element``. Each entry is the text of the container's
        ``f8F1-price-text`` div with every "C$" removed and surrounding
        whitespace stripped, or "" when the container has no such div.
    """
    flight_price_info = []

    for webelement in flight_element.find_elements(By.CSS_SELECTOR, 'div.f8F1-price-text-container'):
        ele_soup = BeautifulSoup(webelement.get_attribute('outerHTML'), 'html.parser')
        price_div = ele_soup.find('div', class_='f8F1-price-text')
        if price_div is None:
            # No price node in this container: record "" rather than raising
            # AttributeError on None, mirroring how getDuration treats missing data.
            flight_price_info.append("")
            continue
        flight_price_info.append(price_div.text.replace("C$", "").strip())

    # Return the list of flight prices for this specific flight element
    return flight_price_info


This function is used to scrape layover times and layover cities from the website.

In [67]:
def getLayovers(flight_element):
    """Return layover durations and layover cities for one flight result element.

    Every ``div.c_cgF.c_cgF-mod-variant-full-airport`` inside
    ``flight_element`` is parsed, and each ``<span title=...>`` in it is read
    as "<time> layover, <b><city></b>". Spans whose title does not contain
    ", <b>" are skipped.

    Args:
        flight_element: Selenium WebElement for a single flight result.

    Returns:
        list: ``[layovertime, layovercities]``, two parallel lists of strings.
        Each parsed div that has at least one layover contributes one entry
        to both lists; several layovers in the same div are joined with ",".

    Note:
        Reads the module-level ``driver`` for the pop-up check.
    """
    # NOTE(review): 'XPATH_FOR_THE_POPUP_ELEMENT' is a placeholder, not a real
    # locator — replace it with the pop-up's actual XPath, otherwise this
    # check will presumably never fire.
    popup_present = driver.find_elements(By.XPATH, 'XPATH_FOR_THE_POPUP_ELEMENT')

    if popup_present:
        # If the pop-up is present, locate and click the SVG close button
        svg_button = driver.find_element(By.CSS_SELECTOR, 'svg.dDYU-closeIcon.dDYU-mod-theme-default')
        svg_button.click()

    layovertime = []
    layovercities = []
    closing_tag = '</b>'

    layover_containers = flight_element.find_elements(By.CSS_SELECTOR, 'div.c_cgF.c_cgF-mod-variant-full-airport')
    for webelement in layover_containers:
        ele_soup = BeautifulSoup(webelement.get_attribute('outerHTML'), 'html.parser')

        times = []
        cities = []
        for layover_element in ele_soup.find_all('span', title=True):
            layover_parts = layover_element['title'].split(', <b>')
            if len(layover_parts) < 2:
                # Title is not in the "<time> layover, <b><city></b>" shape.
                continue
            times.append(layover_parts[0].replace("layover", "").strip())
            city = layover_parts[1]
            # Remove the literal closing tag only. str.strip('</b>') would
            # strip any of the characters '<', '/', 'b', '>' from both ends
            # and so truncate names such as "Zagreb".
            if city.endswith(closing_tag):
                city = city[:-len(closing_tag)]
            cities.append(city.strip())

        # A div without layovers adds nothing to either list.
        if times:
            layovertime.append(",".join(times))
            layovercities.append(",".join(cities))

    return [layovertime, layovercities]

This function is used to scrape baggage details of the flights from the website.

In [68]:
def getBagDetails(flight_element):
    """Return carry-on and checked baggage texts for one flight result element.

    The values are read in the browser by a JavaScript snippet that takes the
    odd-indexed ``div.ac27-inner`` nodes inside ``flight_element`` and assigns
    them alternately to the carry-on and checked lists.

    Args:
        flight_element: Selenium WebElement for a single flight result.

    Returns:
        tuple: ``(carryOnBags, checkedBags)``, two lists of strings. A list is
        ``["N/A"]`` when nothing was found for it; both are ``["N/A"]`` when
        the script fails, in which case the error is printed.

    Note:
        Reads the module-level ``driver`` for the pop-up check and to run the script.
    """
    # NOTE(review): 'XPATH_FOR_THE_POPUP_ELEMENT' is a placeholder, not a real
    # locator — replace it with the pop-up's actual XPath, otherwise this
    # check will presumably never fire.
    popup_present = driver.find_elements(By.XPATH, 'XPATH_FOR_THE_POPUP_ELEMENT')

    if popup_present:
        # If the pop-up is present, locate and click the SVG button to close it
        svg_button = driver.find_element(By.CSS_SELECTOR, 'svg.dDYU-closeIcon.dDYU-mod-theme-default')
        svg_button.click()

    # Every variable is declared with const/let so the snippet does not create
    # implicit globals on the page's window object.
    script = """
        const bags = arguments[0].querySelectorAll('div.ac27-inner');
        const carryBags = [];
        const checkinBags = [];
        let isCarryOnBag = true;
        for (let i = 1; i < bags.length; i += 2) {
            (isCarryOnBag ? carryBags : checkinBags).push(bags[i].textContent);
            isCarryOnBag = !isCarryOnBag;
        }
        return [carryBags, checkinBags];
    """

    try:
        baggage_info = driver.execute_script(script, flight_element)
        carryOnBags = baggage_info[0]
        checkedBags = baggage_info[1]
    except Exception as e:
        # Best effort: a failed lookup must not abort the whole scrape.
        print("Unable to extract baggage information:", str(e))
        return ["N/A"], ["N/A"]

    # An empty list means the page showed no value for that baggage type.
    return carryOnBags or ["N/A"], checkedBags or ["N/A"]


### Main block of Scraping
- **Steps involved:**
  - Importing all the required libraries
  - Defining the CSV file
  - `scrape_flight_data_to_csv` contains all the scraping steps
  - `scroll_and_click_show_more_button` loads all the data by clicking the "Show more" button until it is gone
  - The code then appends all the data to the predefined lists
  - And writes all the data into the `.csv` file

In [71]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

# The output file is named after the first route; its header row is written
# once here and the data rows are appended later by scrape_flight_data_to_csv.
csv_filename = f"{sources[0]}_{destinations[0]}_NOV.csv"
column_names = [
    'Date', "Airline", "Duration", "Source", "Destination",
    "Departure Time", "Arrival Time",
    "Layover Time", "Layover Cities", "Total stops",
    "Carry bags Count", "Checkin bags Count", "Price",
]

# Mode 'w' truncates any existing file of the same name before the header is written
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv.writer(csvfile).writerow(column_names)


# Function to scrape flight data and save it to the CSV file
def scrape_flight_data_to_csv(driver, source, destination, current_date, csv_filename):
    """Scrape all Kayak results for one route and date and append them to a CSV file.

    Loads the search page, clicks "Show more" until the button is no longer
    clickable, extracts the fields of every ``div.nrc6`` flight element with
    the get* helper functions, and appends one CSV row per extracted entry.

    Args:
        driver: Selenium WebDriver used to load and query the results page.
        source: origin value placed in the search URL.
        destination: destination value placed in the search URL.
        current_date: departure date string placed in the search URL.
        csv_filename: path of the CSV file the rows are appended to.

    Returns:
        None. Rows are written in the order Date, Airline, Duration, Source,
        Destination, Departure Time, Arrival Time, Layover Time, Layover
        Cities, Total stops, Carry bags Count, Checkin bags Count, Price.

    Note:
        getAirline, getLayovers and getBagDetails read the module-level name
        ``driver`` rather than this parameter, so the caller must bind the
        same WebDriver to that global name.
    """
    # Construct the URL based on source, destination, and date
    url = f"https://www.ca.kayak.com/flights/{source}-{destination}/{current_date}?sort=bestflight_a"
    driver.get(url)

    # Fixed pause to let the results page render (adjust to your connection)
    sleep(5)

    # NOTE(review): 'XPATH_FOR_THE_POPUP_ELEMENT' is a placeholder, not a real
    # locator — replace it with the pop-up's actual XPath, otherwise this
    # check will presumably never fire.
    popup_present = driver.find_elements(By.XPATH, 'XPATH_FOR_THE_POPUP_ELEMENT')

    if popup_present:
        # If the pop-up is present, locate and click the SVG close button
        svg_button = driver.find_element(By.CSS_SELECTOR, 'svg.dDYU-closeIcon.dDYU-mod-theme-default')
        svg_button.click()

    def scroll_and_click_show_more_button():
        """Click "Show more" once; return True when there is nothing left to click."""
        try:
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//div[@class="ULvh-button show-more-button"]'))
            )
            ActionChains(driver).move_to_element(show_more_button).perform()
            show_more_button.click()
            return False
        except Exception:
            # Any failure to find or click the button is treated as "all results
            # loaded". `except Exception` (not a bare except) so that
            # KeyboardInterrupt and SystemExit still stop the run.
            return True

    # Keep clicking "Show more" until it is no longer available
    while not scroll_and_click_show_more_button():
        pass

    # One list of cells per output row, in CSV column order minus Date/Source/Destination
    flight_data = []

    flight_elements = driver.find_elements(By.XPATH, '//div[@class="nrc6"]')
    for flight_element in flight_elements:
        airline = getAirline(flight_element)
        duration = getDuration(flight_element)
        price = getPrice(flight_element)
        departureTimes, arrivalTimes, stops = getTimeAndStopsCount(flight_element)
        carryBags, checkinBags = getBagDetails(flight_element)
        layoverTimes, layoverCities = getLayovers(flight_element)

        columns = [airline, duration, departureTimes, arrivalTimes,
                   layoverTimes, layoverCities, stops,
                   carryBags, checkinBags, price]

        # The helpers may return lists of different lengths; shorter ones are
        # padded with "" so every row has a value in every column.
        max_length = max(len(column) for column in columns)
        for i in range(max_length):
            flight_data.append([column[i] if i < len(column) else "" for column in columns])

    # Read the route labels displayed on the page. Variables are declared with
    # const so the snippet does not create globals (the original assigned a
    # global named `list`) on the page's window object.
    script = """
        const vals = document.querySelectorAll('div.vvTc-item-value');
        return Array.from(vals).slice(0, 2).map(v => v.textContent);
    """
    try:
        details = driver.execute_script(script) or []
    except Exception as e:
        print("Unable to read route labels from the page:", str(e))
        details = []
    # Fall back to the requested codes when the page does not expose both labels,
    # so already-scraped rows are still written instead of being lost.
    src = details[0] if len(details) > 0 else source
    dest = details[1] if len(details) > 1 else destination

    # Append all the flight data to the CSV file
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        for cells in flight_data:
            # cells[:2] = Airline, Duration; cells[2:] = the remaining columns
            csv_writer.writerow([current_date, *cells[:2], src, dest, *cells[2:]])


# Scrape each entered route (sources[i] => destinations[i]) for every requested
# date. The routes were entered and printed as pairs, so they are zipped rather
# than combined as every source with every destination.
for source, destination in zip(sources, destinations):
    for current_date in date_list:
        # A fresh browser per search; the get* helpers read this module-level `driver`.
        driver = webdriver.Chrome()
        try:
            scrape_flight_data_to_csv(driver, source, destination, current_date, csv_filename)
        finally:
            # Always close the browser, even if scraping raised, so no Chrome
            # process is left running.
            driver.quit()


 - EVERYTHING SAVED INTO CSV
### THE END
