In [9]:
from time import sleep ,strftime
from datetime import datetime, timedelta
from random import randint
import pandas as pd
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys

In [10]:
chrome_driver_path = "C:\\{}\\{}\\{}\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"
service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=service)
sleep(2)

In [11]:
def load_more():
    """Click 'Load More' button if it exists."""
    try:
        more_results = '//div[@class = "ULvh-button show-more-button"]'
        driver.find_element(By.XPATH, more_results).click()
        print('Loading more results...')
        sleep(randint(5,8))
    except:
        print('No more results to load.')
        pass

In [12]:
def start_kayak(city_from, city_to, start_date,threshold_price):
    """Fetch data for 7 consecutive days starting from start_date."""
    final_df = pd.DataFrame()
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    
    for i in range(1):
        current_date = start_date + timedelta(days=i)
        date_str = current_date.strftime("%Y-%m-%d")
        print(f"Scraping data for {date_str}...")
        
        kayak_url = f'https://www.kayak.com/flights/{city_from}-{city_to}/{date_str}-flexible?sort=bestflight_a'
        driver.get(kayak_url)
        sleep(randint(8,12))

        # Close popup if exists
        try:
            pop_button = driver.find_element(By.XPATH, '//div[contains(@class,"bBPb-close")]')
            pop_button.click()
        except:
            pass

        sleep(randint(15,20))

        print("Scraping Best flights...")
        df_best = page_scrape(threshold_price)
        df_best['Sort'] = 'Best'
        #df_best['Date'] = date_str  # Add the formatted date

        # Switch to Cheapest
        price_sort = driver.find_element(By.XPATH, '//div[@data-content="price_a"]')
        price_sort.click()
        sleep(randint(10,15))
        print("Scraping Cheapest flights...")
        df_cheap = page_scrape(threshold_price)
        df_cheap['Sort'] = 'Cheapest'
        #df_cheap['Date'] = date_str  # Add the formatted date

        # Switch to Quickest
        duration_sort = driver.find_element(By.XPATH, '//div[@data-content="duration_a"]')
        duration_sort.click()
        sleep(randint(10,15))
        print("Scraping Quickest flights...")
        df_fast = page_scrape(threshold_price)
        df_fast['Sort'] = 'Quickest'
        #df_fast['Date'] = date_str  # Add the formatted date

        # Combine all for the current date
        current_df = pd.concat([df_best, df_cheap, df_fast], ignore_index=True)
        final_df = pd.concat([final_df, current_df], ignore_index=True)

    print("✅ Scraping complete.")
    return final_df

In [13]:
def page_scrape(threshold_price):
    """Scrape one page of flights into a DataFrame"""
    
    sleep(randint(2,4))
    xp_sections = '//div[@class="xdW8"]'
    sections = driver.find_elements(By.XPATH, xp_sections)
    if not sections:
        raise Exception("Captcha detected or no flights found.")
    
    sections_list = [sec.text for sec in sections]
    
    durations = []
    cities = []
    for sec in sections_list:
        parts = sec.split()
        durations.append(' '.join(parts[0:2]))
        cities.append(' '.join(parts[2:5]))

    xp_prices = '//div[@class="e2GB-price-text"]'
    prices = driver.find_elements(By.XPATH, xp_prices)
    prices_list = [int(price.text.replace("$",'').replace(',','')) for price in prices if price.text]

    xp_stops = '//div[@class="JWEO"]/div[1]'
    stops = driver.find_elements(By.XPATH, xp_stops)
    stops_list = [stop.text[0] if stop.text else '0' for stop in stops]

    xp_dates = '//div[@class="c9L-i"]'
    dates = driver.find_elements(By.XPATH, xp_dates)
    dates_list = [value.text for value in dates]

    days = [value.split()[0] for value in dates_list]
    weekdays = [value.split()[1] for value in dates_list]

    xp_airlines = '//div[@class="VY2U"]'
    airlines = driver.find_elements(By.XPATH, xp_airlines)
    airline_list = []
    times_list = []
    for air in airlines:
        split = air.text.split('\n')
        if len(split) >= 2:
            times_list.append(split[0])
            airline_list.append(split[1])
    
    # Build dataframe
    df = pd.DataFrame({
        'Date': days,
        'Weekdays': weekdays,
        'Duration': durations,
        'Cities': cities,
        'Price($)': prices_list,
        'Stops': stops_list,
        'Time': times_list,
        'Airline': airline_list,
        #'Timestamp': strftime("%Y-%m-%d %H:%M")
    })
    df = df[df['Price($)'] <= threshold_price]
    
    return df

In [None]:
def send_email_with_attachment(subject, body, to_email, excel_filename):
    from_email = "sender@gmail.com"
    password = "password"  # Use an App Password for Gmail

    msg = MIMEMultipart()
    msg['From'] = from_email
    msg['To'] = to_email
    msg['Subject'] = subject

    msg.attach(MIMEText(body, 'plain'))

    with open(excel_filename, 'rb') as attachment:
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(attachment.read())
        encoders.encode_base64(part)
        part.add_header('Content-Disposition', f'attachment; filename= {excel_filename}')
        msg.attach(part)

    try:
        server = smtplib.SMTP('smtp.gmail.com', 587)
        server.starttls()
        server.login(from_email, password)
        server.send_message(msg)
        print("Email sent successfully!")
    except Exception as e:
        print(f"Failed to send email: {e}")
    finally:
        server.quit()

# Main execution
city_from = 'GWL'
city_to = 'BLR'
start_date = datetime.now().strftime("%Y-%m-%d")  # Today's date
threshold_price=80

df = start_kayak(city_from, city_to, start_date,threshold_price)
print(df.head())

# Save to Excel
excel_filename = "flight_data.csv"
df.to_csv(excel_filename, index=False)
print(f"Data saved to {excel_filename}")

# Send email
subject = "Flight Data Report"
body = "Please find attached the flight data for your requested dates."
to_email = "reciver@gmail.com"

send_email_with_attachment(subject, body, to_email, excel_filename)

Scraping data for 2025-04-28...
Scraping Best flights...
Scraping Cheapest flights...
Scraping Quickest flights...
✅ Scraping complete.
   Date Weekdays Duration     Cities  Price($) Stops               Time  \
0  4/30      Wed   2h 45m  GWL - BLR        74     n  2:20 pm – 5:05 pm   
1  4/29      Tue   2h 45m  GWL - BLR        74     n  2:20 pm – 5:05 pm   
2  4/30      Wed   2h 45m  GWL - BLR        74     n  2:20 pm – 5:05 pm   
3  4/29      Tue   2h 45m  GWL - BLR        74     n  2:20 pm – 5:05 pm   
4  4/30      Wed   2h 45m  GWL - BLR        74     n  2:20 pm – 5:05 pm   

             Airline      Sort  
0  Air India Express      Best  
1  Air India Express      Best  
2  Air India Express  Cheapest  
3  Air India Express  Cheapest  
4  Air India Express  Quickest  
Data saved to flight_data.csv
Failed to send email: (534, b'5.7.9 Application-specific password required. For more information, go to\n5.7.9  https://support.google.com/mail/?p=InvalidSecondFactor d9443c01a7336-22db

In [15]:
driver.quit()
print("Done.")

Done.


In [16]:
df.sort_values('Price($)')

Unnamed: 0,Date,Weekdays,Duration,Cities,Price($),Stops,Time,Airline,Sort
0,4/30,Wed,2h 45m,GWL - BLR,74,n,2:20 pm – 5:05 pm,Air India Express,Best
1,4/29,Tue,2h 45m,GWL - BLR,74,n,2:20 pm – 5:05 pm,Air India Express,Best
2,4/30,Wed,2h 45m,GWL - BLR,74,n,2:20 pm – 5:05 pm,Air India Express,Cheapest
3,4/29,Tue,2h 45m,GWL - BLR,74,n,2:20 pm – 5:05 pm,Air India Express,Cheapest
4,4/30,Wed,2h 45m,GWL - BLR,74,n,2:20 pm – 5:05 pm,Air India Express,Quickest
5,4/29,Tue,2h 45m,GWL - BLR,74,n,2:20 pm – 5:05 pm,Air India Express,Quickest
