In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import logging
from collections import defaultdict

In [2]:
# Scrape airline reviews

# Emit INFO-and-above records, each prefixed with a timestamp and severity,
# so scraping progress and request failures are visible while the scraper runs
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class AirlineReviewScraper:
    """Scrape airline reviews from airlinequality.com and export them as one CSV.

    Every review found on a page becomes exactly one row: each column in
    ``self.data`` and each feature column in ``self.ratings_data`` receives one
    value per review (a default when the review lacks that field), so values
    stored at the same index always belong to the same review.
    """

    # Header text in a review's details table -> column name in ``self.data``
    _DETAIL_HEADERS = {
        'Type Of Traveller': 'Type of Traveller',
        'Seat Type': 'Class',
        'Date Flown': 'Date Flown',
    }
    # Columns of ``self.data`` that ``extract_review_details`` is responsible for
    _DETAIL_COLUMNS = ('Type of Traveller', 'Date Flown', 'Class', 'Recommended')

    def __init__(self, airlines_input, page_size=100, page_count=30):
        """Set up empty result columns.

        Args:
            airlines_input: Iterable of airline codes as used in the site's URLs.
            page_size: Number of reviews requested per page.
            page_count: Maximum number of pages requested per airline.
        """
        # User-specified list of airlines to scrape
        self.airlines_input = airlines_input
        self.page_size = page_size  # Number of reviews per page
        self.page_count = page_count  # Number of pages to scrape per airline

        # Mapping airline code (used in URLs) to full airline names
        self.airline_mapping = {
            'singapore-airlines': 'Singapore Airlines',
            'qatar-airways': 'Qatar Airways',
            'ana-all-nippon-airways': 'All Nippon Airways',
            'emirates': 'Emirates',
            'japan-airlines': 'Japan Airlines',
            'turkish-airlines': 'Turkish Airlines',
            'air-france': 'Air France',
            'cathay-pacific-airways': 'Cathay Pacific Airways',
            'eva-air': 'EVA Air',
            'swiss-international-air-lines': 'Swiss International Air Lines'
        }

        # Primary data dictionary for review attributes (one list entry per review)
        self.data = {
            'Airline': [],
            'Verified': [],
            'Reviews': [],
            'Type of Traveller': [],
            'Date Flown': [],
            'Class': [],
            'Overall Rating': [],
            'Recommended': []
        }

        # Ratings for specific features stored separately (one list entry per review)
        self.ratings_data = defaultdict(list)
        self.features = ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages',
                          'Inflight Entertainment', 'Ground Service', 'Value For Money']

        # Default fill values to handle incomplete records
        self.fill_defaults = {
            'Overall Rating': np.nan,
            'Verified': None,
            'Type of Traveller': None,
            'Class': None,
            'Date Flown': None,
            'Recommended': None
        }

    def fill_with_value(self, lst, length, value=np.nan):
        """Pad ``lst`` in place with ``value`` until it holds ``length`` items.

        A list that is already ``length`` items long or longer is left unchanged.
        """
        lst.extend([value] * (length - len(lst)))

    @staticmethod
    def _parse_recommended(value_text):
        """Map the 'Recommended' cell text to True / False, or None if unrecognised."""
        val = value_text.strip().lower()
        if val in {'yes', 'recommended'}:
            return True
        if val in {'no', 'not recommended'}:
            return False
        return None

    def _append_ratings(self, ratings):
        """Append one star-rating row; headers absent from ``ratings`` get NaN."""
        rows_so_far = max(
            (len(self.ratings_data.setdefault(name, [])) for name in self.features),
            default=0,
        )
        # Snapshot the names first: the loop below may add keys to ratings_data
        names = list(dict.fromkeys([*self.features, *self.ratings_data, *ratings]))
        for name in names:
            column = self.ratings_data.setdefault(name, [])
            # A header seen for the first time is back-filled so it lines up
            # with the rows recorded before it appeared
            self.fill_with_value(column, rows_so_far, np.nan)
            column.append(ratings.get(name, np.nan))

    def extract_review_details(self, soup):
        """Record the details table of ONE review as one aligned row.

        ``soup`` should be the markup of a single review. Exactly one value is
        appended to each of the 'Type of Traveller', 'Date Flown', 'Class' and
        'Recommended' columns and to every star-rating column; fields missing
        from the review receive their default (``fill_defaults`` / NaN).
        Appending once per review, instead of once per table row found, is
        what keeps the columns aligned when a review omits a field.
        """
        details = {}
        ratings = {}
        for row in soup.find_all('tr'):
            header = row.find('td', class_='review-rating-header')
            if header is None:
                continue
            header_text = header.text.strip()
            value = row.find('td', class_='review-value')
            star = row.find('td', class_='review-rating-stars')

            if value is not None:
                value_text = value.text.strip()

                # Placeholder value carried over from the original scraper:
                # a 'Dec-24' flight date is treated as missing
                if header_text == 'Date Flown' and value_text == 'Dec-24':
                    continue

                if header_text in self._DETAIL_HEADERS:
                    details.setdefault(self._DETAIL_HEADERS[header_text], value_text)
                elif header_text == 'Recommended':
                    details.setdefault('Recommended', self._parse_recommended(value_text))

            # Count filled stars to determine the rating for this header
            if star is not None:
                ratings.setdefault(
                    header_text, len(star.find_all('span', class_='star fill')))

        for column in self._DETAIL_COLUMNS:
            self.data[column].append(
                details.get(column, self.fill_defaults.get(column)))
        self._append_ratings(ratings)

    def _parse_overall_rating(self, container):
        """Return the review's overall rating as int, or NaN if absent/non-numeric."""
        rating = container.find('div', {'itemprop': 'reviewRating'})
        if rating is None:
            return np.nan
        rating_value = rating.find("span", {"itemprop": "ratingValue"})
        if rating_value is None:
            return np.nan
        try:
            return int(rating_value.get_text())
        except ValueError:
            return np.nan

    def _store_page(self, soup, airline_name):
        """Append one row per review body on the page; return how many were found."""
        bodies = soup.find_all("div", {"class": "text_content"})
        for body in bodies:
            # NOTE(review): assumes each review is wrapped in its own <article>
            # that also holds its rating and details table - confirm against
            # the live markup. Falls back to the direct parent otherwise.
            container = body.find_parent('article')
            if container is None:
                container = body.parent

            self.data['Airline'].append(airline_name)
            self.data['Reviews'].append(body.get_text())

            # The verified badge is looked up inside this review's own body so
            # that a review without a badge cannot shift later values
            badge = body.find('em')
            self.data['Verified'].append(
                badge.get_text() if badge is not None
                else self.fill_defaults.get('Verified'))

            self.data['Overall Rating'].append(self._parse_overall_rating(container))

            # Detailed review information (e.g. seat comfort, date flown)
            self.extract_review_details(container)
        return len(bodies)

    def scrape_airline(self, airline_code):
        """Scrape up to ``page_count`` review pages for one airline.

        Failed requests are logged and skipped. Pagination stops at the first
        page that contains no reviews.
        """
        website = f'https://www.airlinequality.com/airline-reviews/{airline_code}'
        # Unknown codes keep the code itself so their rows stay attributable
        airline_name = self.airline_mapping.get(airline_code, airline_code)
        rows_before = len(self.data['Reviews'])

        for i in range(1, self.page_count + 1):
            logging.info(f"Scraping data from {airline_code} Page {i}")
            url = f"{website}/page/{i}/?sortby=post_date%3ADesc&pagesize={self.page_size}"
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()  # Raise exception for failed requests
            except requests.exceptions.RequestException as e:
                logging.error(f"Failed to scrape {airline_code} Page {i}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            if self._store_page(soup, airline_name) == 0:
                logging.info(
                    f"No reviews found for {airline_code} on Page {i}; stopping pagination")
                break

        # Report this airline's own count, not the running total of all airlines
        scraped = len(self.data['Reviews']) - rows_before
        logging.info(f"Total records scraped for {airline_code}: {scraped}")

    def equalize_lengths(self, target_length):
        """Force every column to hold exactly ``target_length`` entries.

        Short columns are padded with their default; longer columns are cut
        back with a warning, because a single over-long column makes the
        DataFrame constructor in ``save_to_csv`` raise.
        """
        target_length = max(target_length, 0)
        columns = [(key, column, self.fill_defaults.get(key, np.nan))
                   for key, column in self.data.items()]
        rating_names = list(dict.fromkeys([*self.features, *self.ratings_data]))
        columns += [(name, self.ratings_data.setdefault(name, []), np.nan)
                    for name in rating_names]

        for name, column, default in columns:
            if len(column) > target_length:
                logging.warning(
                    f"Column '{name}' has {len(column)} values; "
                    f"truncating to {target_length}")
                del column[target_length:]
            else:
                self.fill_with_value(column, target_length, default)

    def save_to_csv(self, filename='airlines_reviews.csv'):
        """Convert the scraped columns to a DataFrame, clean it, and write ``filename``.

        Rows lacking a review text, overall rating or recommendation are dropped.
        """
        # Guarantee equal-length columns even when called without run()
        self.equalize_lengths(len(self.data['Reviews']))

        df = pd.DataFrame({
            **self.data,
            'Seat Comfort': self.ratings_data['Seat Comfort'],
            'Staff Service': self.ratings_data['Cabin Staff Service'],
            'Ground Service': self.ratings_data['Ground Service'],
            'Food & Beverages': self.ratings_data['Food & Beverages'],
            'Inflight Entertainment': self.ratings_data['Inflight Entertainment'],
            'Value For Money': self.ratings_data['Value For Money']
        })

        # Clean up and standardize values
        df['Reviews'] = df['Reviews'].replace('None', np.nan)
        df.dropna(subset=['Reviews', 'Overall Rating', 'Recommended'], inplace=True)
        df['Verified'] = df['Verified'].replace({'Trip Verified': True, 'Not Verified': False})
        df['Overall Rating'] = pd.to_numeric(df['Overall Rating'], errors='coerce')

        df.to_csv(filename, index=False)
        logging.info(f"Data saved to '{filename}'")

    def run(self):
        """Scrape every airline in ``airlines_input``, then write the CSV."""
        for airline_code in self.airlines_input:
            self.scrape_airline(airline_code)
            self.equalize_lengths(len(self.data['Reviews']))
        self.save_to_csv()

# Script entry point: scrape every airline listed below and write the CSV
if __name__ == "__main__":
    airlines = [
        'singapore-airlines',
        'qatar-airways',
        'ana-all-nippon-airways',
        'emirates',
        'japan-airlines',
        'turkish-airlines',
        'air-france',
        'cathay-pacific-airways',
        'eva-air',
        'swiss-international-air-lines',
    ]
    # Kept as a module-level name so it stays inspectable after the run
    scraper = AirlineReviewScraper(airlines)
    scraper.run()


2025-08-05 16:14:58,781 - INFO - Scraping data from singapore-airlines Page 1
2025-08-05 16:15:01,336 - INFO - Scraping data from singapore-airlines Page 2
2025-08-05 16:15:03,301 - INFO - Scraping data from singapore-airlines Page 3
2025-08-05 16:15:05,326 - INFO - Scraping data from singapore-airlines Page 4
2025-08-05 16:15:06,952 - INFO - Scraping data from singapore-airlines Page 5
2025-08-05 16:15:08,455 - INFO - Scraping data from singapore-airlines Page 6
2025-08-05 16:15:09,865 - INFO - Scraping data from singapore-airlines Page 7
2025-08-05 16:15:11,625 - INFO - Scraping data from singapore-airlines Page 8
2025-08-05 16:15:13,562 - INFO - Scraping data from singapore-airlines Page 9
2025-08-05 16:15:15,148 - INFO - Scraping data from singapore-airlines Page 10
2025-08-05 16:15:16,872 - INFO - Scraping data from singapore-airlines Page 11
2025-08-05 16:15:18,428 - INFO - Scraping data from singapore-airlines Page 12
2025-08-05 16:15:20,064 - INFO - Scraping data from singapore