In [1]:
import requests
from bs4 import BeautifulSoup

# Constants
# Root URL of the British Airways review listing; page URLs are built from it.
BASE_URL = "https://www.airlinequality.com/airline-reviews/british-airways"
# Number of listing pages to scrape (pages 1 through PAGES inclusive).
PAGES = 1
# Value sent as the `pagesize` query parameter of each page request.
PAGE_SIZE = 2
# Verification-badge strings. Not referenced by any code visible in this file;
# presumably meant to be stripped from review text - TODO confirm or remove.
REMOVE_TEXT = ['✅ Trip Verified', 'Not Verified |']

# Function to get data from a user review based on header passed
def get_data_from_review(review, header):
    """Return the value of the review detail row labelled ``header``.

    The rows are read the same way ``parse_page`` reads them: each ``<tr>``
    carries a ``td.review-rating-header`` label cell followed by either a
    ``td.review-rating-stars`` cell or a ``td.review-value`` cell.

    Args:
        review: Parsed element exposing ``find_all``/``find`` (a
            BeautifulSoup tag) that contains the review's table rows.
        header: Label of the wanted row. Matching ignores case, surrounding
            whitespace and the space/underscore distinction, so
            ``"Seat Comfort"`` and ``"seat_comfort"`` select the same row.

    Returns:
        The number of ``span`` elements with class ``star fill`` for a star
        row, the stripped cell text for a plain value row, or ``None`` when
        no row matches or the matching row has no value cell.
    """
    # Normalise exactly like parse_page does, so both agree on row names.
    wanted = str(header).strip().lower().replace(" ", "_")

    for row in review.find_all('tr'):
        header_td = row.find('td', {'class': 'review-rating-header'})
        if header_td is None:
            # Rows without a label cell cannot be matched by header.
            continue
        label = header_td.get_text(strip=True).lower().replace(" ", "_")
        if label != wanted:
            continue

        star_td = row.find('td', {'class': 'review-rating-stars'})
        if star_td is not None:
            return len(star_td.find_all('span', {'class': 'star fill'}))

        value_td = row.find('td', {'class': 'review-value'})
        if value_td is None:
            return None
        # Kept from the original: a value cell may itself be marked "stars".
        if 'stars' in value_td.get('class', []):
            return len(value_td.find_all('span', {'class': 'star fill'}))
        return value_td.get_text(strip=True)

    return None

# Function to parse and extract data from a page
def parse_page(url):
    """Download one review-listing page and split each review into records.

    Every ``<article itemprop="review">`` on the page yields one dict per
    output list. Table rows inside the article are keyed by their header
    cell text, lower-cased with spaces replaced by underscores.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A 5-tuple ``(airlines, aircraft, travellers, seats, reviews)`` of
        lists of dicts, each holding one entry per review found on the page.
        Every parsed row is stored in the review dict under its normalised
        header; aircraft, route, date flown, traveller type and seat type
        are additionally copied into their dedicated dicts.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    print(url)
    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Initialize containers for each table
    airlines_data = []
    aircraft_data = []
    travellers_data = []
    seats_data = []
    reviews_data = []

    for review_section in soup.find_all("article", {"itemprop": "review"}):
        airline = {'airline_name': 'British Airways'}  # Hard-coded: BASE_URL targets one airline
        aircraft = {'aircraft': None}
        traveller = {'type_of_traveller': None, 'route': None, 'date_flown': None}
        seat = {'seat_type': None}
        review = {
            'seat_comfort': None,
            'cabin_staff_service': None,
            'food_&_beverages': None,
            'Inflight_Entertainment': None,
            'ground_service': None,
            'wifi_&_connectivity': None,
            'value_for_money': None,
            'recommended': None,
            'review_text': None,
            'review_rating': None
        }

        # Headers whose value is also copied into a dedicated record.
        detail_targets = {
            'aircraft': aircraft,
            'route': traveller,
            'date_flown': traveller,
            'type_of_traveller': traveller,
            'seat_type': seat,
        }

        # Extracting review rating from review
        review_rating = review_section.find('span', itemprop='ratingValue')
        if review_rating:
            review['review_rating'] = review_rating.get_text(strip=True)

        # Extracting review text from review
        review_text = review_section.find('div', class_='text_content')
        if review_text:
            review['review_text'] = review_text.get_text(strip=True)

        # Populate the review object with data
        for tr in review_section.find_all('tr'):
            header_td = tr.find('td', class_='review-rating-header')
            if header_td is None:
                # A row without a label cell cannot be attributed to a field.
                continue
            header = header_td.get_text(strip=True).lower().replace(" ", "_")
            value_td = tr.find('td', class_='review-value')
            star_td = tr.find('td', class_='review-rating-stars')

            if star_td is not None and 'stars' in star_td.get('class', []):
                # Star rows are stored as the count of filled stars.
                review[header] = len(star_td.find_all('span', {'class': 'star fill'}))
            elif value_td is not None:
                review[header] = value_td.get_text(strip=True)

            # Only the row's own header decides where a value goes; a rating
            # row must never leak its star count into 'recommended'.
            target = detail_targets.get(header)
            if target is not None and value_td is not None:
                target[header] = value_td.get_text(strip=True)

        # Row headers are lower-cased, so the capitalised key declared above
        # would otherwise stay None forever; mirror the parsed value onto it
        # while keeping the lower-case key for existing consumers.
        if 'inflight_entertainment' in review:
            review['Inflight_Entertainment'] = review['inflight_entertainment']

        # Append to respective lists
        airlines_data.append(airline)
        aircraft_data.append(aircraft)
        travellers_data.append(traveller)
        seats_data.append(seat)
        reviews_data.append(review)

    return airlines_data, aircraft_data, travellers_data, seats_data, reviews_data

# Scraping reviews
# One accumulator per table; each page's rows are appended in page order.
all_airlines, all_aircraft, all_travellers = [], [], []
all_seats, all_reviews = [], []

for i in range(1, PAGES + 1):
    # Newest reviews first, PAGE_SIZE reviews per request.
    url = f"{BASE_URL}/page/{i}/?sortby=post_date%3ADesc&pagesize={PAGE_SIZE}"
    airlines, aircraft, travellers, seats, reviews = parse_page(url)

    # In-place concatenation: the accumulators keep their identity.
    all_airlines += airlines
    all_aircraft += aircraft
    all_travellers += travellers
    all_seats += seats
    all_reviews += reviews

print(all_reviews)

# all_airlines, all_aircraft, all_travellers, all_seats and all_reviews now
# hold the rows extracted from every scraped page.


https://www.airlinequality.com/airline-reviews/british-airways/page/1/?sortby=post_date%3ADesc&pagesize=2
[{'seat_comfort': 1, 'cabin_staff_service': 4, 'food_&_beverages': 1, 'Inflight_Entertainment': None, 'ground_service': 3, 'wifi_&_connectivity': None, 'value_for_money': 4, 'recommended': 'no', 'review_text': '✅Trip Verified| Not a great experience. I could not check in online as two separate parts of the BA computer system do not talk to each other. At LHR a delightful check in clerk sorted it quickly, manually! We boarded on time, but luggage loading had to be halted as the Captain spotted a technical issue and an engineer was called. By the time it was sorted we had missed our slot so took off an hour behind schedule. The flight was uneventful, but the plane had clearly seen better days, legroom was appalling and the handout of a small bottle of water and a soggy cookie made one wish for the days when flying BA meant something special. Next time it will be Jet2.', 'review_ratin