In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the function to scrape data from a given URL
def scrape_reviews(page_number):
    base_url = "https://www.airlinequality.com/airline-reviews/british-airways/page/{}/?sortby=post_date%3ADesc&pagesize=100"
    url = base_url.format(page_number)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Extract review titles
    comments = soup.find_all("h2", class_="text_header")
    comment_list = [i.text.strip().replace('"', '').replace('“', '').replace('”', '') for i in comments]

    # Extract reviewer names
    reviewer_names = soup.find_all("span", itemprop="name")
    reviewer_name_list = [i.text.strip() for i in reviewer_names]

    # Extract reviewer locations
    h3_tags = soup.find_all('h3', class_='text_sub_header userStatusWrapper')
    location_list = [h3_tag.text.split('(')[-1].split(')')[0] for h3_tag in h3_tags]

    # Extract dates of reviews
    date_of_reviews = soup.find_all("time", itemprop="datePublished")
    date_of_review_list = [y.text.strip() for y in date_of_reviews]

    # Extract trip verification details
    trip_verifications = soup.find_all("em")
    trip_verification_list = [z.text.strip() for z in trip_verifications]

    # Extract review contents
    reviewer_contents = soup.find_all(class_="text_content")
    reviewer_content_list = [div.text.split('|', 1)[-1].strip() for div in reviewer_contents]

    # Extract "Type of Traveller"
    rows = soup.find_all('tr')
    type_of_traveller_list = []
    for row in rows:
        header = row.find('td', class_='review-rating-header type_of_traveller')
        value = row.find('td', class_='review-value')
        if header and value:
            type_of_traveller_list.append(value.text.strip())

    # Extract "Seat Type"
    seat_type_list = []
    for row in rows:
        header = row.find('td', class_='review-rating-header cabin_flown')
        value = row.find('td', class_='review-value')
        if header and value:
            seat_type_list.append(value.text.strip())

    # Extract "Date Flown"
    date_flown_list = []
    for row in rows:
        header = row.find('td', class_='review-rating-header date_flown')
        value = row.find('td', class_='review-value')
        if header and value:
            date_flown_list.append(value.text.strip())

    # Extract "Recommendation"
    recommendation_list = []
    for row in rows:
        header = row.find('td', class_='review-rating-header recommended')
        value = row.find('td', class_='review-value')
        if header and value:
            recommendation_list.append(value.text.strip())
    
    # Combine all data into a list of dictionaries
    reviews = []
    for i in range(len(comment_list)):  # Use the length of the comments list
        reviews.append({
            "Comment": comment_list[i],
            "Reviewer Name": reviewer_name_list[i] if i < len(reviewer_name_list) else None,
            "Location": location_list[i] if i < len(location_list) else None,
            "Date of Review": date_of_review_list[i] if i < len(date_of_review_list) else None,
            "Trip Verification": trip_verification_list[i] if i < len(trip_verification_list) else None,
            "Review Content": reviewer_content_list[i] if i < len(reviewer_content_list) else None,
            "Type of Traveller": type_of_traveller_list[i] if i < len(type_of_traveller_list) else None,
            "Seat Type": seat_type_list[i] if i < len(seat_type_list) else None,
            "Date Flown": date_flown_list[i] if i < len(date_flown_list) else None,
            "Recommendation": recommendation_list[i] if i < len(recommendation_list) else None
        })
    
    return reviews

# Loop through pages and collect data
all_reviews = []
for page in range(1, 41):
    print(f"Scraping page {page}...")
    all_reviews.extend(scrape_reviews(page))  # Add reviews from each page to the list

# Create a DataFrame
df = pd.DataFrame(all_reviews)

# Save to a CSV file (optional)
#df.to_csv("british_airways_reviews.csv", index=False)

# Display the first few rows of the DataFrame
print(df.head())

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
                                Comment           Reviewer Name  \
0                   a national disgrace          Scott Fletcher   
1  sheer neglect in keeping us informed           William Cowie   
2       

In [2]:
df

Unnamed: 0,Comment,Reviewer Name,Location,Date of Review,Trip Verification,Review Content,Type of Traveller,Seat Type,Date Flown,Recommendation
0,a national disgrace,Scott Fletcher,United Kingdom,17th December 2024,Trip Verified,On a recent flight from Cyprus BA621 on 23/11/...,Family Leisure,Economy Class,November 2024,no
1,sheer neglect in keeping us informed,William Cowie,Australia,17th December 2024,Trip Verified,Flight BA 0560 arrived in Rome on 11 December ...,Couple Leisure,Economy Class,December 2024,no
2,a huge disappointment,Danilo Queiroz Palermo,United States,14th December 2024,Trip Verified,This was the first time I flew British Airways...,Business,Business Class,December 2024,no
3,we had to bus to the aircraft,J Meares,United Kingdom,13th December 2024,Trip Verified,Pretty good flight but still some small things...,Solo Leisure,Business Class,December 2024,yes
4,victims of their new supper service,P Gough,United Kingdom,12th December 2024,Trip Verified,"Check in was fine, but no priority/fast track ...",Solo Leisure,First Class,November 2024,yes
...,...,...,...,...,...,...,...,...,...,...
3898,British Airways customer review,C Mcculloch,United Kingdom,29th August 2012,,Flight from Heathrow to Toronto. Booked emerge...,,Economy Class,,no
3899,British Airways customer review,Nick Berry,United Kingdom,28th August 2012,,LHR to HAM. Purser addresses all club passenge...,,Business Class,,yes
3900,British Airways customer review,Avril Barclay,United Kingdom,12th October 2011,,My son who had worked for British Airways urge...,,Economy Class,,yes
3901,British Airways customer review,C Volz,United States,11th October 2011,,London City-New York JFK via Shannon on A318 b...,,Premium Economy,,no


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3903 entries, 0 to 3902
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Comment            3903 non-null   object
 1   Reviewer Name      3903 non-null   object
 2   Location           3903 non-null   object
 3   Date of Review     3903 non-null   object
 4   Trip Verification  1589 non-null   object
 5   Review Content     3903 non-null   object
 6   Type of Traveller  3132 non-null   object
 7   Seat Type          3901 non-null   object
 8   Date Flown         3125 non-null   object
 9   Recommendation     3903 non-null   object
dtypes: object(10)
memory usage: 305.1+ KB


In [4]:
df.isnull().sum()

Comment                 0
Reviewer Name           0
Location                0
Date of Review          0
Trip Verification    2314
Review Content          0
Type of Traveller     771
Seat Type               2
Date Flown            778
Recommendation          0
dtype: int64

## 1. CLEANING DATASET

In [5]:
df.columns

Index(['Comment', 'Reviewer Name', 'Location', 'Date of Review',
       'Trip Verification', 'Review Content', 'Type of Traveller', 'Seat Type',
       'Date Flown', 'Recommendation'],
      dtype='object')

In [6]:
print(f"df shape = {df.shape}")

df shape = (3903, 10)


In [7]:
df.duplicated().sum() #Nine duplicates present

4

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download necessary resources
nltk.download('vader_lexicon')

# Initialize Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Perform sentiment analysis
df['Sentiment'] = df['Review Content'].apply(lambda text: sia.polarity_scores(text)['compound'])
print(df.head())

[nltk_data] Downloading package vader_lexicon to C:\Users\Alimi
[nltk_data]     Nimotalahi\AppData\Roaming\nltk_data...


                                Comment           Reviewer Name  \
0                   a national disgrace          Scott Fletcher   
1  sheer neglect in keeping us informed           William Cowie   
2                 a huge disappointment  Danilo Queiroz Palermo   
3         we had to bus to the aircraft                J Meares   
4   victims of their new supper service                 P Gough   

         Location      Date of Review Trip Verification  \
0  United Kingdom  17th December 2024     Trip Verified   
1       Australia  17th December 2024     Trip Verified   
2   United States  14th December 2024     Trip Verified   
3  United Kingdom  13th December 2024     Trip Verified   
4  United Kingdom  12th December 2024     Trip Verified   

                                      Review Content Type of Traveller  \
0  On a recent flight from Cyprus BA621 on 23/11/...    Family Leisure   
1  Flight BA 0560 arrived in Rome on 11 December ...    Couple Leisure   
2  This was the firs

In [9]:
df

Unnamed: 0,Comment,Reviewer Name,Location,Date of Review,Trip Verification,Review Content,Type of Traveller,Seat Type,Date Flown,Recommendation,Sentiment
0,a national disgrace,Scott Fletcher,United Kingdom,17th December 2024,Trip Verified,On a recent flight from Cyprus BA621 on 23/11/...,Family Leisure,Economy Class,November 2024,no,-0.6059
1,sheer neglect in keeping us informed,William Cowie,Australia,17th December 2024,Trip Verified,Flight BA 0560 arrived in Rome on 11 December ...,Couple Leisure,Economy Class,December 2024,no,-0.8957
2,a huge disappointment,Danilo Queiroz Palermo,United States,14th December 2024,Trip Verified,This was the first time I flew British Airways...,Business,Business Class,December 2024,no,-0.9716
3,we had to bus to the aircraft,J Meares,United Kingdom,13th December 2024,Trip Verified,Pretty good flight but still some small things...,Solo Leisure,Business Class,December 2024,yes,0.9972
4,victims of their new supper service,P Gough,United Kingdom,12th December 2024,Trip Verified,"Check in was fine, but no priority/fast track ...",Solo Leisure,First Class,November 2024,yes,0.9808
...,...,...,...,...,...,...,...,...,...,...,...
3898,British Airways customer review,C Mcculloch,United Kingdom,29th August 2012,,Flight from Heathrow to Toronto. Booked emerge...,,Economy Class,,no,-0.6562
3899,British Airways customer review,Nick Berry,United Kingdom,28th August 2012,,LHR to HAM. Purser addresses all club passenge...,,Business Class,,yes,0.8720
3900,British Airways customer review,Avril Barclay,United Kingdom,12th October 2011,,My son who had worked for British Airways urge...,,Economy Class,,yes,0.4516
3901,British Airways customer review,C Volz,United States,11th October 2011,,London City-New York JFK via Shannon on A318 b...,,Premium Economy,,no,0.9148


In [10]:
# Save to a CSV file (optional)
df.to_csv("british_airways_reviews_final.csv", index=False)