In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import itertools

# Let pandas display up to 1000 rows of a DataFrame before truncating the output.
pd.set_option('display.max_rows', 1000)

In [3]:
class WebScraper:
    """Download and parse one listing page of British Airways reviews.

    Creating an instance performs the HTTP request for the given page of
    airlinequality.com and extracts the raw review statistics from it. Use
    ``get_reviews`` to obtain the statistics as a DataFrame together with the
    review text elements and the per-review overall ratings.

    Attributes:
        the_main_link: URL of the listing page that was requested.
        webpage_response: The ``requests`` response for that URL.
        webpage: The response body parsed by BeautifulSoup with ``lxml``.
        review_data: Text of every "review-value", "review-rating-header" and
            "star fill" element on the page, minus the leading summary block.
    """

    # Listing URL; ``{page}`` is replaced with the requested page number.
    URL_TEMPLATE = (
        "https://www.airlinequality.com/airline-reviews/british-airways/page/"
        "{page}/?sortby=post_date%3ADesc&pagesize=100"
    )

    # The first 20 scraped elements are the airline-wide totals, not a review.
    SUMMARY_ITEMS = 20

    # Columns that always exist in the statistics frame, in display order.
    STAT_COLUMNS = (
        "Aircraft", "Type Of Traveller", "Seat Type", "Route", "Date Flown",
        "Seat Comfort", "Cabin Staff Service", "Food & Beverages", "Inflight Entertainment",
        "Ground Service", "Wifi & Connectivity", "Value For Money", "Recommended",
    )

    # "Recommended" is the last field of every review, so it closes a row.
    REVIEW_END_KEY = "Recommended"

    def __init__(self, page_number, timeout=30):
        """Fetch and parse the listing page ``page_number``.

        Args:
            page_number: Page of the review listing to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within ``timeout``.
        """
        self.the_main_link = self.URL_TEMPLATE.format(page=page_number)
        self.webpage_response = requests.get(self.the_main_link, timeout=timeout)
        # Fail loudly instead of silently parsing an error page.
        self.webpage_response.raise_for_status()
        self.webpage = BeautifulSoup(self.webpage_response.content, "lxml")

        # Review stats mix nominal data (flight info) and ordinal data (stars).
        stat_tags = self.webpage.find_all(
            class_=["review-value", "review-rating-header", "star fill"]
        )
        self.review_data = [tag.text for tag in stat_tags][self.SUMMARY_ITEMS:]

    def getting_the_stars(self):
        """Collapse each run of filled-star entries into a single rating.

        Every filled star appears in ``review_data`` as its own one-digit
        string. Only the last star of a run matters (it is the rating of the
        preceding heading), so each run is replaced by that value as an int.
        All other entries are passed through unchanged.

        Returns:
            A flat list alternating between headings and their values, where
            star ratings are ints and everything else is the original text.
        """
        grouped = []
        last_star = None  # Highest star seen in the run currently being read.
        for item in self.review_data:
            # One-character non-digits (e.g. "?") are ordinary values, not stars.
            if len(item) == 1 and item.isdecimal():
                last_star = int(item)
                continue
            if last_star is not None:
                grouped.append(last_star)
                last_star = None
            grouped.append(item)

        # A run of stars at the very end has no following text to flush it.
        if last_star is not None:
            grouped.append(last_star)
        return grouped

    def putting_into_df(self, data):
        """Turn the flat heading/value list into one DataFrame row per review.

        Even positions of ``data`` are headings and odd positions are their
        values. A row is completed whenever the "Recommended" heading has been
        read; pairs after the last "Recommended" are discarded, as is a
        trailing heading without a value.

        Args:
            data: Flat list as returned by ``getting_the_stars``.

        Returns:
            A DataFrame whose columns are the known statistics (always
            present, in a fixed order) followed by any unexpected headings in
            order of first appearance. Fields missing from a review are NaN.

        Raises:
            ValueError: If a heading occurs twice within one review, which
                means the heading/value pairs are misaligned.
        """
        columns = list(self.STAT_COLUMNS)
        records = []
        current = {}  # Fields of the review currently being assembled.
        repeated = []  # Headings seen more than once in the current review.

        for key, value in zip(data[0::2], data[1::2]):
            if key in current:
                repeated.append(key)
            current[key] = value

            if key == self.REVIEW_END_KEY:
                if repeated:
                    raise ValueError(
                        f"Review contains repeated field(s) {repeated!r}; "
                        "the scraped heading/value pairs are misaligned."
                    )
                for name in current:
                    if name not in columns:
                        columns.append(name)
                records.append(current)
                current = {}

        # Build the frame once; concatenating inside the loop is quadratic.
        return pd.DataFrame(records, columns=columns)

    def get_reviews(self):
        """Return the statistics frame, review text elements and ratings.

        Returns:
            A tuple ``(stats_df, text_content, overall_rating)`` where
            ``stats_df`` is the result of ``putting_into_df``,
            ``text_content`` is the list of parsed elements with class
            "text_content", and ``overall_rating`` is the text of every
            ``<span itemprop="ratingValue">`` except the first one
            (presumably the airline-wide score - TODO confirm).
        """
        stats_df = self.putting_into_df(self.getting_the_stars())
        text_content = self.webpage.find_all(class_="text_content")
        rating_tags = self.webpage.find_all("span", attrs={"itemprop": "ratingValue"})
        overall_rating = [tag.text for tag in rating_tags[1:]]
        return stats_df, text_content, overall_rating

# Scrape the first `pages` listing pages and assemble everything into `scraped_df`.
pages = 20 # Getting 20 pages as we start to get missing values after that. It should be enough.
count = 0
page_frames = []     # One statistics DataFrame per scraped page.
just_text = []       # One list of cleaned review texts per scraped page.
overall_rating = []  # One list of overall ratings per scraped page.

# Looping through the pages and scraping the data from each page.
for page in range(1, pages + 1):
    webscrape = WebScraper(page)
    df, reviews, rating = webscrape.get_reviews()
    # Keep each page's frame and combine them once after the loop; growing a
    # DataFrame with concat on every iteration re-copies all earlier rows.
    page_frames.append(df)
    just_text.append([review.text.replace("\n", "") for review in reviews])
    overall_rating.append(rating)
    count += 1
    print(f"{count} pages scraped")

# pd.concat rejects an empty list, so fall back to an empty frame.
scraped_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# itertools.chain.from_iterable flattens the per-page lists into one list.
removing_embed = list(itertools.chain.from_iterable(just_text))
scraped_df["Reviews"] = removing_embed
scraped_df["Trip Verified"] = scraped_df["Reviews"].apply(
    lambda x: "Trip Verified" in x or "Verified Review" in x
)

# Adding the overall rating to the dataframe.
overall_rating = list(itertools.chain.from_iterable(overall_rating))
scraped_df["Overall Rating"] = overall_rating


1 pages scraped
2 pages scraped
3 pages scraped
4 pages scraped
5 pages scraped
6 pages scraped
7 pages scraped
8 pages scraped
9 pages scraped
10 pages scraped
11 pages scraped
12 pages scraped
13 pages scraped
14 pages scraped
15 pages scraped
16 pages scraped
17 pages scraped
18 pages scraped
19 pages scraped
20 pages scraped


In [4]:
# scraped_df.to_csv("BA_reviews.csv")