In [1]:
# function and script for getting refreshed data from airline quality/skytrax
# The focus will be on getting recent data however this could be updated to scrape all data

In [2]:
import re #Regular expressions
from bs4 import BeautifulSoup # a python HTML parser
import requests
import string
import pandas as pd

In [3]:
# The first step will be to get a list of all the airlines using the a-z airlines reviews page
url = "http://www.airlinequality.com/review-pages/a-z-airline-reviews/"

In [4]:
r = requests.get(url)
r.status_code

200

200 status means successful. So far so good. 

In [5]:
# Now use beautiful soup to pull in data (make soup)
soup = BeautifulSoup(r.content, 'html.parser')

In [6]:
# find the IDs we are interested in - this lookup is just the "B" airlines (id "a2z-ldr-B"), so a loop over letters will be needed, but let's take a look at one letter to start
col_groups = soup.find(id="a2z-ldr-B")

In [8]:
col_groups.prettify()

'<div class="content " id="a2z-ldr-B">\n <div class="a_z_col_group">\n  <ul class="items">\n   <li>\n    <a href="/airline-reviews/ba-cityflyer">\n     BA CityFlyer\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/badr-airlines">\n     Badr Airlines\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/bahamasair">\n     Bahamasair\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/bangkok-airways">\n     Bangkok Airways\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/bassaka-air">\n     Bassaka Air\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/batik-air">\n     Batik Air\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/beijing-capital-airlines">\n     Beijing Capital Airlines\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/belavia">\n     Belavia\n    </a>\n   </li>\n   <li>\n    <a href="/airline-reviews/berjaya-air">\n     Berjaya Air\n    </a>\n   </li>\n  </ul>\n  <ul class="items">\n   <li>\n    <a href="/

This pulls a lot more than we need, but gets everything we're looking for from Ba to Bz. Now we just need to extract the site-relative link for each airline, which is stored as href="/airline-reviews/airline-name"

In [9]:
# every "B" airline: print its href path, then its display name, one per line
for anchor in col_groups.find_all('a', href=True):
    print(anchor['href'], anchor.get_text(), sep="\n")

/airline-reviews/ba-cityflyer
BA CityFlyer
/airline-reviews/badr-airlines
Badr Airlines
/airline-reviews/bahamasair
Bahamasair
/airline-reviews/bangkok-airways
Bangkok Airways
/airline-reviews/bassaka-air
Bassaka Air
/airline-reviews/batik-air
Batik Air
/airline-reviews/beijing-capital-airlines
Beijing Capital Airlines
/airline-reviews/belavia
Belavia
/airline-reviews/berjaya-air
Berjaya Air
/airline-reviews/bh-airlines
BH Air
/airline-reviews/bhutan-airlines
Bhutan Airlines
/airline-reviews/biman-bangladesh
Biman Bangladesh Airlines
/airline-reviews/binter-canarias
Binter Canarias
/airline-reviews/bluexpress
Blu-express
/airline-reviews/blue-air
Blue Air
/airline-reviews/blue-islands
Blue Islands
/airline-reviews/blue-panorama-airlines
Blue Panorama Airlines
/airline-reviews/blue1
Blue1
/airline-reviews/bmi-regional
bmi Regional
/airline-reviews/boliviana-de-aviacin
Boliviana de Aviacion
/airline-reviews/borajet
BoraJet
/airline-reviews/boutique-air
Boutique Air
/airline-reviews/briti

In [9]:
'''
Function to loop through the alphabet and return a list of all airline references. 
The function is pretty rigid to start. May revisit to add some additional flexibility
'''

import re #Regular expressions
from bs4 import BeautifulSoup # a python HTML parser
import requests
import string

def get_airlines():
    """Scrape the A-Z airline reviews index and return every airline link and name.

    Requests the airlinequality.com a-z-airline-reviews page and, for each
    uppercase letter, reads the anchors inside the element whose id is
    ``a2z-ldr-<LETTER>``. Letters that have no such element on the page are
    skipped instead of raising.

    Returns:
        tuple: ``(hrefs, airlines)`` - two parallel lists. ``hrefs[i]`` is the
        site-relative link (e.g. ``/airline-reviews/adria-airways``) and
        ``airlines[i]`` is the anchor text of the same entry. Both lists are
        empty when the index page does not answer with status 200.
    """
    # set url to airline reviews a-z review page
    url = "http://www.airlinequality.com/review-pages/a-z-airline-reviews/"

    # request the url
    r = requests.get(url)

    hrefs = []
    airlines = []

    # anything other than 200 means we did not get the index page
    if r.status_code != 200:
        return hrefs, airlines

    # Now use beautiful soup to pull in data (make soup)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Walk every uppercase letter; find() returns None when the page has no
    # section for a letter, so those letters are simply skipped.
    for letter in string.ascii_uppercase:
        col_groups = soup.find(id="a2z-ldr-" + letter)
        if col_groups is None:
            continue

        # each anchor carries the link (href) and the airline's display name
        for a in col_groups.find_all('a', href=True):
            hrefs.append(a['href'])
            airlines.append(a.get_text())

    return hrefs, airlines

In [10]:
airlines = get_airlines()

In [11]:
print(airlines[0][0])
print(airlines[1][0])

/airline-reviews/adria-airways
Adria Airways


In [13]:
print(len(airlines[0]))
print(len(airlines[1]))

468
468


In [14]:
'''
The next step will be to iterate through each href/hyperlink 
'''
sample_url = "http://www.airlinequality.com/airline-reviews/ba-cityflyer/page/1/" 

In [15]:
r = requests.get(sample_url)
r.status_code

200

In [16]:
'''
We will be looping through pages for each airline and rather than look for how many pages 
each airline has, let's check what status is returned for an out of range page number
'''

error_url = "http://www.airlinequality.com/airline-reviews/ba-cityflyer/page/10/" 

r = requests.get(error_url)
r.status_code

200

No error status is thrown when index is out of range. This means another check will need to be made to determine when to stop pulling reviews. This will be revisited below. 

In [17]:
# Now use beautiful soup to pull in data (make soup)
r = requests.get(sample_url)
soup = BeautifulSoup(r.content, 'html.parser')

In [18]:
# find an entire review structure
review = soup.find(itemprop="review")

In [19]:
# find specifically the content section of the review
content = review.find(itemprop="reviewBody").get_text()
content

'✅ Trip Verified |  London City to Mykonos. I usually love BA Cityflyer from LCY - really good airline, nice planes and plenty of room. This trip we flew Club Europe to Mykonos - a 3hour 45min flight. The service was atrocious. There is no BA lounge at LCY so really the only differentiation is the on-board product; they should really make an effort. The crew spend most of the flight chatting in the galley. They eventually came out after 50 minutes and gave us a drink. Food was good, but they did not provide ice-cream or desert as per the menu. The CSM dropped a salad at my feet, tomato all up my legs and on my shoes without an apology. The remnants just sat there for the rest of the flight being trodden into the carpet. No tea and coffee was offered. No hot towels. In fact, there was very little engagement with any passengers. This was not a bust flight, there was no turbulence or other reason for a lack of service. It really did just appear that the crew could not be bothered. Such a 

In [20]:
# Drop the leading "<status> |  " marker: find() gives the index of '| ' and +3
# skips the bar plus the two spaces that follow it in the sample shown above.
# Caveat: when '| ' is absent find() returns -1, so the slice starts at index 2.
content[(content.find('| ')+3):] #formatting

'London City to Mykonos. I usually love BA Cityflyer from LCY - really good airline, nice planes and plenty of room. This trip we flew Club Europe to Mykonos - a 3hour 45min flight. The service was atrocious. There is no BA lounge at LCY so really the only differentiation is the on-board product; they should really make an effort. The crew spend most of the flight chatting in the galley. They eventually came out after 50 minutes and gave us a drink. Food was good, but they did not provide ice-cream or desert as per the menu. The CSM dropped a salad at my feet, tomato all up my legs and on my shoes without an apology. The remnants just sat there for the rest of the flight being trodden into the carpet. No tea and coffee was offered. No hot towels. In fact, there was very little engagement with any passengers. This was not a bust flight, there was no turbulence or other reason for a lack of service. It really did just appear that the crew could not be bothered. Such a shame when a busine

In [21]:
# The remaining fields of interest live in a separate ratings table inside the review.

# locate that ratings table within the review
table = review.find('table', attrs={'class':'review-ratings'})

# One single-entry dict per table row, {identifier cell text: value cell text}.
# This list will later be appended to a larger list/dataframe covering all reviews.
record = [
    {cells[0].get_text(): cells[1].get_text()}
    for cells in (row.findAll("td") for row in table.findAll("tr"))
]

In [22]:
record

[{'Aircraft': 'E190'},
 {'Type Of Traveller': 'Couple Leisure'},
 {'Cabin Flown': 'Business Class'},
 {'Route': 'London City to Mykonos'},
 {'Date Flown': 'June 2018'},
 {'Seat Comfort': '12345'},
 {'Cabin Staff Service': '12345'},
 {'Food & Beverages': '12345'},
 {'Ground Service': '12345'},
 {'Value For Money': '12345'},
 {'Recommended': 'no'}]

This is close to being a complete record, but the fields from Seat Comfort through Value For Money are coming in oddly. This is because these data items are structured differently.

In [23]:
'''
Will need to look at where class = review_star.... and count where it's class is filled. 
'''

# within a review, find the ratings table
table = review.find('table', attrs={'class':'review-ratings'})

# take the first star-rating cell in the table (find() returns only the first match)
item = table.find("td", {"class": "review-rating-stars stars"})

# display the cell's markup to see how the stars are encoded
item

<td class="review-rating-stars stars"><span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star fill">4</span><span class="star">5</span></td>

In [24]:
item.find_all("span", {'class': "star fill"})

[<span class="star fill">1</span>,
 <span class="star fill">2</span>,
 <span class="star fill">3</span>,
 <span class="star fill">4</span>]

In [25]:
len(item.find_all("span", {'class': "star fill"}))

4

So this gets us our correct rating

In [26]:
'''
update this code and expand to make into a useful function
'''

# The remaining fields of interest live in a separate ratings table inside the review.

# locate that ratings table within the review
table = review.find('table', attrs={'class':'review-ratings'})

# collected as a list of single-entry dicts; later appended to a larger
# list/dataframe which will hold values from all reviews
record = []

for row in table.findAll('tr'):
    cells = row.findAll('td')
    label = cells[0].get_text()

    if row.find('td', {'class': 'review-rating-stars stars'}) is not None:
        # star rows: the rating is the number of filled star spans
        value = len(cells[1].find_all('span', {'class': 'star fill'}))
    else:
        # plain rows: the value is just the cell text
        value = cells[1].get_text()

    record.append({label: value})
        


In [27]:
record

[{'Aircraft': 'E190'},
 {'Type Of Traveller': 'Couple Leisure'},
 {'Cabin Flown': 'Business Class'},
 {'Route': 'London City to Mykonos'},
 {'Date Flown': 'June 2018'},
 {'Seat Comfort': 4},
 {'Cabin Staff Service': 1},
 {'Food & Beverages': 3},
 {'Ground Service': 4},
 {'Value For Money': 1},
 {'Recommended': 'no'}]

Now I have a list of all the airlines, I know the page structure, and can read in a review. Next I will build the function that will take in the list of airlines and will loop through all pages and all reviews for all airlines. 

In [30]:
'''
Get Data: Takes in a list of airlines and scrapes all reviews for that list of airlines
Possible new features - include getting complete list of airlines - get only the most recent x results
'''


# add import calls in this cell
import re #Regular expressions
from bs4 import BeautifulSoup # a python HTML parser
import requests
import string
import pandas as pd

# airlines should be a list formatted as the get_airlines function returns meaning [0] is href and [1] is plain text name
def get_data(airlines, max_pages=None):
    """Scrape every review for the given airlines into a DataFrame.

    Args:
        airlines: Pair of parallel sequences shaped like the return value of
            ``get_airlines``: ``airlines[0]`` holds the site-relative hrefs and
            ``airlines[1]`` the plain-text airline names.
        max_pages: Optional cap on the number of review pages fetched per
            airline. ``None`` (the default) keeps paging until a page
            contains no reviews.

    Returns:
        pandas.DataFrame: One row per review with the columns ``airline``,
        ``link``, ``content`` plus one column per row label found in the
        review's ratings table. Empty when no reviews were found.
    """
    # URL structure of a reviews page
    # http://www.airlinequality.com/airline-reviews/ba-cityflyer/page/1/
    url_start = "http://www.airlinequality.com"
    url_end = "/page/"

    # Now we have a list of the urls (without the page numbers)
    urls = [url_start + link + url_end for link in airlines[0]]
    names = airlines[1]

    # data - list of dictionaries where each dictionary will be a record
    out_data = []

    # for each airline (zip visits every entry, including the last one)
    for name, url in zip(names, urls):

        print(url)

        # We don't know how many review pages an airline has, and the site
        # answers 200 even for out-of-range pages, so keep paging until a page
        # yields no reviews (or the optional page cap is reached).
        page_num = 1
        while max_pages is None or page_num <= max_pages:

            url_page = url + str(page_num) + "/"

            r = requests.get(url_page)
            soup = BeautifulSoup(r.content, 'html.parser')

            # find all reviews on the page
            reviews = soup.find_all(itemprop="review")

            # find_all never returns None; an empty result means we have
            # exceeded the page limit for this airline
            if not reviews:
                break

            out_data.extend(_parse_review(review, name, url) for review in reviews)
            page_num += 1

    return pd.DataFrame(out_data)


def _parse_review(review, airline, link):
    """Flatten one review element into a dict record (airline, link, content, ratings)."""
    record = {"airline": airline, "link": link}

    # Review text: drop the leading "<status> |" marker when there is one.
    # Without a '|' the text is kept whole instead of being sliced blindly.
    body = review.find(itemprop="reviewBody")
    if body is None:
        record["content"] = None
    else:
        text = body.get_text()
        _, bar, remainder = text.partition('|')
        record["content"] = remainder.strip() if bar else text.strip()

    # within a review, find the ratings table (it may be missing)
    table = review.find('table', attrs={'class': 'review-ratings'})
    rows = table.find_all('tr') if table is not None else []

    for row in rows:
        cells = row.find_all('td')
        if len(cells) < 2:
            # not a label/value row
            continue

        label = cells[0].get_text()
        if row.find('td', {'class': 'review-rating-stars stars'}) is None:
            # plain rows carry their value as cell text
            record[label] = cells[1].get_text()
        else:
            # star rows: the rating is the number of filled star spans
            record[label] = len(cells[1].find_all('span', {'class': 'star fill'}))

    return record
    
    

In [None]:
# fetch the (hrefs, names) pair of lists for the airlines on the A-Z index page
airlines = get_airlines()

# NOTE: the bare string below is not executed as code; it holds a disabled
# single-airline input that has the same [hrefs, names] layout.
'''
href = ["/airline-reviews/ba-cityflyer"]
city = ["BA CityFlyer"]

airlines = []
airlines.append(href)
airlines.append(city)
airlines
'''

# scrape the reviews for the airlines above; get_data returns a DataFrame
results = get_data(airlines)

http://www.airlinequality.com/airline-reviews/adria-airways/page/


In [None]:
# start at 8:40
# NOTE(review): presumably the wall-clock time the scrape was started - confirm

# display the current UTC time (gmtime) formatted as "YYYY-MM-DD HH:MM:SS"
from time import gmtime, strftime
strftime("%Y-%m-%d %H:%M:%S", gmtime())

In [None]:
# persist the scraped reviews to disk first
results.to_csv('self_gen_airlines.csv')

# End the cell with the preview: only the last expression of a cell is
# displayed, so head() placed before to_csv() was silently discarded.
results.head()