In [None]:
# Imports

import re #Regular expressions
from bs4 import BeautifulSoup # a python HTML parser
import requests
import string
import pandas as pd

In [1]:
'''
get_airlines(): Function to loop through the alphabet and return a list of all airline references. 
The function is pretty rigid to start. May revist to add some additional flexibility
'''

def get_airlines():
    
    # set url to airline reviews a-z review page
    url = "http://www.airlinequality.com/review-pages/a-z-airline-reviews/"
    
    # request the url
    r = requests.get(url)
    
    hrefs = []
    airlines = []
    
    # 200 status code indicates success
    if(r.status_code == 200):
        
        # Now use beautiful soup to pull in data (make soup)
        soup = BeautifulSoup(r.content, 'html.parser')
        
        # loop through all uppercase letters (A-Y, no Z airline reviews hence the try)
        for letter in list(string.ascii_uppercase[:26]):
            try:
                # add letter to id name determined above
                id_letter = "a2z-ldr-" + letter
                col_groups = soup.find(id=id_letter)

                # loop through all a elements to find the href data
                for a in col_groups.find_all('a', href=True):
                    hrefs.append(a['href'])
                    airlines.append(a.get_text())
                
    return hrefs, airlines

In [37]:
'''
get_data(airlines): function that takes in airlines (a list formatted like the get_airlines function return value, meaning [0] is href list 
and [1] is plain text name list) and returns the below data for all reviews on the given airlines. Data returned is a data frame with column
headers matching the data names below

'airline_name' - i.e. Delta Airlines
'link' - href showing airline-reviews/airline-name
'title' - title of the review
'author' - author name
'author_country' - author country
'date_published' - date review published
'date' - date of flight
'content' - review text
'aircraft' - type of aircraft flown
'type_traveller' - type of traveler (i.e. business, leisure, etc)
'cabin_flown' - cabin flown during trip
'route' - route flown
'overall_rating' - 1- 10 rating for the experience
'seat_comfort_rating' - 1- 5 rating
'cabin_staff_rating' - 1- 5 rating
'food_beverages_rating' - 1 - 5 rating
'inflight_entertainment_rating' - 1 - 5 rating
'ground_service_rating' - 1 - 5 rating
'wifi_connectivity_rating' - 1 - 5 rating
'value_money_rating' - 1 - 5 rating
'recommended' - 1 if yes, would recommend, 0 if no, would not recommend
'''

def get_data(airlines):
    
    # URL structure of a reviews page
    # http://www.airlinequality.com/airline-reviews/ba-cityflyer/page/1/
    url_start = "http://www.airlinequality.com"
    url_end = "/page/"
    
    # make a list of the urls (with out the page numbers)
    urls = [url_start + link + url_end for link in airlines[0]]
    
    # store list of names for easier referencing
    names = airlines[1]
    
    # data - list of dictionaries where each dictionary will be a record
    out_data = []
    
    # for each airline
    for i in list(range(len(urls))):
        
        # print current airline url to monitor status - will only show once per airline, not once per page
        print(urls[i])
        
        # set flag and counter for conditional loop because number of pages per airline is unknown
        valid_page = True
        page_num = 1
        # x = 1 # counter used to track total reviews per airline during debug or to show status
        
        # while a page is still showing reviews (reviews > 0 and reviews is not None)
        while valid_page:
            
            # form new url, if none type returned on request, end loop, else get content 
            url_page = str(urls[i] +str(page_num) + "/")
            
            # request the page and "make soup!"
            r = requests.get(url_page)
            soup = BeautifulSoup(r.content, 'html.parser')
            
            # find all reviews on the page
            reviews = soup.find_all(itemprop="review")

            # if no reviews are found then we have exceeded the page limit
            if len(reviews) > 0 and reviews is not None:
            
                # loop through reviews
                for review in reviews:
                    
                    # dictionary to store data - this will be appended to a larger list/(eventual)dataframe which will have values from all reviews
                    record = {"airline": names[i], 'link': urls[i]}

                    # find specifically the content section of the review
                    content = review.find(itemprop="reviewBody").get_text()
                    content = content[(content.find('| ')+3):] #formatting
                    
                    # overall_rating section of the review
                    overall = review.find(itemprop="ratingValue").get_text()

                    # title
                    title = review.find('h2', {'class': 'text_header'}).get_text()
                    
                    # author, author country, and date_published
                    author, author_country, date_published = review.find('h3', {'class': "text_sub_header userStatusWrapper"}).get_text().strip().replace('(', ')').split(sep=')') # ~PyThOnIc~
                    
                    # add key value pairs to records                    
                    record.update({'author': author, 'author_country': author_country, 'date_published': date_published, 'title': title, "overall_rating": overall, "content": content})
                    
                    # now look for data in review rating table
                    table = review.find('table', attrs={'class':'review-ratings'})
                    
                    # grab all records in the review rating table and loop through the rows
                    if table is not None: # theoretically this shouldn't happen but one page was throwing an error here due to a NoneType
                        for item in table.findAll('tr'):
                            # table structure on reviews has 2 td values for every tr, the first td is the field and the second td is the value
                            pair = item.findAll('td')
                            
                            # some items in this table are as 1-5 star ratings, if that's the case an extra step is needed
                            if item.find('td', {'class': 'review-rating-stars stars'}) is None:
                                record.update({pair[0].get_text():pair[1].get_text()})

                            else:
                                record.update({pair[0].get_text(): len(pair[1].find_all('span', {'class': 'star fill'}))})

                        # add this record to the larger data set
                        out_data.append(record)

                    # print('review {}'.format(x))
                    # x += 1
                # increment the page number because all reviews on the page have been read
                page_num += 1
            
            # len reviews 0 or reviews is NoneType, then move to next airline
            else:
                valid_page = False
    
    # convert dictionaries to data frame
    out_data_df = pd.DataFrame(out_data)
    
    # rename dataframe to align to original dataset names and to generally be more nicely formatted
    out_data_df = out_data_df.rename(columns={'Aircraft': 'aircraft', 'Cabin Flown': 'cabin_flown', 'Cabin Staff Service':'cabin_staff_rating', 'Date Flown':'date',
       'Food & Beverages': 'food_beverages_rating', 'Ground Service': 'ground_service_rating', 'Inflight Entertainment': 'inflight_entertainment_rating',
       'Recommended': 'recommended', 'Route': 'route', 'Seat Comfort': 'seat_comfort_rating', 'Type Of Traveller': 'type_traveller',
       'Value For Money': 'value_money_rating', 'Wifi & Connectivity': 'wifi_connectivity_rating', 'airline': 'airline_name', 'content': 'content', 'link': 'link'})
    
    # Convert recommended to numbers
    out_data_df['recommended'] = [1 if r == 'yes' else 0 for r in out_data_df.recommended]
    
    print('Done! - Here is your data')
        
    return out_data_df

In [40]:
from time import gmtime, strftime
print('start time: ' + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

airlines = get_airlines()
'''
# single airline sample
href = ["/airline-reviews/sata-air-azores"]
airline = ["SATA Air Azores"]

airlines = []
airlines.append(href)
airlines.append(airline)
print(airlines)
'''
results = get_data(airlines)

print('end time: ' + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-07-26 14:58:20
http://www.airlinequality.com/airline-reviews/adria-airways/page/
http://www.airlinequality.com/airline-reviews/aegean-airlines/page/
http://www.airlinequality.com/airline-reviews/aer-lingus/page/
http://www.airlinequality.com/airline-reviews/aero-vip/page/
http://www.airlinequality.com/airline-reviews/aerocaribbean/page/
http://www.airlinequality.com/airline-reviews/aeroflot-russian-airlines/page/
http://www.airlinequality.com/airline-reviews/aerolineas-argentinas/page/
http://www.airlinequality.com/airline-reviews/aeromar/page/
http://www.airlinequality.com/airline-reviews/aeromexico/page/
http://www.airlinequality.com/airline-reviews/aerosur/page/
http://www.airlinequality.com/airline-reviews/africa-world-airlines/page/
http://www.airlinequality.com/airline-reviews/afriqiyah-airways/page/
http://www.airlinequality.com/airline-reviews/aigle-azur/page/
http://www.airlinequality.com/airline-reviews/air-algerie/page/
http://www.airlinequality.com/airline-reviews/air-

In [89]:
results.columns

Index(['Aircraft', 'Cabin Flown', 'Cabin Staff Service', 'Date Flown',
       'Food & Beverages', 'Ground Service', 'Inflight Entertainment',
       'Recommended', 'Route', 'Seat Comfort', 'Type Of Traveller',
       'Value For Money', 'Wifi & Connectivity', 'airline', 'content', 'link'],
      dtype='object')

In [90]:
results.shape

(79782, 16)

In [47]:
# Set up file read/folder structure
import os
os.chdir("C:\\Users\\erroden\\Desktop\\ML_Guild\\mlg_02_us\\Capstone")
results.to_csv("data\\airline_2.csv")