<h1><center>Scraping the Austin-Bergstrom International Airport Yelp Page</center></h1>

In [8]:
import pandas as pd
import numpy as np
import time, re, requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import ssl

# Webscraper Class
Not the prettiest thing I've ever made... but it works. Sometimes that's as much as we can ask for. 

In [42]:
class yelpScraper:
    """Scrape review text and star ratings from the Austin-Bergstrom Yelp page.

    Pages are requested newest-first (``sort_by=date_desc``), ten review
    offsets at a time, and the scraped values are returned as pandas
    DataFrames tagged with a caller-supplied period label.
    """

    # Seconds to wait for Yelp to answer before giving up on a page.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = "https://www.yelp.com/biz/austin-bergstrom-international-airport-aus-austin?start="
        self.order_by = "&sort_by=date_desc"
        # Seconds to sleep after each page request.
        self.rate_limit = 5

    #####################
    # Tidying Functions #
    #####################

    def html_cleaner(self, html):
        """Strip HTML tags, character entities and non-breaking spaces.

        Parameters
        ----------
        html : str
            Raw markup fragment.

        Returns
        -------
        str
            ``html`` with every ``<...>`` tag, every ``&name;`` / ``&#NNN;`` /
            ``&#xHH;`` entity and every ``\\xa0`` character removed.
        """
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        without_markup = re.sub(cleaner, '', html)
        # Return the fully cleaned string; previously the result of this
        # replace was computed and then discarded.
        return without_markup.replace('\xa0', '')

    ######################
    # Getting Attributes #
    ######################

    def get_reviews(self, soup):
        """Return the cleaned text fragments of the review spans on one page.

        The matching spans are rendered to a single string and split on
        ``</span>``, so each fragment keeps whatever separator text preceded
        its span; ``dataframeify`` drops the first character of every entry
        afterwards.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed page.

        Returns
        -------
        list of str
        """
        spans = soup.find_all('span', class_='raw__373c0__3rcx7')
        fragments = str(spans).rsplit('</span>')
        return [self.html_cleaner(fragment) for fragment in fragments]

    def get_stars(self, soup):
        """Return the star ratings found on one page.

        Each rating is the first character that follows ``aria-label="`` in
        the rendered rating spans, so ratings are single-character strings.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed page.

        Returns
        -------
        list of str
        """
        stars = soup.find_all('span', class_='display--inline__373c0__2SfH_ border-color--default__373c0__30oMI')
        fragments = str(stars).rsplit('aria-label="')
        # fragments[0] is the text before the first aria-label; skip it.
        return [fragment[0] for fragment in fragments[1:]]

    def dataframeify(self, li, col_title, name):
        """Turn a list of scraped values into a labelled DataFrame.

        Parameters
        ----------
        li : list
            Scraped values (review fragments or star ratings).
        col_title : str
            Column name. When it equals "reviews" (case-insensitive) the
            review clean-up below is applied and the column is named
            ``Reviews``.
        name : object
            Period label; stored as ``str(name)`` in the ``Time`` column.

        Returns
        -------
        pandas.DataFrame
            One value column plus a ``Time`` column.
        """
        if col_title.lower() == "reviews":
            # The first four entries are skipped.
            # NOTE(review): presumably page boilerplate - confirm.
            df = pd.DataFrame(li[4:], columns=["Reviews"])
            is_address = (
                df["Reviews"].str.contains("3600 Presidential Blvd")
                | df["Reviews"].str.contains("Austin, TX 78719")
            )
            # Copy so the column assignments below operate on an independent
            # frame instead of a slice (avoids SettingWithCopyWarning).
            df = df[~is_address].copy()
            # Drop the leading separator character left over from splitting
            # the rendered span list in get_reviews.
            df["Reviews"] = df["Reviews"].str[1:]
        else:
            df = pd.DataFrame(li, columns=[col_title])
        df["Time"] = str(name)
        return df

    #################
    # Main Function #
    #################

    def get_all(self, start, end, name):
        """Scrape every page in ``range(start, end, 10)`` and label the result.

        Parameters
        ----------
        start, end : int
            Review offsets; one page is requested per step of 10.
        name : object
            Period label passed through to ``dataframeify``.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(stars_df, reviews_df)``.

        Raises
        ------
        requests.HTTPError
            If Yelp answers a page request with an error status.
        requests.Timeout
            If a page request exceeds ``REQUEST_TIMEOUT`` seconds.
        """
        reviews = []
        stars = []

        for offset in range(start, end, 10):

            # Get Soup
            response = requests.get(
                self.base_url + str(offset) + self.order_by,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Fail loudly rather than parsing an error page as if it
            # contained reviews.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get Reviews
            reviews += self.get_reviews(soup)

            # Get Stars
            stars += self.get_stars(soup)

            # Sleep to avoid Yelp getting mad at me
            time.sleep(self.rate_limit)

        stars_df = self.dataframeify(stars, "Stars", name)
        reviews_df = self.dataframeify(reviews, "Reviews", name)
        return (stars_df, reviews_df)
            

# Get the Data
To avoid having to extract the dates from the Yelp reviews, I checked which pages (sorted by date) contained comments posted during our time periods (before construction started, during construction, after it ended). The data for each is scraped and labeled separately. Unfortunately, because of this, the code will not be replicable later on because we're not selecting by date itself. If we had more time, I would have liked to extract the dates as well and directly transform them into before/during/after. That would have made for a sleeker webscraper, too. Alas. It's been a semester. However, as of the day I scraped these reviews, the dates were correct and although this code will not be super useful in its current form later on, the CSVs contain the data to reproduce our results. 

In [None]:
get_ab_reviews = yelpScraper()

# (start, end) review offsets handed to get_all for each period label,
# listed in the order the periods are scraped.
period_windows = {
    "After": (40, 250),
    "During": (250, 740),
    "Before": (740, 950),
}
scraped = {
    label: get_ab_reviews.get_all(first, last, label)
    for label, (first, last) in period_windows.items()
}

# get_all returns (stars_df, reviews_df) for each period.
stars_after, reviews_after = scraped["After"]
stars_during, reviews_during = scraped["During"]
stars_before, reviews_before = scraped["Before"]

# Saving the Data to CSVs

In [82]:
# Things I missed cleaning the first time:
#     There are a few phrases that look like they are repeated on every page that I didn't notice before,
#     namely, "Start your review of Austin-Bergstrom International Airport...." - So! I remove those lines
#     in the loop below. I also drop empty rows.

def fix_dataframe(df):
    """Drop boilerplate and empty rows from a reviews DataFrame.

    Rows whose ``Reviews`` text contains either boilerplate phrase below are
    removed, then rows whose ``Reviews`` value is the empty string. The input
    frame is not modified and the surviving rows keep their index labels.
    """
    boilerplate_phrases = (
        "Your trust is our top concern, so",
        "Start your review of Austin-Bergstrom",
    )
    kept = df
    for phrase in boilerplate_phrases:
        kept = kept[~kept["Reviews"].str.contains(phrase)]
    return kept[kept["Reviews"] != '']

# Strip the boilerplate rows from each period's reviews
reviews_before_fixed, reviews_during_fixed, reviews_after_fixed = (
    fix_dataframe(period_reviews)
    for period_reviews in (reviews_before, reviews_during, reviews_after)
)

# Stack the three periods (before, during, after) into one frame per measure
all_reviews = pd.concat(
    [reviews_before_fixed, reviews_during_fixed, reviews_after_fixed]
)
all_stars = pd.concat([stars_before, stars_during, stars_after])

# Write each combined frame to its own CSV
for combined, csv_name in (
    (all_reviews, "yelp_reviews.csv"),
    (all_stars, "yelp_stars.csv"),
):
    combined.to_csv(csv_name, encoding='utf-8')


# Average Star Rating
The data isn't very big so I'm just going to do this part in Python. 

In [84]:
# Ratings are scraped as single-character strings; convert them to floats
# so they can be averaged.
stars_as_float = all_stars["Stars"].astype(float)
all_stars["Stars"] = stars_as_float

# Mean star rating per period (Before / During / After)
all_stars.groupby("Time").agg("mean")

Unnamed: 0_level_0,Stars
Time,Unnamed: 1_level_1
After,3.672199
Before,3.946667
During,3.647727
