<a href="https://colab.research.google.com/github/steve122192/TA_review_scraper/blob/main/DDIL_SGR_Review_Scrapers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
from datetime import datetime
from datetime import timedelta

#Trip Advisor Review Scraper
###(for hotels only)


In [93]:
# Helper Functions, run first

def get_hotel_links(ta_link, n_hotels):
  url = ta_link
  page = requests.get(url)
  page_soup =  BeautifulSoup(page.text, 'html.parser')
  links = page_soup.find_all(class_="property_title prominent ")
  hotel_links = []
  for link in links[:(n_hotels)]:
    hotel_links.append("https://www.tripadvisor.com" + link['href'])
  return hotel_links


def get_review_links(hotel_link, n_pages):
  url = hotel_link
  url_list = [hotel_link]
  for i in range(5,(5*(n_pages-1)+1),5):
    splits = url.split('-')
    splits.insert(splits.index('Reviews'), f'or{str(i)}')
    new_url = '-'.join(splits)
    url_list.append(new_url)
  return (url_list)

def get_ratings(link):
  base_url = link
  page = requests.get(base_url)
  page_soup =  BeautifulSoup(page.text, 'html.parser')
  ratings = page_soup.find_all(class_ = 'nf9vGX55')
  review_ratings = []
  for rating in ratings:
    bubbles = rating.find('span')['class'][1]
    if bubbles == 'bubble_10':
      review_rating = 1
    if bubbles == 'bubble_20':
      review_rating = 2
    if bubbles == 'bubble_30':
      review_rating = 3
    if bubbles == 'bubble_40':
      review_rating = 4
    if bubbles == 'bubble_50':
      review_rating = 5
    review_ratings.append(review_rating)
  return review_ratings


def get_dates(link):
  base_url = link
  page = requests.get(base_url)
  page_soup =  BeautifulSoup(page.text, 'html.parser')
  dates = page_soup.find_all(class_ = '_2fxQ4TOx')
  review_dates = []
  for date in dates:
    if date.text[-5:] == 'Today':
      today = datetime.date(datetime.now())
      review_date = (today.strftime('%b %Y'))
    elif date.text[-9:] == 'Yesterday':
      today = datetime.date(datetime.now())
      yesterday = today - timedelta(days = 1)
      review_date = (yesterday.strftime('%b %Y'))
    elif 'w' in date.text[-8:]:
      today = datetime.date(datetime.now())
      review_date = (today.strftime('%b %Y'))
    else:
      review_date = date.text[-8:]
    review_dates.append(review_date)
  return review_dates

def get_texts(link):
  base_url = link
  page = requests.get(base_url)
  page_soup =  BeautifulSoup(page.text, 'html.parser')
  reviews = page_soup.find_all(class_ = 'oETBfkHU')
  review_texts = []
  for review in reviews:
    text = review.find(class_ = "IRsGHoPm")
    review_texts.append(text.text)
  return review_texts

def get_data(link):
  ratings = get_ratings(link)
  texts = get_texts(link)
  dates = get_dates(link)
  review_dict = {'rating': [], 'text': [], 'date': []}
  for tup in zip(ratings,texts,dates):
    review_dict['rating'].append(tup[0])
    review_dict['text'].append(tup[1])
    review_dict['date'].append(tup[2])

  df = pd.DataFrame()
  df = df.from_dict(review_dict)
  return df

In [94]:
# Scrape Data
def get_TA_data(city,link,n_hotels=10,n_pages=3):
  '''
  Pulls review data from Trip Advisor to return a dataframe with columns
  'rating', 'text', 'date' & 'area'

  Parameters:
  city(str): label for 'area' column
  link(str): trip advisor city search results link 
             (i.e https://www.tripadvisor.com/Hotels-g34059-Wilmington_Delaware-Hotels.html)
  n_hotels(int): number of hotels to get data for (max 30)
  n_pages: number of review pages. (1 page = 5 reviews)
  '''

  hotel_links = get_hotel_links(link, n_hotels)
  print(f'Getting Trip Advisor reviews for {len(hotel_links)} hotels')
  city_df = pd.DataFrame({'rating': [], 'text': [], 'date': []})
  for hotel_link in hotel_links:
    print(f'Getting reviews for {hotel_link}')
    review_links = get_review_links(hotel_link, n_pages)
    for review_link in review_links:
      data = get_data(review_link)
      city_df = city_df.append(data)
  city_df['area'] = city
  city_df = city_df[~city_df['text'].str.contains("…")]
  city_df = city_df.reset_index(drop=True)
  return city_df

In [109]:
# test
city = 'Wilmington, DE'
link = 'https://www.tripadvisor.com/Hotels-g34059-Wilmington_Delaware-Hotels.html'

TA = get_TA_data(city,link,30,5)
TA

Getting Trip Advisor reviews for 30 hotels
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d114447-Reviews-HOTEL_DU_PONT-Wilmington_Delaware.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d17631035-Reviews-Hyatt_Place_Wilmington_Riverfront-Wilmington_Delaware.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d83871-Reviews-DoubleTree_by_Hilton_Hotel_Wilmington-Wilmington_Delaware.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d83863-Reviews-Courtyard_by_Marriott_Wilmington_Downtown-Wilmington_Delaware.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d235512-Reviews-Holiday_Inn_Express_Wilmington_North_Brandywine-Wilmington_Delaware.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d17675169-Reviews-Homewood_Suites_by_Hilton_Wilmington_Downtown-Wilmington_Delaware.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g34059-d6000669-Re

Unnamed: 0,rating,text,date,area
0,5.0,Came to town for a wedding and found out that ...,Nov 2020,"Wilmington, DE"
1,5.0,The General Manager and staff offered exceptio...,Nov 2020,"Wilmington, DE"
2,1.0,First ... Valet is not Valet. I unloaded my ca...,Nov 2020,"Wilmington, DE"
3,1.0,My wife and I had booked a weekend stay at the...,Nov 2020,"Wilmington, DE"
4,5.0,"I enjoyed staying at this hotel , very fancy a...",Nov 2020,"Wilmington, DE"
...,...,...,...,...
587,5.0,Spectacular. As soon as you walk threw those d...,Aug 2020,"Wilmington, DE"
588,5.0,Highly recommend— had a very positive experien...,Aug 2020,"Wilmington, DE"
589,4.0,Stayed 2 night in the king suite with hot tub ...,Aug 2020,"Wilmington, DE"
590,5.0,"Beautiful views, great breakfast, clean and sa...",Aug 2020,"Wilmington, DE"


# Yelp Review Scraper
###(for hotels or restaurants)

In [96]:
# Helper Functions
def get_resto_urls(city, category='restaurants', search_radius=5, n_pages=2):
  api_key='NnQahaGb-ALws_WL-HmQcihO-Ub1nS6ecxZaIKnrHPRw8XNpUzQqfIc5ujZtCHSEDU9EjYLe4RA6tVxb8nIFuHdBvhGnkAoagVai4u6TccMc4YQZUplIvJNd22WpX3Yx'
  headers = {'Authorization': 'Bearer %s' % api_key}
  url='https://api.yelp.com/v3/businesses/search'
  resto_urls = []
  for i in range(1,((n_pages*50)+1),50):
    params = {'term': category,'location': city, 'limit': 50, 'offset': i,
              'radius': int(search_radius*1609.32) }
    req=requests.get(url, params=params, headers=headers)
    restos = json.loads(req.text)
    try:
      for resto in restos['businesses']:
        if 'price' in resto and (resto['price'] in ['$$','$$$','$$$$'] ):
          resto_urls.append(resto['url'])
    except:
      break
  return resto_urls


def get_pages(link):
  url = link
  pages = [url]
  for i in range(20,101,20):
    page = url + f'&start={str(i)}'
    pages.append(page)
  return pages  


def get_reviews(page):
  page = requests.get(page)
  page_soup = BeautifulSoup(page.text, 'html.parser')
  reviews = page_soup.find_all('div', attrs = {'class' : "lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT arrange-unit-grid-column--8__373c0__2dUx_ border-color--default__373c0__3-ifU"})
  return reviews[1:]


def get_review_data(review):
  text = review.find('p').get_text()
  rating = int(review.find('span', attrs={'class':'lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU'}).find('div')['aria-label'][0])
  date = review.find('div', attrs={'class':'lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT arrange-unit-fill__373c0__3Sfw1 border-color--default__373c0__3-ifU'}).get_text()
  date = datetime.strptime(date, '%m/%d/%Y')
  date = date.strftime('%b %Y')
  return {'text': text, 'rating': rating, 'date': date}

In [None]:
# Scrape Data
def get_yelp_review_data(city, category='restaurants', search_radius=5, n_pages=2, n_businesses = 20):
  '''
  Pulls review data from Yelp to return a dataframe with columns
  'rating', 'text', 'date' & 'area'

  Parameters:
  city(str): city to search within (ex. 'Wilmington, DE')
  category(str): 'restaurants' or 'hotels'
  search_radius(float): search radius from city center, in miles
  n_pages: number of review pages. (1 page = 20 reviews) 
  n_businesses(int): number of businesses to get data for
  '''
  review_dict = {'rating': [], 'text': [], 'date': []}
  resto_urls = get_resto_urls(city, category, search_radius, n_pages)
  resto_urls = resto_urls[:n_businesses]
  print(f'Getting Yelp reviews for {len(resto_urls)} {category} in {city}...')
  for resto in resto_urls:
    print(f'Getting review data for {resto}')
    pages = get_pages(resto)
    for page in pages:
      reviews = get_reviews(page)
      for review in reviews:
        try:
          data = get_review_data(review)
          review_dict['rating'].append(data['rating'])
          review_dict['text'].append(data['text'])
          review_dict['date'].append(data['date'])
        except:
          break
  df = pd.DataFrame.from_dict(review_dict)
  df['area'] = city
  return df

In [108]:
# Test
Yelp = get_yelp_review_data('Wilmington, DE', category='hotels', search_radius=5, n_pages=2, n_businesses=15)
Yelp

Getting Yelp reviews for 15 hotels in Wilmington, DE...
Getting review data for https://www.yelp.com/biz/doubletree-by-hilton-hotel-wilmington-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/homewood-suites-by-hilton-wilmington-brandywine-valley-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/the-inn-at-montchanin-village-and-spa-montchanin-2?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/doubletree-by-hilton-hotel-downtown-wilmington-legal-district-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUl

Unnamed: 0,rating,text,date,area
0,4,"Friendly customer service, nice hotel! This ho...",Apr 2019,"Wilmington, DE"
1,1,This hotel should not have the double tree nam...,Oct 2020,"Wilmington, DE"
2,4,Staff is friendly at this Hilton DoubleTree ho...,Aug 2018,"Wilmington, DE"
3,1,Bugs in bed in room 225. Manager said they w...,May 2019,"Wilmington, DE"
4,4,We had an excellent time at a corporate lunche...,Jun 2018,"Wilmington, DE"
...,...,...,...,...
498,3,"Well, I'm lying in a king size bed, firm mattr...",Nov 2012,"Wilmington, DE"
499,3,"standard place to stay, NOT a flea bag and muc...",Aug 2015,"Wilmington, DE"
500,4,"This is our first time staying in this hotel,...",Oct 2013,"Wilmington, DE"
501,1,We stayed here as a layover on our way home. ...,Jul 2016,"Wilmington, DE"


#Combine & Export Dataframes




In [110]:
# Combine hotel data
def combine_dfs(*kwargs):
  df = pd.concat(kwargs)
  df = df.reset_index(drop=True)
  return df

df = combine_dfs(Yelp, TA)
df

Unnamed: 0,rating,text,date,area
0,4.0,"Friendly customer service, nice hotel! This ho...",Apr 2019,"Wilmington, DE"
1,1.0,This hotel should not have the double tree nam...,Oct 2020,"Wilmington, DE"
2,4.0,Staff is friendly at this Hilton DoubleTree ho...,Aug 2018,"Wilmington, DE"
3,1.0,Bugs in bed in room 225. Manager said they w...,May 2019,"Wilmington, DE"
4,4.0,We had an excellent time at a corporate lunche...,Jun 2018,"Wilmington, DE"
...,...,...,...,...
1090,5.0,Spectacular. As soon as you walk threw those d...,Aug 2020,"Wilmington, DE"
1091,5.0,Highly recommend— had a very positive experien...,Aug 2020,"Wilmington, DE"
1092,4.0,Stayed 2 night in the king suite with hot tub ...,Aug 2020,"Wilmington, DE"
1093,5.0,"Beautiful views, great breakfast, clean and sa...",Aug 2020,"Wilmington, DE"


In [111]:
# Export hotel Data
from google.colab import files
df.to_csv('hotels_wilmington_area.csv')
files.download('hotels_wilmington_area.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [112]:
restos = get_yelp_review_data('Wilmington, DE', category='restaurants', search_radius=5, n_pages=3, n_businesses=50)
restos

Getting Yelp reviews for 50 restaurants in Wilmington, DE...
Getting review data for https://www.yelp.com/biz/big-fish-grill-riverfront-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/scalessas-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/stitch-house-brewery-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/bardea-food-and-drink-wilmington?adjust_creative=vSUlVCWACaWP6lBWStmSVg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=vSUlVCWACaWP6lBWStmSVg
Getting review data for https://www.yelp.com/biz/iron-hill-brewery-and-restaurant-wilmington

Unnamed: 0,rating,text,date,area
0,5,Great meal and an even greater job by Big Fish...,Nov 2020,"Wilmington, DE"
1,4,It was nice to see somebody operating under al...,Jul 2020,"Wilmington, DE"
2,4,Husband and I hadn't been to a restaurant sinc...,Aug 2020,"Wilmington, DE"
3,4,Overall a decent placeDrinks are good. Apple c...,Oct 2020,"Wilmington, DE"
4,1,We waited about an hour for an outside table (...,Oct 2020,"Wilmington, DE"
...,...,...,...,...
2509,4,Went here with my two kids and wife for lunch....,Aug 2014,"Wilmington, DE"
2510,3,I think the food is great and better than Oliv...,Jun 2014,"Wilmington, DE"
2511,4,Last night was my first time here and I must s...,Jul 2014,"Wilmington, DE"
2512,2,I ordered gnocchi which is advertised at 12.95...,Apr 2016,"Wilmington, DE"


In [113]:
# Export hotel Data
from google.colab import files
restos.to_csv('restos_wilmington_area.csv')
files.download('restos_wilmington_area.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>