<a href="https://colab.research.google.com/github/steve122192/TA_review_scraper/blob/main/DDIL_Review_Scrapers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Helper Functions, run first

def get_hotel_links(ta_link, n_hotels):
  '''
  Collects absolute hotel-page URLs from a Trip Advisor search results page.

  Parameters:
  ta_link(str): URL of the search results page to download
  n_hotels(int): upper bound on the number of links returned

  Returns:
  list of str, one per matched listing title (at most n_hotels), each being
  'https://www.tripadvisor.com' followed by the element's href
  '''
  response = requests.get(ta_link)
  soup = BeautifulSoup(response.text, 'html.parser')
  # Listing titles are located purely by this CSS class string.
  title_anchors = soup.find_all(class_="property_title prominent ")
  return ["https://www.tripadvisor.com" + anchor['href']
          for anchor in title_anchors[:n_hotels]]


def get_review_links(hotel_link, n_pages):
  '''
  Builds the list of review-page URLs for one hotel.

  The first entry is hotel_link itself. Every further page is hotel_link with
  an 'or<offset>' segment spliced in directly before the 'Reviews' segment of
  the dash-separated URL, with offsets 5, 10, ... up to 5 * (n_pages - 1).

  Parameters:
  hotel_link(str): URL of the hotel's first review page
  n_pages(int): total number of pages wanted (values below 2 give one URL)

  Returns:
  list of str with the page URLs in order

  Raises:
  ValueError if an extra page is requested and the URL has no 'Reviews' segment
  '''
  def page_url(offset):
    # Split lazily, per page, so a URL without 'Reviews' only fails when an
    # offset page is actually requested.
    parts = hotel_link.split('-')
    anchor = parts.index('Reviews')
    return '-'.join(parts[:anchor] + [f'or{offset}'] + parts[anchor:])

  offsets = range(5, 5 * (n_pages - 1) + 1, 5)
  return [hotel_link] + [page_url(offset) for offset in offsets]

def get_ratings(link):
  '''
  Downloads a review page and returns the bubble rating of each review.

  Parameters:
  link(str): URL of the review page

  Returns:
  list with one entry per element of class 'nf9vGX55', in page order:
  an int from 1 to 5, or None when the bubble class is not one of
  'bubble_10' ... 'bubble_50'
  '''
  bubble_to_rating = {
      'bubble_10': 1,
      'bubble_20': 2,
      'bubble_30': 3,
      'bubble_40': 4,
      'bubble_50': 5,
  }
  page = requests.get(link)
  page_soup = BeautifulSoup(page.text, 'html.parser')
  review_ratings = []
  for rating in page_soup.find_all(class_='nf9vGX55'):
    # The rating is encoded in the second CSS class of the first <span>.
    bubbles = rating.find('span')['class'][1]
    # .get() yields None for an unknown class, so a review never inherits the
    # previous review's rating and the list stays aligned with texts/dates.
    review_ratings.append(bubble_to_rating.get(bubbles))
  return review_ratings


def get_dates(link):
  '''
  Downloads a review page and returns a 'Mon YYYY' style label per review.

  Parameters:
  link(str): URL of the review page

  Returns:
  list of str, one per element of class '_2fxQ4TOx', in page order. Labels
  ending in 'Today' or 'Yesterday' are converted to the month/year of the
  corresponding calendar day; otherwise the last 8 characters are used,
  unless they contain 'ew', in which case the current month/year is used.
  '''
  response = requests.get(link)
  soup = BeautifulSoup(response.text, 'html.parser')
  month_year = '%b %Y'
  review_dates = []
  for node in soup.find_all(class_='_2fxQ4TOx'):
    label = node.text
    tail = label[-8:]
    if label.endswith('Today'):
      stamp = datetime.now().date().strftime(month_year)
    elif label.endswith('Yesterday'):
      stamp = (datetime.now().date() - timedelta(days=1)).strftime(month_year)
    elif 'ew' in tail:
      # NOTE(review): presumably a short label such as 'Nov 3' lets part of
      # the word 'review' spill into the 8-character tail - confirm.
      stamp = datetime.now().date().strftime(month_year)
    else:
      stamp = tail
    review_dates.append(stamp)
  return review_dates

def get_texts(link):
  '''
  Downloads a review page and returns the body text of each review.

  Parameters:
  link(str): URL of the review page

  Returns:
  list of str, one per element of class 'oETBfkHU', taken from the first
  descendant of class 'IRsGHoPm' inside it
  '''
  response = requests.get(link)
  soup = BeautifulSoup(response.text, 'html.parser')
  containers = soup.find_all(class_='oETBfkHU')
  return [box.find(class_="IRsGHoPm").text for box in containers]

def get_data(link):
  '''
  Assembles the ratings, texts and dates of one review page into a dataframe.

  Parameters:
  link(str): URL of the review page

  Returns:
  DataFrame with columns 'rating', 'text', 'date'; one row per review. The
  three lists are zipped, so the row count is that of the shortest list.
  '''
  # Helpers are called in the same order as the columns are declared.
  rows = list(zip(get_ratings(link), get_texts(link), get_dates(link)))
  columns = ('rating', 'text', 'date')
  review_dict = {
      name: [row[position] for row in rows]
      for position, name in enumerate(columns)
  }
  return pd.DataFrame.from_dict(review_dict)

In [None]:
# Trip Advisor Reviews
def get_TA_data(city,link,n_hotels=5,n_pages=2):
  '''
  Pulls review data from Trip Advisor to return a dataframe with columns
  'rating', 'text', 'date' & 'area'

  Parameters:
  city(str): label for 'area' column
  link(str): trip advisor city search results link
             (e.g. https://www.tripadvisor.com/Hotels-g34059-Wilmington_Delaware-Hotels.html)
  n_hotels(int): number of hotels to get data for (max 30)
  n_pages(int): number of review pages. (1 page = 5 reviews)

  Returns:
  DataFrame with a fresh 0..n-1 index. Reviews whose text contains the
  ellipsis character are dropped. If nothing was scraped, an empty frame
  with the same four columns is returned.
  '''

  hotel_links = get_hotel_links(link, n_hotels)
  print(f'Getting reviews for {len(hotel_links)} hotels')
  # Collect per-page frames and concatenate once: DataFrame.append was removed
  # in pandas 2.0, and growing a frame inside a loop is quadratic anyway.
  page_frames = []
  for hotel_link in hotel_links:
    print(f'Getting reviews for {hotel_link}')
    for review_link in get_review_links(hotel_link, n_pages):
      page_frames.append(get_data(review_link))
  if page_frames:
    city_df = pd.concat(page_frames)
  else:
    city_df = pd.DataFrame(columns=['rating', 'text', 'date'])
  city_df['area'] = city
  # Literal (non-regex) match; the casts keep the mask usable when the frame
  # is empty or a text entry is missing.
  has_ellipsis = city_df['text'].astype(str).str.contains("…", regex=False).astype(bool)
  city_df = city_df[~has_ellipsis]
  return city_df.reset_index(drop=True)

# Yelp Reviews
def get_yelp_data(city):
  '''
  Pulls review data from Yelp to return a dataframe with columns
  'rating', 'text', 'date' & 'area'

  The Yelp API key is read from the YELP_API_KEY environment variable so
  that no credential is stored in the notebook.

  Parameters:
  city(str): City name and state abbreviation separated by a comma. e.g. Philadelphia, PA

  Returns:
  DataFrame with one row per review; 'date' is formatted as 'Mon YYYY' and
  'area' is set to city.

  Raises:
  RuntimeError if YELP_API_KEY is not set
  requests.HTTPError if Yelp answers a request with an error status
  '''
  api_key = os.environ.get('YELP_API_KEY')
  if not api_key:
    raise RuntimeError('Set the YELP_API_KEY environment variable to your Yelp API key')
  headers = {'Authorization': 'Bearer %s' % api_key}

  search = requests.get('https://api.yelp.com/v3/businesses/search',
                        params={'term': 'hotel', 'location': city},
                        headers=headers)
  # Fail with the HTTP error itself instead of a KeyError on the error payload.
  search.raise_for_status()
  hotel_ids = [hotel['id'] for hotel in search.json()['businesses']]
  print(f'Getting Yelp reviews for {len(hotel_ids)} hotels')

  hotel_dict = {'rating': [], 'text': [], 'date': []}
  for hotel_id in hotel_ids:
    reply = requests.get(f'https://api.yelp.com/v3/businesses/{hotel_id}/reviews',
                         headers=headers)
    reply.raise_for_status()
    for review in reply.json()['reviews']:
      hotel_dict['text'].append(review['text'])
      hotel_dict['rating'].append(review['rating'])
      hotel_dict['date'].append(review['time_created'])

  df = pd.DataFrame.from_dict(hotel_dict)
  df['date'] = pd.to_datetime(df['date']).dt.strftime('%b %Y')
  df['area'] = city
  return df

  # Combined

def combine_dfs(*kwargs):
  '''
  Stacks any number of dataframes into one with a fresh 0..n-1 index.

  Parameters:
  *kwargs: the dataframes to stack, passed positionally (despite the name,
           these are positional arguments, not keyword arguments)

  Returns:
  DataFrame with the rows of every input, in argument order
  '''
  return pd.concat(kwargs).reset_index(drop=True)

In [None]:
# `city` and `link` are assigned here so this cell runs on a fresh kernel
# (Restart & Run All); the only other assignment is in a later cell.
city = 'Philadelphia, PA'
link = 'https://www.tripadvisor.com/Hotels-g60795-Philadelphia_Pennsylvania-Hotels.html'

# Quick check: first review page of the first two hotels.
get_TA_data(city, link, 2, 1)

Getting reviews for 2 hotels
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g60795-d122343-Reviews-Sofitel_Philadelphia_at_Rittenhouse_Square-Philadelphia_Pennsylvania.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g60795-d3226761-Reviews-Kimpton_Hotel_Monaco_Philadelphia-Philadelphia_Pennsylvania.html


Unnamed: 0,rating,text,date,area
0,4.0,Overall the staff was friendly and professiona...,Oct 2020,philadelphia
1,5.0,The staff is amazing and so accommodating- the...,Oct 2020,philadelphia
2,4.0,I stayed at the Sofitel Oct 2020. The suite we...,Oct 2020,philadelphia
3,5.0,"For Philly first-timers, Kimpton Hotel Monaco ...",Nov 2020,philadelphia
4,5.0,My son had the presidential suite for an engag...,Nov 2020,philadelphia
5,5.0,"Stayed here for a few days in early November, ...",Nov 2020,philadelphia
6,5.0,I've stayed here several times and have every ...,Nov 2020,philadelphia
7,5.0,"Very clean, great service and outstanding staf...",Nov 2020,philadelphia


In [None]:
# Smoke test: scrape one review page for each of two Philadelphia hotels from
# Trip Advisor, pull the Yelp reviews for the same city, and stack the frames.
city = 'Philadelphia, PA'
link = 'https://www.tripadvisor.com/Hotels-g60795-Philadelphia_Pennsylvania-Hotels.html'

combine_dfs(
    get_TA_data(city, link, n_hotels=2, n_pages=1),
    get_yelp_data(city),
)

Getting reviews for 2 hotels
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g60795-d122343-Reviews-Sofitel_Philadelphia_at_Rittenhouse_Square-Philadelphia_Pennsylvania.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g60795-d3226761-Reviews-Kimpton_Hotel_Monaco_Philadelphia-Philadelphia_Pennsylvania.html
Getting Yelp reviews for 20 hotels


Unnamed: 0,rating,text,date,area
0,4.0,Overall the staff was friendly and professiona...,Oct 2020,"Philadelphia, PA"
1,5.0,The staff is amazing and so accommodating- the...,Oct 2020,"Philadelphia, PA"
2,4.0,I stayed at the Sofitel Oct 2020. The suite we...,Oct 2020,"Philadelphia, PA"
3,5.0,"For Philly first-timers, Kimpton Hotel Monaco ...",Nov 2020,"Philadelphia, PA"
4,5.0,My son had the presidential suite for an engag...,Nov 2020,"Philadelphia, PA"
...,...,...,...,...
63,2.0,Me and my boyfriend stayed here this weekend. ...,Oct 2020,"Philadelphia, PA"
64,5.0,Staff is very friendly here and our room was c...,Feb 2020,"Philadelphia, PA"
65,4.0,I'm really impressed\nwith the Marriott Downto...,Dec 2019,"Philadelphia, PA"
66,1.0,I did not enjoy my stay. The room smelled (wei...,Jul 2019,"Philadelphia, PA"


In [None]:
# Yelp-only pull; relies on `city` already being assigned by an earlier cell.
get_yelp_data(city)

Unnamed: 0,rating,text,date,area
0,5,Our stay at the Kimpton was a pleasant one des...,Oct 2020,philadelphia
1,5,The best! During COVID their cleanliness is su...,Jul 2020,philadelphia
2,5,I've stayed at the Kimpton Hotel Monaco twice ...,Aug 2020,philadelphia
3,5,My company puts us up here for 1-2 weeks at a ...,Oct 2019,philadelphia
4,4,"Pros:\n-Centrally located\n-Parks, shops, and ...",Jul 2020,philadelphia
5,4,My wife and I recently took a trip to Philly t...,Oct 2020,philadelphia
6,5,We were looking for somewhere special for my w...,Apr 2019,philadelphia
7,1,My gallery (I am an artist) reserved a room he...,Sep 2020,philadelphia
8,5,"I think for the price, location, and friendly ...",Mar 2019,philadelphia
9,5,Great brand new hotel located in one of Philly...,Oct 2020,philadelphia


In [None]:
# Export a small Trip Advisor sample to CSV and download it through Colab.
# Colab-only import, kept in this cell so the rest of the notebook also runs
# outside Colab.
from google.colab import files

# `get_review_data` is not defined in the visible notebook; `get_TA_data`
# takes the same (city, link, n_hotels, n_pages) arguments.
df = get_TA_data(city, link, 2, 1)
df.to_csv('philly_reviews.csv')
#df.to_json('philly_reviews.json')
files.download('philly_reviews.csv')
#files.download('philly_reviews.json')

Getting reviews for 2 hotels
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g60795-d3226761-Reviews-Kimpton_Hotel_Monaco_Philadelphia-Philadelphia_Pennsylvania.html
Getting reviews for https://www.tripadvisor.com/Hotel_Review-g60795-d122343-Reviews-Sofitel_Philadelphia_at_Rittenhouse_Square-Philadelphia_Pennsylvania.html


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Scratch cell: a bare string literal, which the notebook echoes back as output.
'test'

'test'