In [None]:
### 1. Run Scrapy to pull attraction data and URLs

### 2. Run Selenium on URLs from Scrapy Output

In [1]:
import os
import pandas as pd
import numpy as np
import random
from time import sleep
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json
import time

### 3. Read in Attraction Review JSONs

In [2]:
# Collect every per-page attraction review dump. os.listdir() returns names in
# arbitrary, filesystem-dependent order, so the list is sorted to keep the row
# order of attr_df reproducible between runs and machines.
path_to_json = 'ds/metis/finalproject/all_reviews/attractions/'
json_files = sorted(
    os.path.join(path_to_json, pos_json)
    for pos_json in os.listdir(path_to_json)
    if pos_json.endswith('.json')
)

# Each file is read into its own frame; the frames are placed side by side and
# transposed, so every column of every file ends up as one row of attr_df
# (presumably one row per scraped review page -- TODO confirm against the
# scraper output).
dataframes = [pd.read_json(file) for file in json_files]

attr_df = pd.concat(dataframes,axis=1).T

In [3]:
# Give the five positional columns readable names, then show the frame size.
review_fields = ['review','reviewtitle','reviewdate','username','rating']
attr_df.columns = review_fields
attr_df.shape

(7258, 5)

In [4]:
# The date and rating lists of a page can be longer than its review list
# (presumably the selectors also match elements that are not reviews -- TODO
# confirm). Keep only the trailing len(review) entries of each.
newdate = []
newrating = []

for review, date, rating in zip(attr_df.review,attr_df.reviewdate,attr_df.rating):

    # Clamp at zero: when a list is *shorter* than the review list the
    # difference is negative, and a negative slice start would silently drop
    # valid entries from the front instead of keeping the whole list.
    datediff = max(len(date) - len(review), 0)
    ratingdiff = max(len(rating) - len(review), 0)

    newdate.append(date[datediff:])
    newrating.append(rating[ratingdiff:])

attr_df.reviewdate = newdate
attr_df.rating = newrating

In [5]:
# Drop inconsistent length review pages?
# Record how many entries every scraped field holds on each page.
length_columns = [
    ('review', 'lenreviews'),
    ('reviewtitle', 'lenreviewtitle'),
    ('reviewdate', 'lenreviewdate'),
    ('username', 'lenusername'),
    ('rating', 'lenrating'),
]
for field, length_column in length_columns:
    attr_df[length_column] = [len(entries) for entries in attr_df[field]]

# Only the review and username counts are compared; pages where they differ
# are discarded and the helper length columns are dropped again.
same_length = attr_df.lenreviews == attr_df.lenusername
attr_df = attr_df[same_length][['review', 'reviewtitle', 'reviewdate', 'username','rating']]

In [6]:
def _explode_attraction_field(column, label):
    """Turn the per-page lists stored in attr_df[column] into one row per entry.

    The result has two columns: 'url' (the attr_df index value the entry came
    from) and `label`. Each stacked entry is indexed with [0], i.e. only its
    first item is kept (presumably every entry is a one-element list -- TODO
    confirm against the scraper output).
    """
    wide = attr_df.apply(lambda row: pd.Series(row[column]),axis=1)
    exploded = pd.DataFrame(wide.stack().reset_index(level=0, drop=False))
    exploded.columns = ['url', label]
    exploded[label] = [entry[0] for entry in exploded[label]]
    return exploded


allreviews = _explode_attraction_field('review', 'review')
alltitles = _explode_attraction_field('reviewtitle', 'title')
alldates = _explode_attraction_field('reviewdate', 'date')
allusers = _explode_attraction_field('username', 'user')
allratings = _explode_attraction_field('rating', 'rating')

In [7]:
# Show the (rows, columns) of each exploded frame side by side so their row
# counts can be compared before the frames are combined.
allreviews.shape,alltitles.shape,alldates.shape,allusers.shape,allratings.shape

((53383, 2), (53383, 2), (53383, 2), (53383, 2), (53383, 2))

In [8]:
# Assemble one flat review table by copying the value column of every exploded
# frame (plus the url) into a fresh frame, then renumber the rows 0..n-1.
attr = pd.DataFrame()
column_sources = [
    ('url', allreviews['url']),
    ('review', allreviews['review']),
    ('title', alltitles['title']),
    ('date', alldates['date']),
    ('username', allusers['user']),
    ('rating', allratings['rating']),
]
for column, values in column_sources:
    attr[column] = values
attr = attr.reset_index(drop=True)

In [9]:
# The url column holds two pieces joined by a literal '{}': the part before it
# stays in 'url', the part after it becomes the 'page' column.
split = []
for joined in attr.url:
    split.append(joined.split('{}'))

url_parts = []
page_parts = []
for pieces in split:
    url_parts.append(pieces[0])
    page_parts.append(pieces[1])

attr['url'] = url_parts
attr['page'] = page_parts

In [10]:
# Ratings arrive as a CSS class string; strip the fixed prefix, read the
# remaining digits as an integer and divide by 10.
bubble_prefix = 'ui_bubble_rating bubble_'
numeric_ratings = []
for css_class in attr.rating:
    numeric_ratings.append(int(css_class.replace(bubble_prefix,''))/10)
attr.rating = numeric_ratings

In [11]:
# (rows, columns) of the flat attraction review table.
attr.shape

(53383, 7)

In [316]:
# Persist the flat attraction review table in the working directory.
attr.to_pickle('attractions.pkl')

### 4. Read in Hostel Review JSONs

In [12]:
# Collect every per-page hostel review dump. os.listdir() returns names in
# arbitrary, filesystem-dependent order, so the list is sorted to keep the row
# order of hostel_df reproducible between runs and machines.
path_to_json = 'ds/metis/finalproject/all_reviews/hostels/'
json_files = sorted(
    os.path.join(path_to_json, pos_json)
    for pos_json in os.listdir(path_to_json)
    if pos_json.endswith('.json')
)

# Each file is read into its own frame; the frames are placed side by side and
# transposed, so every column of every file ends up as one row of hostel_df
# (presumably one row per scraped review page -- TODO confirm against the
# scraper output).
dataframes = [pd.read_json(file) for file in json_files]

hostel_df = pd.concat(dataframes,axis=1).T

In [13]:
# Give the five positional columns readable names, then show the frame size.
review_fields = ['review','reviewtitle','reviewdate','username','rating']
hostel_df.columns = review_fields
hostel_df.shape

(5796, 5)

In [14]:
# The date and rating lists of a page can be longer than its review list
# (presumably the selectors also match elements that are not reviews -- TODO
# confirm). Keep only the trailing len(review) entries of each.
newdate = []
newrating = []

for review, date, rating in zip(hostel_df.review,hostel_df.reviewdate,hostel_df.rating):

    # Clamp at zero: when a list is *shorter* than the review list the
    # difference is negative, and a negative slice start would silently drop
    # valid entries from the front instead of keeping the whole list.
    datediff = max(len(date) - len(review), 0)
    ratingdiff = max(len(rating) - len(review), 0)

    newdate.append(date[datediff:])
    newrating.append(rating[ratingdiff:])

hostel_df.reviewdate = newdate
hostel_df.rating = newrating

In [15]:
# Drop inconsistent length review pages?
# Record how many entries every scraped field holds on each page.
length_columns = [
    ('review', 'lenreviews'),
    ('reviewtitle', 'lenreviewtitle'),
    ('reviewdate', 'lenreviewdate'),
    ('username', 'lenusername'),
    ('rating', 'lenrating'),
]
for field, length_column in length_columns:
    hostel_df[length_column] = [len(entries) for entries in hostel_df[field]]

# Only the review and username counts are compared; pages where they differ
# are discarded and the helper length columns are dropped again.
same_length = hostel_df.lenreviews == hostel_df.lenusername
hostel_df = hostel_df[same_length][['review', 'reviewtitle', 'reviewdate', 'username','rating']]

In [16]:
def _explode_hostel_field(column, label):
    """Turn the per-page lists stored in hostel_df[column] into one row per entry.

    The result has two columns: 'url' (the hostel_df index value the entry came
    from) and `label`. Each stacked entry is indexed with [0], i.e. only its
    first item is kept (presumably every entry is a one-element list -- TODO
    confirm against the scraper output).
    """
    wide = hostel_df.apply(lambda row: pd.Series(row[column]),axis=1)
    exploded = pd.DataFrame(wide.stack().reset_index(level=0, drop=False))
    exploded.columns = ['url', label]
    exploded[label] = [entry[0] for entry in exploded[label]]
    return exploded


allreviews = _explode_hostel_field('review', 'review')
alltitles = _explode_hostel_field('reviewtitle', 'title')
alldates = _explode_hostel_field('reviewdate', 'date')
allusers = _explode_hostel_field('username', 'user')
allratings = _explode_hostel_field('rating', 'rating')

In [17]:
# Assemble one flat review table by copying the value column of every exploded
# frame (plus the url) into a fresh frame, then renumber the rows 0..n-1.
host = pd.DataFrame()
column_sources = [
    ('url', allreviews['url']),
    ('review', allreviews['review']),
    ('title', alltitles['title']),
    ('date', alldates['date']),
    ('username', allusers['user']),
    ('rating', allratings['rating']),
]
for column, values in column_sources:
    host[column] = values
host = host.reset_index(drop=True)

In [18]:
# The url column holds two pieces joined by a literal '{}'. The first piece,
# with the site prefix removed, stays in 'url'; the second becomes 'page'.
split = []
for joined in host.url:
    split.append(joined.split('{}'))

url_parts = []
page_parts = []
for pieces in split:
    url_parts.append(pieces[0].replace('https://www.tripadvisor.com/',''))
    page_parts.append(pieces[1])

host['url'] = url_parts
host['page'] = page_parts

In [19]:
# Ratings arrive as a CSS class string; strip the fixed prefix, read the
# remaining digits as an integer and divide by 10.
bubble_prefix = 'ui_bubble_rating bubble_'
numeric_ratings = []
for css_class in host.rating:
    numeric_ratings.append(int(css_class.replace(bubble_prefix,''))/10)
host.rating = numeric_ratings

In [20]:
# (rows, columns) of the flat hostel review table.
host.shape

(17731, 7)

In [326]:
# Persist the flat hostel review table in the working directory.
host.to_pickle('hostels.pkl')

### 5. Read in Attraction Info JSON

In [21]:
# Load the attraction metadata and relabel its columns by position.
# NOTE(review): the labels are assigned positionally, so they rely on the
# column order produced by read_json for this file.
attr_info_columns = [
    'address', 'categories', 'country', 'days', 'description', 'hoursopen',
    'locality', 'numreviews', 'overallrating', 'recstay', 'name', 'url',
]
attr_info = pd.read_json('ds/metis/finalproject/attractions.json')
attr_info.columns = attr_info_columns
attr_info.head(1)

Unnamed: 0,address,categories,country,days,description,hoursopen,locality,numreviews,overallrating,recstay,name,url
0,Museo Negret,"[Art Galleries, History Museums, Shopping, Mus...",Colombia,[],[],[],"Popayan,",11,4.5,[],Museo Negret,/Attraction_Review-g319824-d8855661-Reviews-Mu...


In [22]:
# Display the attraction metadata table.
# NOTE(review): this renders the whole frame; attr_info.head() would keep the
# saved output short.
attr_info

Unnamed: 0,address,categories,country,days,description,hoursopen,locality,numreviews,overallrating,recstay,name,url
0,Museo Negret,"[Art Galleries, History Museums, Shopping, Mus...",Colombia,[],[],[],"Popayan,",11,4.5,[],Museo Negret,/Attraction_Review-g319824-d8855661-Reviews-Mu...
1,El Mamey,"[Ancient Ruins, Historic Sites, Sights & Landm...",Colombia,[],[],[],470007,690,4.5,Suggested Duration: More than 3 hours,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...
2,Km 1.2 Via Leticia - Tarapaca,"[Nature & Parks, Zoos & Aquariums]",Colombia,[],[],[],"Leticia 910001,",13,5.0,Suggested Duration: 1-2 hours,Bioparque Etuena,/Attraction_Review-g317037-d9026205-Reviews-Bi...
3,Calle 40 SUR #30-67,"[Architectural Buildings, Sights & Landmarks]",Colombia,[],[],[],"Envigado,",12,5.0,Suggested Duration: 1-2 hours,La Casa de las Piedritas,/Attraction_Review-g1918553-d9730589-Reviews-L...
4,Carrera 7 con Calle 10,"[Architectural Buildings, Sights & Landmarks]",Colombia,[],[],[],"Bogota,",25,4.0,[],Palacio de San Carlos,/Attraction_Review-g294074-d590778-Reviews-Pal...
5,Plaza de la Trinidad en Getsemani,"[Churches & Cathedrals, Sights & Landmarks]",Colombia,[Mon - Fri],[],[8:00 am - 6:00 pm],"Cartagena,",506,4.5,[],Iglesia de la Trinidad,/Attraction_Review-g297476-d3215074-Reviews-Ig...
6,Calle 3 con Carrera 7,"[Cemeteries, Sights & Landmarks]",Colombia,[],[],[],"Barichara,",91,4.5,[],Cementerio Barichara,/Attraction_Review-g1178558-d7391976-Reviews-C...
7,Calle 17 No 2-43 Centro Historico,"[Gear Rentals, Scuba & Snorkeling, Boat Tours ...",Colombia,[Sun - Sat],Our diving center is located in the historic c...,[7:00 am - 8:00 pm],"Santa Marta 470006,",194,5.0,Suggested Duration: More than 3 hours,Santa Marta Dive and Adventure,/Attraction_Review-g297484-d6407294-Reviews-Sa...
8,Calle Gastelbondo,"[Boat Rentals, Boat Tours & Water Sports, Outd...",Colombia,[Sun - Sat],We offer the best private plans of boats and y...,[8:00 am - 10:00 pm],"Cartagena 130001,",283,5.0,Suggested Duration: More than 3 hours,Boats4U,/Attraction_Review-g297476-d7813050-Reviews-Bo...
9,Calle 52 #52-43,"[History Museums, Museums]",Colombia,"[Sun, Mon - Sat]",[],"[10:00 am - 4:30 pm, 10:00 am - 5:30 pm]","Medellin,",1518,4.5,[],Museo de Antioquia,/Attraction_Review-g297478-d650865-Reviews-Mus...


### 6. Read in Hostel Info JSON

In [23]:
# Load the hostel metadata and relabel its columns by position.
# NOTE(review): the labels are assigned positionally, so they rely on the
# column order produced by read_json for this file.
hostel_info_columns = [
    'address', 'avgprice','country', 'ext_address', 'locality', 'numreviews',
    'overallrating', 'name', 'url',
]
hostel_info = pd.read_json('ds/metis/finalproject/hostels_final.json')
hostel_info.columns = hostel_info_columns
hostel_info.head(1)

Unnamed: 0,address,avgprice,country,ext_address,locality,numreviews,overallrating,name,url
0,Playas Don Aire - Avenida Principal,$22 - $64 (Based on Average Rates for a Standa...,Colombia,[],"Palomino 446009,",406,4.5,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...


In [24]:
# Display the hostel metadata table.
# NOTE(review): this renders the whole frame; hostel_info.head() would keep the
# saved output short.
hostel_info

Unnamed: 0,address,avgprice,country,ext_address,locality,numreviews,overallrating,name,url
0,Playas Don Aire - Avenida Principal,$22 - $64 (Based on Average Rates for a Standa...,Colombia,[],"Palomino 446009,",406,4.5,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...
1,Avenida Los Estudiantes,$15 - $47 (Based on Average Rates for a Standa...,Colombia,"Carrera 15, 14 - 51","Riohacha 440002,",196,5.0,Happiness Hostel,Hotel_Review-g676527-d7856162-Reviews-Happines...
2,Calle 2 Oeste # 4-16 - San Antonio,$10 - $11 (Based on Average Rates for a Standa...,Colombia,[],"Cali 0000,",117,5.0,Hostal Encuentro,Hotel_Review-g297475-d6212268-Reviews-Hostal_E...
3,Carrera 6,$21 - $82 (Based on Average Rates for a Standa...,Colombia,[],"Palomino 446009,",252,4.5,PrimaLuna Beach Hostel,Hotel_Review-g3754359-d10003779-Reviews-PrimaL...
4,Calle 35 # 3-30,$21 - $60 (Based on Average Rates for a Standa...,Colombia,Plaza de Santo Domingo,"Cartagena 130001,",94,5.0,Bourbon St Hostal Boutique,Hotel_Review-g297476-d10357916-Reviews-Bourbon...
5,Calle 10d # 36-24,$10 - $36 (Based on Average Rates for a Standa...,Colombia,El Poblado,"Medellin 050021,",408,4.5,Saman Hostel Medellin,Hotel_Review-g297478-d1972387-Reviews-Saman_Ho...
6,Cr 9 # 9-06,$14 - $65 (Based on Average Rates for a Standa...,Colombia,[],"Salento 631020,",122,5.0,Coffee Tree Boutique Hostel,Hotel_Review-g1580963-d8548125-Reviews-Coffee_...
7,Barrio Los Almendros Manzana 4 Casa 3,$21 - $47 (Based on Average Rates for a Standa...,Colombia,[],"San Andres, San Andres Island,",421,4.5,Blue Almond Hostel,Hotel_Review-g3493965-d1925133-Reviews-Blue_Al...
8,Calle 14 #04-80,$14 - $84 (Based on Average Rates for a Standa...,Colombia,[],"Santa Marta 000000,",734,4.5,Masaya Hostel Santa Marta,Hotel_Review-g297484-d4813145-Reviews-Masaya_H...
9,Via Palestina KM 1.5 Finca La Serrana,$14 - $27 (Based on Average Rates for a Standa...,Colombia,[],"Salento,",460,4.5,La Serrana Eco Farm and Hostel,Hotel_Review-g1580963-d1762797-Reviews-La_Serr...


In [329]:
# Remove the fixed explanatory suffix from every price string; entries that
# are lists (no price text) are replaced by an empty string.
rate_note = ' (Based on Average Rates for a Standard Room)'
cleaned_prices = []
for price_text in hostel_info.avgprice:
    if type(price_text) != list:
        cleaned_prices.append(price_text.replace(rate_note,''))
    else:
        cleaned_prices.append('')
hostel_info.avgprice = cleaned_prices

In [330]:
# Split every "low - high" price string into its two ends.
minmaxprices = [x.split(' - ') for x in hostel_info.avgprice]


def _price_to_number(text):
    """Drop the dollar sign and thousands separators, then parse the number."""
    return pd.to_numeric(text.replace('$','').replace(',',''))


# Anything that does not split into exactly two parts gets NaN for both ends.
minprice = [_price_to_number(rng[0]) if len(rng) == 2 else np.nan for rng in minmaxprices]
maxprice = [_price_to_number(rng[1]) if len(rng) == 2 else np.nan for rng in minmaxprices]

hostel_info['minprice'] = minprice
hostel_info['maxprice'] = maxprice
# The text column is no longer needed once both numeric columns exist.
hostel_info.drop('avgprice',axis=1,inplace=True)

In [331]:
# Preview the first rows of the hostel metadata table.
hostel_info.head()

Unnamed: 0,address,country,ext_address,locality,numreviews,overallrating,name,url,minprice,maxprice
0,Playas Don Aire - Avenida Principal,Colombia,[],"Palomino 446009,",406,4.5,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,22.0,64.0
1,Avenida Los Estudiantes,Colombia,"Carrera 15, 14 - 51","Riohacha 440002,",196,5.0,Happiness Hostel,Hotel_Review-g676527-d7856162-Reviews-Happines...,15.0,47.0
2,Calle 2 Oeste # 4-16 - San Antonio,Colombia,[],"Cali 0000,",117,5.0,Hostal Encuentro,Hotel_Review-g297475-d6212268-Reviews-Hostal_E...,10.0,11.0
3,Carrera 6,Colombia,[],"Palomino 446009,",252,4.5,PrimaLuna Beach Hostel,Hotel_Review-g3754359-d10003779-Reviews-PrimaL...,21.0,82.0
4,Calle 35 # 3-30,Colombia,Plaza de Santo Domingo,"Cartagena 130001,",94,5.0,Bourbon St Hostal Boutique,Hotel_Review-g297476-d10357916-Reviews-Bourbon...,21.0,60.0


### 7. Combine Attraction Tables

In [332]:
# Attach every review row to the metadata of the attraction with the same url.
# The inner join keeps only urls present in both tables.
attraction_columns = [
    'name', 'url','categories', 'description',
    'address','locality','country',
    'days', 'hoursopen','recstay',
    'numreviews', 'overallrating',
    'title','date','review','rating','username','page',
]
attractions = pd.merge(attr_info, attr, how='inner', on='url')[attraction_columns]
attractions

Unnamed: 0,name,url,categories,description,address,locality,country,days,hoursopen,recstay,numreviews,overallrating,title,date,review,rating,username,page
0,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,Expotour,"August 29, 2017",We have been to the lost city trek with expoto...,4.0,textamoebe,1
1,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,Expotour and ask for Mathias for a guide,"August 28, 2017","To start off the journey to the lost city, I b...",5.0,Nirav H,1
2,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,Perfect beginning to Colombia Trip,"August 25, 2017",When I told my friends that I was traveling to...,5.0,jandreoni,1
3,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,"Must Do, Excellent","August 22, 2017","I did this Trek in August with Expotur, our Gu...",5.0,Shane S,1
4,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,Good company but beware of campsites,"August 19, 2017",We've just got back and Expotur have been real...,4.0,Chris O,1
5,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,4 days trek to the Lost City,"August 15, 2017",We really enjoyed our trek thanks to Ariel and...,5.0,Léa L,1
6,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,La Ciudad Perdida,"August 14, 2017","I recently did the lost city trek, 4 day trek ...",3.0,Amanda C,1
7,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,Amazing hike to the Lost City,"August 7, 2017",We did the trek to the Lost City from July 27 ...,5.0,ushavas,1
8,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,lost city hike colombia,"August 7, 2017",great hike! we went with expotur in a group of...,5.0,erinkimiller,1
9,Lost City,/Attraction_Review-g3860359-d315403-Reviews-Lo...,"[Ancient Ruins, Historic Sites, Sights & Landm...",[],El Mamey,470007,Colombia,[],[],Suggested Duration: More than 3 hours,690,4.5,4-day trek to the Lost City,"August 4, 2017",This trek was amazing! I chose the 3-night/4-d...,5.0,ToriFGraham,1


In [333]:
# Make sure the overall rating column is numeric.
attractions['overallrating'] = pd.to_numeric(attractions['overallrating'])

In [334]:
# Replace list-valued cells in every column: an empty list becomes '' and a
# non-empty list is reduced to its first element. Other values are kept.
for col in attractions.columns:
    newcol = [
        (x[0] if x else '') if isinstance(x,list) else x
        for x in attractions[col]
    ]
    attractions[col] = newcol

In [335]:
# Persist the merged attraction table in the working directory.
attractions.to_pickle('attractions_all.pkl')

### 8. Combine Hostel Tables

In [336]:
# Attach every review row to the metadata of the hostel with the same url.
# The inner join keeps only urls present in both tables.
hostel_columns = [
    'name', 'url',
    'address','ext_address','locality','country',
    'numreviews','overallrating','minprice','maxprice',
    'title','date','review','rating','username','page',
]
hostels = pd.merge(hostel_info, host, how='inner', on='url')[hostel_columns]

In [337]:
# Display the merged hostel table.
# NOTE(review): this renders the whole frame; hostels.head() would keep the
# saved output short.
hostels

Unnamed: 0,name,url,address,ext_address,locality,country,numreviews,overallrating,minprice,maxprice,title,date,review,rating,username,page
0,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,"clean place, no soul...","February 28, 2016","the hostel is clean, the place is nice and the...",3.0,Ivan Z,12
1,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Best of the Best,"February 27, 2016",Best place and people in palomino. After dew d...,5.0,Mat P,12
2,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Nice looking hostel in Palomino close to the b...,"February 22, 2016",Tiki hut has dorms and private huts who look v...,4.0,Laura I,12
3,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Chilled hostal close to the beach,"February 11, 2016","Chilled environment, helpful staff. We stayed ...",5.0,Liam J,12
4,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Lovely place to recharge,"February 7, 2016",We came to Tiki Hut after trekking through Par...,5.0,Ada J,12
5,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,"Good place near beach, nice staff, good food,","February 2, 2016","I stayed 3 nights. This is a nice hostel, quie...",4.0,LBopreis,13
6,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Just great!,"February 1, 2016","We stayed in the Camarones doorms, which is fo...",5.0,Camila F,13
7,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Incredible hostel,"January 22, 2016",Been traveling around Colombia for a month now...,5.0,Vicente O,13
8,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,"Nice Hostal - Great staff, good breakfast","January 17, 2016",We had very nice two nights here and can reall...,5.0,Hannes182,13
9,The Tiki Hut Hostel Palomino,Hotel_Review-g3754359-d6403091-Reviews-The_Tik...,Playas Don Aire - Avenida Principal,[],"Palomino 446009,",Colombia,406,4.5,22.0,64.0,Great service combined with even better food.,"January 9, 2016",The only downside was that I was only able to ...,5.0,Daniel G,13


In [338]:
# Make sure the overall rating column is numeric.
hostels['overallrating'] = pd.to_numeric(hostels['overallrating'])

In [339]:
# Replace list-valued cells in every column: an empty list becomes '' and a
# non-empty list is reduced to its first element. Other values are kept.
for col in hostels.columns:
    newcol = [
        (x[0] if x else '') if isinstance(x,list) else x
        for x in hostels[col]
    ]
    hostels[col] = newcol

In [340]:
# Persist the merged hostel table in the working directory.
hostels.to_pickle('hostels_all.pkl')

### 9. Bring in User Reviews

In [2]:
def getReviewLinks(username):
    """Collect the review links shown on a TripAdvisor member profile.

    Opens ``https://www.tripadvisor.com/members/<username>`` in a new Firefox
    session, gathers the ``href`` of every ``a.cs-review-title`` element and
    keeps clicking the ``cs-paginate-next`` button until that button's class
    attribute reads ``disabled``, the button is missing, or the click fails.

    Parameters
    ----------
    username : str
        Member name appended to the profile URL.

    Returns
    -------
    list
        De-duplicated links. The order is not meaningful because the links
        pass through a ``set``.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no review-location link becomes visible within 3 seconds.
    """
    next_button_xpath = '//button[@id="cs-paginate-next"]'
    review_link_xpath = '//a[@class="cs-review-title"]'

    driver = webdriver.Firefox()
    links = []
    # try/finally guarantees the browser is shut down even when the wait times
    # out; previously such an error left a Firefox process running.
    try:
        driver.get('https://www.tripadvisor.com/members/'+username)

        status = ''
        while status != 'disabled':
            WebDriverWait(driver,3).until(EC.visibility_of_all_elements_located((By.XPATH,'//div[@class="cs-review-location"]/a')))

            # A profile without a pagination button used to raise IndexError
            # here; now its links are still collected and the loop stops.
            next_buttons = driver.find_elements(By.XPATH, next_button_xpath)
            if next_buttons:
                status = next_buttons[0].get_attribute('class')
            try:
                links += [x.get_property('href') for x in driver.find_elements(By.XPATH, review_link_xpath)]
                if not next_buttons:
                    break
                next_buttons[0].click()
            except Exception:
                # Best effort: stop paging and return what was collected.
                break
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()

    return list(set(links))

In [3]:
def _first_review_element(driver, xpath):
    """Return the first element matching ``xpath``; raise LookupError if the page has none."""
    matches = driver.find_elements(By.XPATH, xpath)
    if not matches:
        raise LookupError('no element matches ' + xpath)
    return matches[0]


def _open_review_page(url):
    """Load ``url`` in a new Firefox session, retrying once with a fresh browser on failure."""
    driver = None
    try:
        driver = webdriver.Firefox()
        driver.get(url)
        return driver
    except Exception:
        # Only shut down a browser that was actually created; the first
        # attempt may have failed before `driver` was assigned.
        if driver is not None:
            driver.quit()

    driver = webdriver.Firefox()
    try:
        driver.get(url)
    except Exception:
        driver.quit()
        raise
    return driver


def getReviews(username,verbose=False,
               output_dir='/Users/marc/ds/metis/finalproject/all_reviews/user_reviews/'):
    """Scrape every review written by ``username`` and save each one as JSON.

    The review links come from ``getReviewLinks(username)``. Every link is
    opened in its own Firefox session; the review text, active navigation
    category, title, date, username and rating class are read from the page and
    written to ``<output_dir>/<username>_<i>_1.json`` as
    ``{url: [review, category, title, date, username, rating]}``.

    A page that lacks a required element (or fails in any other way while being
    scraped) is skipped, and the reason is printed instead of being silently
    swallowed. The browser is always shut down, whether or not scraping
    succeeded.

    Parameters
    ----------
    username : str
        TripAdvisor member name.
    verbose : bool, optional
        When true, print the scraped fields of every page.
    output_dir : str, optional
        Directory receiving the JSON files. Defaults to the location the
        notebook reads the user reviews back from.

    Returns
    -------
    None
    """
    t0 = time.time()

    userLinks = getReviewLinks(username)

    for i,url in enumerate(userLinks):

        print("scraping: ",url)
        # Every link is a single review page, so the page counter is always 1;
        # it is kept because it is part of the output file name.
        pagenum = 1

        driver = _open_review_page(url)
        try:
            reviewpages = {}

            t1 = time.time()

            # find and click "more" button to expand review text
            try:
                driver.find_element(By.XPATH, '//span[@class="taLnk ulBlueLinks"]').click()
            except Exception:
                print("No 'more' button found.")

            # Required fields: a missing element raises LookupError and the
            # page is skipped. The old fallbacks called .text on a list and
            # therefore always failed with AttributeError.
            reviews = _first_review_element(driver, '//div[@class="entry"]//p').text

            # Optional field: fall back to '' when no active category tab
            # can be identified.
            try:
                nav_links = driver.find_elements(By.XPATH, '//li[@class="nav-sub-item"]/a')
                cats = [x.get_attribute('class') for x in nav_links]
                activecat = cats.index(' global-nav-link ui_tab active')
                categories = nav_links[activecat].get_attribute('data-tracking-label')
            except Exception:
                categories = ''

            reviewtitles = _first_review_element(driver, '//div[@class="quote"]').text

            # The date span appears under one of two class values.
            try:
                date_element = _first_review_element(driver, '//span[@class="ratingDate"]')
            except LookupError:
                date_element = _first_review_element(driver, '//span[@class="ratingDate relativeDate"]')
            reviewdates = date_element.get_attribute('content')

            usernames = _first_review_element(driver, '//div[@class="username mo"]').text
            ratings = _first_review_element(
                driver, '//div[@class="rating reviewItemInline"]/span[1]').get_attribute("class")

            # store page in dictionary
            reviewpages[url] = [reviews,categories,reviewtitles,reviewdates,usernames,ratings]

            if verbose:
                print(reviewpages[url])

            out_name = '{0}.json'.format(username+'_'+str(i)+'_'+str(pagenum))
            with open(os.path.join(output_dir, out_name), 'w') as fp:
                json.dump(reviewpages, fp)

            print("Review #",pagenum)

            t2 = time.time()
            print(np.round(t2-t1,3),"sec")

        except Exception as err:
            # Keep going with the next link, but say which page was lost and why.
            print("Skipping",url,"-",repr(err))

        finally:
            driver.quit()

    tf = time.time()
    print("Total run time:",np.round(tf-t0,3),"sec elapsed")

In [4]:
# Scrape all reviews of this member: one Firefox session is opened per review
# link and one JSON file is written per successfully scraped review.
getReviews('johanp158')

scraping:  https://www.tripadvisor.com/ShowUserReviews-g295424-d1436120-r502881400-Bur_Dubai_Abra_Dock-Dubai_Emirate_of_Dubai.html
Review # 1
2.782 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g297914-d1156694-r505177029-Joe_s_Steakhouse-Khao_Lak_Phang_Nga_Province.html
Review # 1
2.225 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d12631460-r503564164-Barril-Copenhagen_Zealand.html
Review # 1
2.301 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d7726327-r507928160-Ski_Portillo_Centro_de_Esqui-Santiago_Santiago_Metropolitan_Region.html
Review # 1
1.882 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d548642-r503339904-Oresund_Bridge-Copenhagen_Zealand.html
Review # 1
3.47 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d245024-r503870930-The_Little_Mermaid_Den_Lille_Havfrue-Copenhagen_Zealand.html
Review # 1
2.37 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g303506-d2352254-r50430

Review # 1
3.703 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g187147-d314450-r506586337-Observatoire_Panoramique_de_la_Tour_Montparnasse-Paris_Ile_de_France.html
Review # 1
3.553 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d7035462-r503335273-Kayak_Bar-Copenhagen_Zealand.html
Review # 1
1.846 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g1720850-d5657099-r503190923-Padaria_Senhor_Seu_Bonzinho-Cairu_State_of_Bahia.html
Review # 1
3.606 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d7359398-r508853653-Fritz-Santiago_Santiago_Metropolitan_Region.html
Review # 1
3.804 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d314523-r507930238-Barrio_Bellavista-Santiago_Santiago_Metropolitan_Region.html
Review # 1
4.57 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g303506-d311272-r504293392-Theatro_Municipal_do_Rio_de_Janeiro-Rio_de_Janeiro_State_of_Rio_de_Janeiro.html
Review # 1
2.917 sec
scrapi

Review # 1
1.02 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d3609859-r508852077-P_F_Chang_s-Santiago_Santiago_Metropolitan_Region.html
Review # 1
2.709 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g297914-d3656466-r505173557-Amici_Italian_Bistro-Khao_Lak_Phang_Nga_Province.html
Review # 1
2.042 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g303272-d312079-r503000869-Pelourinho-Salvador_State_of_Bahia.html
Review # 1
1.968 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g303319-d2408108-r502996835-Museu_Vale-Vila_Velha_State_of_Espirito_Santo.html
Review # 1
2.126 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g187147-d246750-r506572420-Trocadero-Paris_Ile_de_France.html
Review # 1
2.149 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d207301-r503862977-Amalienborg-Copenhagen_Zealand.html
Review # 1
2.232 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g488168-d8805439-r503333173-Praia

Review # 1
3.805 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g303319-d10807919-r502998961-Museu_Garoto-Vila_Velha_State_of_Espirito_Santo.html
Review # 1
3.88 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294265-d8634325-r502858181-Supertree_Grove-Singapore.html
Review # 1
5.473 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d197738-r503337003-Tivoli_Gardens-Copenhagen_Zealand.html
Review # 1
3.455 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g187793-d191109-r505149760-St_Peter_s_Square_Piazza_San_Pietro-Vatican_City_Lazio.html
Review # 1
2.318 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g187793-d4358077-r505162090-Museo_del_Tesoro_della_Basilica_di_San_Pietro-Vatican_City_Lazio.html
Review # 1
4.815 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d4267183-r507927315-Teatro_Municipal_Las_Condes-Santiago_Santiago_Metropolitan_Region.html
Review # 1
4.276 sec
scraping:  https://www.tripadviso

Review # 1
2.226 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g187793-d190130-r505159029-Sistine_Chapel-Vatican_City_Lazio.html
Review # 1
1.676 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g303506-d311247-r504292320-Sugarloaf_Mountain-Rio_de_Janeiro_State_of_Rio_de_Janeiro.html
Review # 1
1.786 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g187791-d190121-r502924099-Piazza_Navona-Rome_Lazio.html
Review # 1
2.017 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d6953921-r508467865-Ruby_Tuesday-Santiago_Santiago_Metropolitan_Region.html
Review # 1
2.021 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g189541-d4606443-r503871567-Changing_of_her_Majesty_s_guard_at_Amalienborg_Castle-Copenhagen_Zealand.html
Review # 1
1.81 sec
scraping:  https://www.tripadvisor.com/ShowUserReviews-g294305-d314481-r507925332-Cerro_San_Cristobal-Santiago_Santiago_Metropolitan_Region.html
Review # 1
1.597 sec
scraping:  https://www.tripadvi

In [341]:
# getReviews('Carriemybags')

In [None]:
# getReviews('mofin')

In [5]:
# Collect every per-review JSON file written by getReviews. os.listdir()
# returns names in arbitrary, filesystem-dependent order, so the list is
# sorted to keep the row order of user_reviews reproducible between runs.
path_to_json = '/Users/marc/ds/metis/finalproject/all_reviews/user_reviews/'
json_files = sorted(
    os.path.join(path_to_json, pos_json)
    for pos_json in os.listdir(path_to_json)
    if pos_json.endswith('.json')
)

# Each file maps one review url to its six scraped fields; after the
# column-wise concat and transpose every url is one row.
dataframes = [pd.read_json(file) for file in json_files]

user_reviews = pd.concat(dataframes,axis=1).T
user_reviews.columns = ['review','category','title','date','username','rating']

In [6]:
# Ratings arrive as a CSS class string; strip the fixed prefix, read the
# remaining digits as an integer and divide by 10.
bubble_prefix = 'ui_bubble_rating bubble_'
numeric_ratings = []
for css_class in user_reviews.rating:
    numeric_ratings.append(int(css_class.replace(bubble_prefix,''))/10)
user_reviews.rating = numeric_ratings

In [7]:
# Show only the reviews whose category is 'attractions'.
is_attraction = user_reviews['category'] == 'attractions'
user_reviews[is_attraction]

Unnamed: 0,review,category,title,date,username,rating
https://www.tripadvisor.com/ShowUserReviews-g188666-d318644-r494124916-STAM_Ghent_City_Museum-Ghent_East_Flanders_Province.html,"Finally made it to the STAM, Lots of neat stuf...",attractions,“The History of Ghent-a fine Museum”,2017-06-18,CFEIII,5.0
https://www.tripadvisor.com/ShowUserReviews-g298442-d311728-r444785532-Lake_Titicaca-Puno_Puno_Region.html,Seeing the Lake from a boat is the best way to...,attractions,“a ride out to the islands was an interesting ...,2016-12-17,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g294314-d553981-r440472782-Q_enqo-Cusco_Cusco_Region.html,This site was a ceremonial site for the elite ...,attractions,"“Imagine this place covered in gold, sacred to...",2016-11-27,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g297476-d555214-r469846913-Walled_City_of_Cartagena-Cartagena_Cartagena_District_Bolivar_Department.html,"Cartagena has to be one of my favorite cities,...",attractions,"“an incredible city to see, architechure, hist...",2017-03-24,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g274951-d6826153-r508596250-Vilnius_with_Locals-Vilnius_Vilnius_County.html,I took the local tour with Leva. I thought she...,attractions,“Great Local Tour with Leva”,,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g676524-d3432740-r464791017-Archaeological_Site_El_Infiernito-Villa_de_Leyva_Boyaca_Department.html,The ancient Muiscas used this ceremonial site ...,attractions,“Interesting solar observatory and religious s...,2017-03-05,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g503900-d218248-r487853166-Deal_Castle-Deal_Kent_England.html,I visited this castle to get a first hand look...,attractions,"“A beautiful castle, well worthy of a visit, e...",2017-05-25,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g188666-d7261538-r493890150-Vrijdagmarkt-Ghent_East_Flanders_Province.html,"This is an interesting area to visit, with a m...",attractions,"“A Market square in the morning, a beer drinke...",2017-06-17,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g294074-d2556747-r467368362-Museo_Historico_de_la_Policia-Bogota.html,"If you have a bit of time while in Bogota, dow...",attractions,“Interesting museum with Pablo Escobar Capture...,2017-03-14,CFEIII,4.0
https://www.tripadvisor.com/ShowUserReviews-g274958-d7728868-r512113273-Freedom_Square-Tallinn_Harju_County.html,"Freedom square was on the way to my hostel, so...",attractions,"“Classic square, lots of interesting stuff in ...",,CFEIII,4.0


In [8]:
# Persist the user review table in the working directory.
user_reviews.to_pickle('userreviews.pkl')

In [9]:
## aggregate user reviews
# Build one text document per user: userreviews[k] is the concatenation of all
# reviews written by usernum[k].
userreviews = []
usernum = []
for username in user_reviews.username.unique():
    own_reviews = user_reviews[user_reviews.username==username].review
    # Join with a space. str.cat() without a separator glues the last word of
    # one review to the first word of the next, creating tokens that appear in
    # neither review.
    userreviews.append(own_reviews.str.cat(sep=' '))
    usernum.append(username)