In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time

In [17]:
def trustpilot_scraper(PATH: str, n_pages, delay=2, timeout=30):
    """Scrape paginated Trustpilot reviews into a tidy DataFrame.

    Each page ``<PATH>?page=<k>`` is downloaded, the JSON stored in its
    ``__NEXT_DATA__`` script tag is parsed, and every entry under
    ``props.pageProps.reviews`` becomes one row.

    Parameters
    ----------
    PATH : str
        Base URL of the review listing, without the ``?page=`` suffix.
    n_pages : int
        Maximum number of pages to request, starting at page 1.
    delay : float, default 2
        Seconds to wait between two consecutive requests.
    timeout : float, default 30
        Per-request timeout in seconds, forwarded to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (``YYYY-MM-DD`` string), ``Author``, ``Body``,
        ``Heading``, ``Rating`` and ``Location``, sorted by ``Date`` with
        exact duplicate rows removed and a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status (a 404 after the first page
        only stops pagination, see below).
    ValueError
        If a page does not contain a parsable ``__NEXT_DATA__`` review payload.
    """
    # Local import: `warnings` is not among the notebook's top-level imports.
    import warnings

    columns = ["Date", "Author", "Body", "Heading", "Rating", "Location"]
    records = []

    for page_number in range(1, n_pages + 1):
        # Pause between requests only; sleeping after the last one is wasted time.
        if page_number > 1:
            time.sleep(delay)

        url = "{}?page={}".format(PATH, page_number)
        req = requests.get(url, timeout=timeout)

        # Assumption: a 404 on a later page means we ran past the last page.
        # Keep what was collected instead of discarding it with an exception.
        if req.status_code == 404 and page_number > 1:
            warnings.warn(
                "Stopped at page {}: {} returned 404".format(page_number, url),
                stacklevel=2,
            )
            break
        req.raise_for_status()

        reviews = _extract_reviews(req.text, url)
        if not reviews:
            # No reviews on this page, so further pages are not requested.
            break
        records.extend(_review_to_record(review) for review in reviews)

    # A stable sort keeps the scrape order within one day, which makes
    # keep="first" deterministic. Duplicates are matched on the whole row so
    # that different reviewers who wrote the same text are both kept.
    return (
        pd.DataFrame(records, columns=columns)
        .sort_values(by="Date", kind="mergesort")
        .drop_duplicates(keep="first")
        .reset_index(drop=True)
    )


def _extract_reviews(html, url):
    """Return the raw review entries embedded in one page's __NEXT_DATA__ JSON."""
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find("script", id="__NEXT_DATA__")
    if tag is None or not tag.string:
        raise ValueError("No __NEXT_DATA__ payload found at {}".format(url))

    payload = json.loads(tag.string)
    try:
        return payload["props"]["pageProps"]["reviews"]
    except (KeyError, TypeError) as exc:
        raise ValueError(
            "Unexpected __NEXT_DATA__ layout at {}".format(url)
        ) from exc


def _review_to_record(review):
    """Flatten one raw review dict into a row, using None for missing fields."""
    consumer = review.get("consumer") or {}
    dates = review.get("dates") or {}
    # errors="coerce" turns an unparsable timestamp into NaT instead of raising.
    published = pd.to_datetime(dates.get("publishedDate"), errors="coerce")
    return {
        "Date": None if pd.isna(published) else published.strftime("%Y-%m-%d"),
        "Author": consumer.get("displayName"),
        "Body": review.get("text"),
        "Heading": review.get("title"),
        "Rating": review.get("rating"),
        "Location": consumer.get("countryCode"),
    }




In [21]:
# Scrape up to 100 pages of reviews from the TUI UK Trustpilot listing.
df = trustpilot_scraper(
    PATH="https://uk.trustpilot.com/review/www.tui.co.uk",
    n_pages=100,
)

In [22]:
# Show the scraped reviews; the rendered table below is truncated to its first and last rows.
df

Unnamed: 0,Date,Author,Body,Heading,Rating,Location
0,2020-07-06,Patricia Palmer,Think they need to contact people about all th...,Think they need to contact people about…,1,GB
1,2020-07-06,Andy Tims,Just atrocious lack of customer service. They ...,Just atrocious lack of customer service,1,GB
2,2020-07-06,Alexis,If I could do no stars I would.\nHad email to ...,If I could do no stars I would.,1,GB
3,2020-07-06,Peter Kerr,"Tui are totally useless, they are not connecta...",Tui are totally useless,1,GB
4,2020-07-06,Charlene,"Shocking customer service, not even a courtesy...",Incompetent,1,GB
...,...,...,...,...,...,...
1994,2021-11-03,Beverley,Always very helpful.,Always very helpful.,5,GB
1995,2021-11-03,Mr Johnathan Gay,I was on hold for over half an hour and then g...,I was on hold for over half an hour and…Not Happy,1,GB
1996,2021-11-03,Joanne,Conflicting advice from different advisors reg...,Conflicting advice from different…,2,GB
1997,2021-11-03,John richards,My Holliday was for may 2020 this was cancelle...,My Holliday was for may 2020 this was…,1,GB


In [23]:
# Save the scraped reviews as Excel and CSV files in the working directory.
# index=False keeps the auto-generated 0..n-1 row index out of the files, so
# reloading them does not produce a spurious "Unnamed: 0" column.
df.to_excel("Tui_TrustPilot.xlsx", index=False)
df.to_csv("Tui_TrustPilot.csv", index=False)