## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
import csv
import pandas as pd



In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0'
}

## Search companies with over 10k reviews

In [3]:
base_url_over_10k = "https://www.trustpilot.com/categories/travel_agency?sort=reviews_count&page="

companies = []

page = 1
while True:
    url = f"{base_url_over_10k}{page}"
    print(f"Scraping page {page}...")
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to load page {page}")
        break

    soup = BeautifulSoup(response.text, 'lxml')
    cards = soup.find_all("a", attrs={"name": "business-unit-card"})

    if not cards:
        print("No more companies found.")
        break

    stop = False
    for card in cards:
        try:
            name = card.find("p", class_="CDS_Typography_heading-xs__bedfe1").get_text(strip=True)
            website = card.find("p", class_="styles_websiteUrlDisplayed__lSw1A").get_text(strip=True)

            reviews_text = card.find("p", class_="styles_ratingText__A2dmB")
            if not reviews_text:
                continue
            review_count_text = reviews_text.find_all("span")[-1].get_text(strip=True)
            review_count = int(review_count_text.replace(",", ""))

            if review_count >= 10000:
                companies.append({
                    "name": name,
                    "reviews": review_count,
                    "website": website
                })
            else:
                stop = True
                break
        except Exception as e:
            print(f"Error parsing company card: {e}")
            continue

    if stop:
        print("Found company with fewer than 10,000 reviews. Stopping.")
        break

    page += 1
    time.sleep(1)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Found company with fewer than 10,000 reviews. Stopping.


### Output the result

In [4]:

for i, c in enumerate(companies, 1):
    print(f"{i}. {c['name']} – {c['reviews']} reviews – {c['website']}")

print(f"\nTotal companies with over 10,000 reviews: {len(companies)}")

1. Viator.com – 260465 reviews – www.viator.com
2. JustFly – 194168 reviews – justfly.com
3. Vegas.com – 163422 reviews – www.vegas.com
4. Allianz Partners USA – 126504 reviews – www.allianztravelinsurance.com
5. Vrbo – 117752 reviews – www.vrbo.com
6. FlightHub – 110820 reviews – flighthub.com
7. ASAP Tickets – 97066 reviews – www.asaptickets.com
8. Priceline – 95359 reviews – www.priceline.com
9. Way – 75701 reviews – www.way.com
10. CheapFareGuru – 70924 reviews – cheapfareguru.com
11. AirTkt – 70488 reviews – airtkt.com
12. SmartFares – 59267 reviews – www.smartfares.com
13. Guest Reservations – 44890 reviews – guestreservations.com
14. Ship Sticks – 43361 reviews – shipsticks.com
15. Reservation Counter – 41936 reviews – reservationcounter.com
16. Headout – 39184 reviews – headout.com
17. parksleepfly.com – 39026 reviews – parksleepfly.com
18. AARDY – 35295 reviews – aardy.com
19. Global Airport Parking – 30278 reviews – globalairportparking.com
20. CheapOair.com – 26214 reviews –

## Get the data

### Function for scraping the review data from the page of a company

In [5]:
def extract_reviews(soup):
    articles = soup.find_all('article', attrs={"data-service-review-card-paper": True})
    reviews = []

    for article in articles:
        try:
            name = article.find('span', attrs={"data-consumer-name-typography": True}).get_text(strip=True)
        except:
            name = None

        try:
            country = article.find('span', attrs={"data-consumer-country-typography": True}).get_text(strip=True)
        except:
            country = None

        try:
            rating_tag = article.find('div', attrs={"data-service-review-rating": True})
            rating = int(rating_tag["data-service-review-rating"])
        except:
            rating = None

        try:
            title = article.find('h2', attrs={"data-service-review-title-typography": True}).get_text(strip=True)
        except:
            title = None

        try:
            text = article.find('p', attrs={"data-service-review-text-typography": True}).get_text(strip=True)
            if not text:
                continue  # Skip tiles with no real review
        except:
            continue  # Skip if no review text

        try:
            raw_text = article.find('p', attrs={"data-service-review-date-of-experience-typography": True}).get_text(strip=True)
            if raw_text.startswith("Date of experience:"):
                raw_date = raw_text.replace("Date of experience:", "").strip()
                date = datetime.strptime(raw_date, "%B %d, %Y").date()
            else:
                date = None
        except Exception:
            date = None

        try:
            company_reply = article.find('p', attrs={"data-service-review-business-reply-text-typography": True})
            has_reply = 1 if company_reply else 0
        except:
            has_reply = 0

        reviews.append({
            "name": name,
            "country": country,
            "rating": rating,
            "title": title,
            "text": text,
            "date_of_experience": date,
            "has_reply": has_reply
        })

    return reviews

### Go through the companies

In [None]:
all_reviews = []

# Limit the number of companies to scrape
max_companies = 50
# Create a list of company URLs
company_links = [company['website'] for company in companies]

# Loop through each company and scrape reviews
for i, company_url in enumerate(company_links):
    if i >= max_companies:
        break
    # Loop through the review pages
    # page = 1
    #while True:
    for i in range(1, 5):
        # Construct the URL for the review page
        url = f"https://www.trustpilot.com/review/{company_url}?page={page}"
        # Get the page content
        response = requests.get(url, headers=headers)

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to fetch page {page} for {company_url}")
            break
        
        # Parse the page content
        soup = BeautifulSoup(response.content, 'lxml')
        # Extract reviews from the page with the function
        reviews = extract_reviews(soup)
        if not reviews:
            break
        # Append the reviews to the all_reviews list
        all_reviews.extend(reviews)
        time.sleep(2)

### Save to CSV

In [None]:
# Save reviews to CSV
csv_file = 'trustpilot_reviews.csv'

if all_reviews:
    # Open the CSV file in write mode
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        # Write as a dictionary
        writer = csv.DictWriter(f, fieldnames=all_reviews[0].keys())
        writer.writeheader()
        writer.writerows(all_reviews)
    print(f"\nSaved {len(all_reviews)} reviews to '{csv_file}'")
else:
    print("No reviews found to save.")


Saved 1070 reviews to 'trustpilot_reviews.csv'


## Display DF

In [None]:
# Path to CSV file
csv_file = 'trustpilot_reviews.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Display the first rows
df.head(25)


Unnamed: 0,name,country,rating,title,text,date_of_experience,has_reply
0,Tricia Guthrie,AU,4,So far my experience with using Viator…,So far my experience with using Viator has bee...,2025-05-09,0
1,Helen,GB,5,An interesting tour with an excellent…,An interesting tour with an excellent guide.,2025-05-11,0
2,Enis Pasha,BG,5,Great organization,The organization was great. The guide was very...,2025-05-12,0
3,Terry Hoffman,DE,5,Easy,Easy site to work with,2025-05-07,0
4,JIm Hanna,NZ,4,An unforgettable experience…,This was a great trip in almost every respect....,2025-05-12,0
5,Mark Rakoczy,US,5,Eiffel Tower visit,We enjoyed our visit to Eiffel Tower,2025-05-08,0
6,Sarah,GB,5,Our rour guide was very informative,"Our rour guide was very informative, we had ve...",2025-05-08,0
7,Tonya Peralta,US,5,Viator is easy to navigate,"Viator is easy to navigate, has a very fair ca...",2025-05-12,0
8,Laura,ES,5,Great experience and lots to see in a short time,Really nice trip. The guide was super nice and...,2025-05-17,0
9,Howard Heidenberg,AT,1,Could not contact anyone to change…,Could not contact anyone to change plans. Lost...,2025-05-12,0


#### Calculate the replies of companies to reviews (just debugging)

In [None]:
df['has_reply'].sum()

317