## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
import csv
import pandas as pd



In [2]:
# HTTP headers passed to requests.get; only a browser-like User-Agent is set
headers = {'User-Agent': 'Mozilla/5.0'}

## Search companies with over 10k reviews

In [None]:
# Base URL for the Trustpilot travel agency category, sorted by review count, paginated
base_url_over_10k = "https://www.trustpilot.com/categories/travel_agency?sort=reviews_count&page="

# Minimum number of reviews a company must have to be collected
MIN_REVIEWS = 10_000

# Seconds to wait for a response before giving up. requests applies no timeout
# by default, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT = 30

# This list will store companies that meet the MIN_REVIEWS threshold
companies = []

# Start from the first page
page = 1

while True:
    url = f"{base_url_over_10k}{page}"
    print(f"Scraping page {page}...")

    # Send GET request to the page with headers; a network-level failure
    # (DNS error, timeout, dropped connection) ends the crawl like a bad status does
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Failed to load page {page}: {e}")
        break

    # Exit loop if the page fails to load
    if response.status_code != 200:
        print(f"Failed to load page {page}")
        break

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract all company cards (each one is an <a> element with name='business-unit-card')
    cards = soup.find_all("a", attrs={"name": "business-unit-card"})

    # If no company cards found, we've reached the end
    if not cards:
        print("No more companies found.")
        break

    # Flag to determine whether we should stop after this page
    stop = False

    for card in cards:
        # Only the parsing itself is guarded, so an unexpected error in the
        # bookkeeping below is not silently reported as a "parsing" problem.
        try:
            # Extract the company name
            name = card.find("p", class_="CDS_Typography_heading-xs__bedfe1").get_text(strip=True)

            # Extract the company website
            website = card.find("p", class_="styles_websiteUrlDisplayed__lSw1A").get_text(strip=True)

            # Locate the tag containing the review count
            reviews_text = card.find("p", class_="styles_ratingText__A2dmB")
            if reviews_text is None:
                continue  # Skip if missing

            # Get the last <span> inside the ratingText block (it contains review count)
            review_count_text = reviews_text.find_all("span")[-1].get_text(strip=True)

            # Convert string (e.g., "12,345") to integer
            review_count = int(review_count_text.replace(",", ""))
        except (AttributeError, IndexError, ValueError) as e:
            # AttributeError: an expected tag is missing (find returned None)
            # IndexError: the rating block has no <span>
            # ValueError: the review count is not a plain number
            print(f"Error parsing company card: {e}")
            continue

        if review_count < MIN_REVIEWS:
            # The listing is requested sorted by review count, so once a company
            # falls below the threshold the rest are assumed to be smaller too
            stop = True
            break

        companies.append({
            "name": name,
            "reviews": review_count,
            "website": website
        })

    if stop:
        print(f"Found company with fewer than {MIN_REVIEWS:,} reviews. Stopping.")
        break

    # Go to next page
    page += 1

    # Delay to avoid being blocked (can be lowered, but 1s is safe)
    time.sleep(1)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Found company with fewer than 10,000 reviews. Stopping.


### Output the result

In [None]:
# One numbered line per collected company: position, name, review count, website
for i, c in enumerate(companies, start=1):
    line = f"{i}. {c['name']} – {c['reviews']} reviews – {c['website']}"
    print(line)

# Summary line with the number of companies that were collected
summary = f"\nTotal companies with over 10,000 reviews: {len(companies)}"
print(summary)

1. Viator.com – 260646 reviews – www.viator.com
2. JustFly – 194180 reviews – justfly.com
3. Vegas.com – 163448 reviews – www.vegas.com
4. Allianz Partners USA – 126680 reviews – www.allianztravelinsurance.com
5. Vrbo – 117753 reviews – www.vrbo.com
6. FlightHub – 110842 reviews – flighthub.com
7. ASAP Tickets – 97096 reviews – www.asaptickets.com
8. Priceline – 95379 reviews – www.priceline.com
9. Way – 75701 reviews – www.way.com
10. CheapFareGuru – 70924 reviews – cheapfareguru.com
11. AirTkt – 70488 reviews – airtkt.com
12. SmartFares – 59271 reviews – www.smartfares.com
13. Guest Reservations – 44893 reviews – guestreservations.com
14. Ship Sticks – 43384 reviews – shipsticks.com
15. Reservation Counter – 41983 reviews – reservationcounter.com
16. Headout – 39221 reviews – headout.com
17. parksleepfly.com – 39038 reviews – parksleepfly.com
18. AARDY – 35330 reviews – aardy.com
19. Global Airport Parking – 30278 reviews – globalairportparking.com
20. CheapOair.com – 26256 reviews –

## Get the data

### Function for scraping the review data from the page of a company

In [None]:
# Function to extract reviews from a BeautifulSoup-parsed review page
def extract_reviews(soup, company_url=None):
    """Collect the reviews that have body text from a parsed review page.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a BeautifulSoup document).
        Every ``<article>`` carrying a ``data-service-review-card-paper``
        attribute is treated as one review card.
    company_url : str, optional
        Value stored under the ``"company"`` key of every returned record.
        When omitted, the module-level variable ``company_url`` is used if it
        exists (this keeps existing ``extract_reviews(soup)`` calls working);
        if it does not exist the field is ``None``.

    Returns
    -------
    list of dict
        One dict per review with the keys ``name``, ``country``, ``rating``
        (int or None), ``title``, ``text``, ``date_of_experience``
        (``datetime.date`` or None), ``has_reply`` (1 or 0) and ``company``.
        Cards whose review text is missing or empty are left out; any other
        missing field is recorded as ``None``.
    """
    if company_url is None:
        # Backward compatibility: older callers rely on a global set by the
        # scraping loop instead of passing the value explicitly.
        company_url = globals().get("company_url")

    date_prefix = "Date of experience:"
    reviews = []

    for article in soup.find_all('article', attrs={"data-service-review-card-paper": True}):
        # The review body is mandatory: cards without it are skipped
        # (helps ignoring the recent reviews block).
        text = _review_field_text(article, 'p', "data-service-review-text-typography")
        if not text:
            continue

        name = _review_field_text(article, 'span', "data-consumer-name-typography")
        country = _review_field_text(article, 'span', "data-consumer-country-typography")
        title = _review_field_text(article, 'h2', "data-service-review-title-typography")

        # The rating is read from the attribute value, not from the tag text
        rating_tag = article.find('div', attrs={"data-service-review-rating": True})
        try:
            rating = int(rating_tag["data-service-review-rating"])
        except (TypeError, KeyError, ValueError):
            # TypeError: tag missing (None is not subscriptable);
            # KeyError/ValueError: attribute absent or not an integer
            rating = None

        # Date of experience is optional and only parsed in its expected format
        date = None
        raw_text = _review_field_text(
            article, 'p', "data-service-review-date-of-experience-typography"
        )
        if raw_text and raw_text.startswith(date_prefix):
            raw_date = raw_text[len(date_prefix):].strip()
            try:
                date = datetime.strptime(raw_date, "%B %d, %Y").date()
            except ValueError:
                date = None

        # A business reply paragraph inside the card means the company answered
        company_reply = article.find(
            'p', attrs={"data-service-review-business-reply-text-typography": True}
        )
        has_reply = 1 if company_reply is not None else 0

        reviews.append({
            "name": name,
            "country": country,
            "rating": rating,
            "title": title,
            "text": text,
            "date_of_experience": date,
            "has_reply": has_reply,
            "company": company_url,
        })

    return reviews


def _review_field_text(node, tag_name, marker_attr):
    """Return the stripped text of the first `tag_name` under `node` that has `marker_attr`, or None if there is none."""
    found = node.find(tag_name, attrs={marker_attr: True})
    if found is None:
        return None
    return found.get_text(strip=True)

### Go through the companies

In [35]:
all_reviews = []

# Limit the number of companies to scrape
max_companies = 60
# Maximum number of review pages requested per company
max_pages = 20
# Seconds to wait for a response before giving up. requests applies no timeout
# by default, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT = 30

# Create a list of company URLs
company_links = [company['website'] for company in companies]

# Loop through each company and scrape reviews.
# NOTE: the loop variable is deliberately named `company_url`; extract_reviews
# reads that module-level name to fill the "company" field of each review.
for company_url in company_links[:max_companies]:
    # A company listed without a website would yield a malformed review URL
    if not company_url:
        print("Skipping a company with no website listed.")
        continue

    # Loop through the review pages
    for page in range(1, max_pages + 1):
        # Construct the URL for the review page
        url = f"https://www.trustpilot.com/review/{company_url}?page={page}"

        # Get the page content; on a network-level failure move on to the next company
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Failed to fetch page {page} for {company_url}: {e}")
            break

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to fetch page {page} for {company_url}, status code: {response.status_code}")
            break

        # Parse the page content
        soup = BeautifulSoup(response.content, 'lxml')
        # Extract reviews from the page with the function
        reviews = extract_reviews(soup)
        if not reviews:
            # An empty page means there is nothing further to collect for this company
            break
        # Append the reviews to the all_reviews list
        all_reviews.extend(reviews)
        time.sleep(0.5)  # Sleep to avoid overwhelming the server

Failed to fetch page 20 for www.cardelmar.com, status code: 404


### Save to CSV

In [36]:
# Destination file for the scraped reviews
csv_file = 'trustpilot_reviews_2.csv'

if not all_reviews:
    # Nothing was scraped, so no file is written
    print("No reviews found to save.")
else:
    # Column order follows the keys of the first review record
    column_names = all_reviews[0].keys()
    with open(csv_file, 'w', newline='', encoding='utf-8') as csv_handle:
        dict_writer = csv.DictWriter(csv_handle, fieldnames=column_names)
        dict_writer.writeheader()
        dict_writer.writerows(all_reviews)
    print(f"\nSaved {len(all_reviews)} reviews to '{csv_file}'")


Saved 21098 reviews to 'trustpilot_reviews_2.csv'


## Display DF

In [None]:
# CSV file to load
csv_file = 'trustpilot_reviews_2.csv'

# Load the file contents into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first 25 rows (a bare last expression is rendered by the notebook)
df.head(n=25)

Unnamed: 0,name,country,rating,title,text,date_of_experience,has_reply,company
0,Przemysław Rosuł,PL,1,Ignoring the specified pickup time,Ignoring the specified pickup time. Providing ...,2025-05-13,0,www.viator.com
1,kathy mrozek,US,5,Athens Evening food tour with Katrina,"Katrina, the tour guide was fabulous! She mad...",2025-05-15,0,www.viator.com
2,MARY MURRAY,US,5,All you need for travel with confidence,A great app! My excursions have been awesome! ...,2025-05-12,0,www.viator.com
3,K Shay Smith,MX,1,I booked the excursion and paid the…,I booked the excursion and paid the money up f...,2025-05-12,0,www.viator.com
4,Jeff Paine,US,5,Great food tour of Prague. Great conversation ...,Great food tour of Prague,2025-05-07,0,www.viator.com
5,Parthasara Narayanan,HU,5,Easy checkin,Easy checkin. Good evening cruise.,2025-05-11,0,www.viator.com
6,Petya Topuzova,BG,5,Great experience,"Great experience, great guide, thank you Simone!",2025-05-09,0,www.viator.com
7,Victoria Buynevich,IT,4,Perfect Amalfi day trip,Everything was very well organised. Erika was ...,2025-05-09,0,www.viator.com
8,Sahar Zaidoon Sabri,IT,5,Amazing wine tasting,"We went on a wine tasting, it was better than ...",2025-05-10,0,www.viator.com
9,Olive,IE,5,Chocolate tasting tour with Ian,Our tour guide Ian made the tour so much fun. ...,2025-05-17,0,www.viator.com
