### Web Scraper

In [11]:
import requests
from parsel import Selector
import urllib.parse
import re
import json

def extract_pagename(hotel_url):
    """Extract the pagename (hotel slug) from a Booking.com hotel URL.

    Only the path component of the URL is inspected, so query strings and
    fragments are ignored. An extra dotted suffix between the slug and
    ``.html`` (for example a locale such as ``.en-gb``) is dropped so that
    the bare slug is returned.

    Args:
        hotel_url: URL whose path looks like ``/hotel/<cc>/<slug>.html`` or
            ``/hotel/<cc>/<slug>.<suffix>.html``.

    Returns:
        The slug as a string, or ``None`` when the path does not match.
    """
    path = urllib.parse.urlparse(hotel_url).path
    # Example: /hotel/in/trident-nariman-point.html
    #          /hotel/in/trident-nariman-point.en-gb.html
    # [^/]+ keeps each piece inside a single path segment; [^/.]+ stops the
    # slug at the first dot so a locale suffix never leaks into the result.
    match = re.search(r'/hotel/[^/]+/([^/.]+)(?:\.[^/]+)?\.html', path)
    return match.group(1) if match else None

def get_review_page_html(pagename, offset=0, rows=10, country_code='in', lang='en-us', timeout=30):
    """Download the raw HTML of one page of the review list.

    Sends a GET request to Booking.com's ``reviewlist.html`` endpoint with
    the given paging and locale values as query parameters.

    Args:
        pagename: Hotel slug, sent as the ``pagename`` parameter.
        offset: Value of the ``offset`` parameter (index of the first review
            requested).
        rows: Value of the ``rows`` parameter (reviews requested per page).
        country_code: Value of the ``cc1`` parameter.
        lang: Value of the ``lang`` parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base = "https://www.booking.com/reviewlist.html"
    params = {
        'cc1': country_code,
        'lang': lang,
        'pagename': pagename,
        'rows': rows,
        'offset': offset,
        'type': 'total',
        # presumably "most recent first" -- confirm against the site
        'sort': 'f_recent_desc',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }
    response = requests.get(base, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.text

def parse_reviews(html):
    """Turn the HTML of a review-list page into a list of review dicts.

    Every element matching ``.review_list_new_item_block`` yields one dict
    with the keys ``score``, ``title``, ``date``, ``user_name``,
    ``user_country``, ``text`` and ``lang``. A field whose selector matches
    nothing is returned as an empty string.
    """
    document = Selector(text=html)

    def first_match(node, css):
        # First match for the selector, stripped; '' when nothing matches.
        return node.css(css).get(default='').strip()

    def joined_matches(node, css):
        # Every match for the selector, each stripped, joined by one space.
        fragments = [fragment.strip() for fragment in node.css(css).getall()]
        return ' '.join(fragments).strip()

    return [
        {
            'score': first_match(block, '.bui-review-score__badge::text'),
            'title': first_match(block, '.c-review-block__title::text'),
            'date': first_match(block, '.c-review-block__date::text'),
            'user_name': first_match(block, '.bui-avatar-block__title::text'),
            'user_country': first_match(block, '.bui-avatar-block__subtitle::text'),
            'text': joined_matches(block, '.c-review__body ::text'),
            'lang': first_match(block, '.c-review__body::attr(lang)'),
        }
        for block in document.css('.review_list_new_item_block')
    ]

def scrape_all_reviews(hotel_url, max_pages=2, rows=10):
    """Scrape up to ``max_pages`` pages of reviews for one hotel.

    Pages are fetched one after another; scraping stops early as soon as a
    page yields no reviews. Progress is printed to stdout.

    Args:
        hotel_url: Booking.com hotel URL from which the pagename is taken.
        max_pages: Maximum number of review pages to request.
        rows: Number of reviews requested per page. The same value is used
            to compute each page's offset, so the two cannot drift apart.

    Returns:
        A list with the review dicts of all scraped pages, in page order.

    Raises:
        ValueError: If no pagename can be extracted from ``hotel_url``.
    """
    pagename = extract_pagename(hotel_url)
    if not pagename:
        raise ValueError("Could not extract pagename from URL")

    print(f"Scraping reviews for pagename: {pagename}")

    all_reviews = []
    for page in range(max_pages):
        # The offset has to advance by exactly the page size being requested.
        html = get_review_page_html(pagename, offset=page * rows, rows=rows)
        reviews = parse_reviews(html)
        if not reviews:
            # An empty page means there is nothing further to fetch.
            break
        all_reviews.extend(reviews)
        print(f"Scraped {len(reviews)} reviews from page {page+1}")

    return all_reviews

# EXAMPLE USE
# Guarded so that importing this module does not trigger a scrape; the body
# runs only when the code is executed as the main program.
if __name__ == '__main__':
    HOTEL_URL = 'https://www.booking.com/hotel/in/trident-nariman-point.html'
    # Request at most 10 pages of reviews for the hotel above.
    reviews = scrape_all_reviews(HOTEL_URL, max_pages=10)

    # ensure_ascii=False leaves non-ASCII characters unescaped in the output.
    print(json.dumps(reviews, indent=2, ensure_ascii=False))


Scraping reviews for pagename: trident-nariman-point
Scraped 10 reviews from page 1
Scraped 10 reviews from page 2
Scraped 10 reviews from page 3
Scraped 10 reviews from page 4
Scraped 10 reviews from page 5
Scraped 10 reviews from page 6
Scraped 10 reviews from page 7
Scraped 10 reviews from page 8
Scraped 10 reviews from page 9
Scraped 10 reviews from page 10
[
  {
    "score": "10",
    "title": "Exceptional",
    "date": "July 2025",
    "user_name": "Mouri",
    "user_country": "Bangladesh",
    "text": "There are no comments available for this review",
    "lang": "en"
  },
  {
    "score": "10",
    "title": "Exceptional",
    "date": "June 2025",
    "user_name": "Cassandra",
    "user_country": "India",
    "text": "There are no comments available for this review",
    "lang": "en"
  },
  {
    "score": "10",
    "title": "Exceptional",
    "date": "May 2025",
    "user_name": "Anonymous",
    "user_country": "United Arab Emirates",
    "text": "There are no comments available