In [None]:
# Install the Python packages for this notebook (Selenium, pandas, webdriver-manager).
!pip install selenium pandas webdriver-manager
# Refresh the apt package index, then install the chromium-chromedriver system package.
!apt-get update
!apt-get install -y chromium-chromedriver
# Copy the chromedriver binary into /usr/bin.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import os
# Append /usr/bin to this kernel process's PATH.
os.environ["PATH"] += os.pathsep + "/usr/bin"


Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading webdriver_manager-4.0.2-py2.py3-none-any.

In [None]:
import time
import random  # Helps in adding random wait times to simulate human-like behavior.
import pandas as pd
import re
from datetime import datetime, timedelta
from selenium import webdriver #The Selenium WebDriver interacts with the Booking.com website.
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By #Helps in locating elements on the webpage
from selenium.webdriver.common.keys import Keys #Used for simulating keyboard actions (Keys.PAGE_DOWN for scrolling).
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC #Used to wait until elements appear to avoid errors.

class BookingScraper:
    def __init__(self):
        """Start a headless Chrome session and set the scraper's initial state.

        Chrome is launched with a fixed set of command-line switches plus a
        user agent obtained from ``get_random_user_agent``. Driver creation is
        tried up to three times, sleeping two seconds after each failed try
        except the last.

        Raises:
            Exception: if the driver could not be created on the final attempt.
        """
        print("\nInitializing scraper...")

        chrome_options = Options()
        # Switches are applied in exactly this order; the user agent is picked
        # once, here, for the lifetime of this driver.
        launch_flags = (
            '--headless=new',               # run without a GUI
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--window-size=1920,1080',
            '--disable-gpu',
            '--disable-extensions',
            '--disable-infobars',
            '--disable-popup-blocking',
            '--ignore-certificate-errors',
            '--log-level=3',                # quiet console logging
            f'user-agent={self.get_random_user_agent()}',
        )
        for flag in launch_flags:
            chrome_options.add_argument(flag)

        # Seconds; handed to the driver through set_page_load_timeout below.
        self.page_load_timeout = 30

        max_retries = 3
        for attempt_number in range(1, max_retries + 1):
            try:
                self.driver = webdriver.Chrome(options=chrome_options)
                self.driver.set_page_load_timeout(self.page_load_timeout)
                print("Chrome browser initialized successfully")
                break
            except Exception as e:
                # The last failure is fatal; earlier ones are logged and retried.
                if attempt_number >= max_retries:
                    raise Exception(f"Failed to initialize Chrome browser after {max_retries} attempts: {e}")
                print(f"Browser initialization failed (attempt {attempt_number}/{max_retries}): {e}")
                time.sleep(2)

        self.base_url = "https://www.booking.com"
        # Running count of hotel records extracted by this instance.
        self.total_hotels_collected = 0

    #Rotates between multiple user agents to prevent being blocked.
    def get_random_user_agent(self):
        """Return one user-agent string chosen at random from a fixed pool.

        Returns:
            str: one of the three hard-coded browser user-agent strings.
        """
        # NOTE(review): the last entry identifies as Firefox although __init__
        # launches Chrome — confirm the mismatch is intended.
        agent_pool = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        )
        return random.choice(agent_pool)

    def print_hotel_info(self, hotel_data, index):
        """Print a framed, human-readable summary of a single hotel record.

        Args:
            hotel_data: dict of scraped fields. Missing name, price, rating and
                address are shown as 'Not found'; a missing 'stars' is treated
                as '0 stars'.
            index: zero-based position of the hotel; displayed one-based.
        """
        rule = "─" * 50
        placeholder = 'Not found'

        print(f"\nHotel #{index + 1}:")
        print(rule)

        for label, key in (("Name", 'name'), ("Price", 'price'), ("Rating", 'rating')):
            print(f"{label}: {hotel_data.get(key, placeholder)}")

        # "0 stars" is the marker for a property with no star classification.
        stars = hotel_data.get('stars', '0 stars')
        note = " (Unclassified/No formal star rating)" if stars == "0 stars" else ""
        print(f"Stars: {stars}{note}")

        print(f"Address: {hotel_data.get('address', placeholder)}")

        # These fields are printed only when present and non-empty.
        optional_fields = (
            ("Distance", 'distance'),
            ("Room Type", 'room_type'),
            ("Review Count", 'review_count'),
        )
        for label, key in optional_fields:
            if hotel_data.get(key):
                print(f"{label}: {hotel_data[key]}")

        print(rule)

    #Processes a list of hotels and extracts relevant information from them.
    #Saves the data in a dictionary (hotel_data).
    def extract_hotels_from_page(self, hotel_elements, hotels_list):
        processed_hotels = []

        for idx, hotel in enumerate(hotel_elements):
            try:
                hotel_data = {}

                # Extract name
                try:
                    name = WebDriverWait(hotel, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='title']"))
                    ).text.strip()
                    hotel_data['name'] = name
                except Exception as e:
                    print(f"Could not extract name: {str(e)}")
                    hotel_data['name'] = "Name not available"

                # Extract price
                try:
                    price = WebDriverWait(hotel, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='price-and-discounted-price']"))
                    ).text.strip()
                    hotel_data['price'] = price
                except Exception as e:
                    print(f"Could not extract price: {str(e)}")
                    hotel_data['price'] = "Price not available"

                # Extract address
                try:
                    address = WebDriverWait(hotel, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='address']"))
                    ).text.strip()
                    hotel_data['address'] = address
                except Exception as e:
                    print(f"Could not extract address: {str(e)}")
                    hotel_data['address'] = "Address not available"

                # Extract distance
                try:
                    distance_element = hotel.find_element(By.CSS_SELECTOR, "[data-testid='distance']")
                    distance = distance_element.text.strip()
                    hotel_data['distance'] = distance
                except Exception as e:
                    hotel_data['distance'] = "Distance not available"

                # Extract rating
                try:
                    # First try to find the score element
                    rating_element = WebDriverWait(hotel, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='review-score']"))
                    )
                    rating = rating_element.text.strip()
                    hotel_data['rating'] = rating

                    # Scan the rating text itself for reviews count
                    # This handles cases like "8.7 Excellent 3,982 reviews"
                    review_count_from_rating = None
                    review_pattern = re.search(r'([\d,]+)\s+review', rating)
                    if review_pattern:
                        review_count_from_rating = review_pattern.group(0)
                        print(f"Found review count in rating text: {review_count_from_rating}")

                    # Also try to get the review count from a separate element
                    try:
                        # Try different selectors for review count
                        review_count = None

                        # First check if we already found it in the rating text
                        if review_count_from_rating:
                            review_count = review_count_from_rating
                        else:
                            # Look for the specific review count element
                            try:
                                review_count_element = hotel.find_element(By.CSS_SELECTOR, "[data-testid='review-score-count']")
                                if review_count_element:
                                    review_count = review_count_element.text.strip()
                            except:
                                pass

                        # If still not found, look for text containing "reviews" anywhere in the hotel card
                        if not review_count or review_count == "":
                            # Broader search across the entire hotel card
                            all_text_elements = hotel.find_elements(By.XPATH, ".//div")
                            for elem in all_text_elements:
                                try:
                                    text = elem.text.strip()
                                    if 'review' in text.lower():
                                        # Look for patterns like "7,189 reviews" or "3,982 reviews"
                                        review_match = re.search(r'([\d,]+)\s+review', text)
                                        if review_match:
                                            review_count = review_match.group(0)
                                            print(f"Found review count from element text: {review_count}")
                                            break
                                except:
                                    continue

                        # Store the review count
                        hotel_data['review_count'] = review_count if review_count else "No reviews available"

                        # Try to extract the numeric count for easier analysis
                        if review_count:
                            try:
                                count_match = re.search(r'([\d,]+)', review_count)
                                if count_match:
                                    # Remove commas and convert to integer
                                    numeric_count = int(count_match.group(1).replace(',', ''))
                                    hotel_data['numeric_review_count'] = numeric_count
                            except:
                                pass

                    except Exception as e:
                        print(f"Could not extract review count from separate element: {str(e)}")
                        # If we found it in the rating text, use that
                        if review_count_from_rating:
                            hotel_data['review_count'] = review_count_from_rating
                        else:
                            hotel_data['review_count'] = "No reviews available"

                    # Save the numeric score separately for easier analysis
                    try:
                        score_match = re.search(r'(\d+\.\d+|\d+)', rating)
                        if score_match:
                            numeric_score = float(score_match.group(1))
                            hotel_data['numeric_score'] = numeric_score
                    except:
                        pass

                except Exception as e:
                    print(f"Could not extract rating: {str(e)}")
                    hotel_data['rating'] = "No rating available"

                # Extract star rating
                try:
                    stars = None
                    try:
                        star_container = hotel.find_element(By.CSS_SELECTOR, "div[data-testid='rating-stars']")
                        if star_container:
                            # Count SVG elements inside (each SVG is a star)
                            svg_stars = star_container.find_elements(By.CSS_SELECTOR, "span svg")
                            if svg_stars and 0 < len(svg_stars) <= 5:
                                stars = len(svg_stars)
                                print(f"Found {stars} stars using svg count in rating-stars container")
                    except:
                        pass

                    # Alternative selector with aria-label that indicates star rating
                    if not stars:
                        try:
                            star_elements = hotel.find_elements(By.CSS_SELECTOR, "div[aria-label*='out of 5']")
                            for element in star_elements:
                                aria_label = element.get_attribute("aria-label")
                                if aria_label and "out of 5" in aria_label:
                                    # Extract the number from something like "4 out of 5"
                                    match = re.search(r'(\d+)\s+out\s+of\s+5', aria_label)
                                    if match:
                                        stars = int(match.group(1))
                                        print(f"Found {stars} stars from aria-label: {aria_label}")
                                        break
                        except:
                            pass

                    # Try a broader approach - look for any elements containing star information in text
                    if not stars:
                        try:
                            # Look for elements with text containing stars
                            star_text_elements = hotel.find_elements(By.XPATH,
                                ".//*[contains(text(), 'star') or contains(text(), '-star') or contains(text(), '★')]")
                            for elem in star_text_elements:
                                text = elem.text.strip()
                                star_match = re.search(r'(\d+)[\s-]*(star|★)', text.lower())
                                if star_match:
                                    stars = int(star_match.group(1))
                                    print(f"Found {stars} stars from text: {text}")
                                    break
                        except:
                            pass

                    # If we have a rating but no stars, make an estimation based on rating
                    if not stars and hotel_data.get('rating'):
                        try:
                            # Try to extract numeric rating
                            rating_text = hotel_data.get('rating', '')
                            numeric_match = re.search(r'(\d+(\.\d+)?)', rating_text)
                            if numeric_match:
                                rating_value = float(numeric_match.group(1))
                                # Estimate stars based on rating value (scale of 10)
                                if rating_value >= 9.0:
                                    stars = 5
                                elif rating_value >= 8.0:
                                    stars = 4
                                elif rating_value >= 7.0:
                                    stars = 3
                                elif rating_value >= 6.0:
                                    stars = 2
                                else:
                                    stars = 1
                                print(f"Estimated {stars} stars based on numeric rating: {rating_value}")
                        except:
                            pass

                    # Final assignment of stars value
                    if stars and stars > 0:
                        hotel_data['stars'] = f"{stars} stars"
                    else:
                        hotel_data['stars'] = "0 stars"

                except Exception as e:
                    print(f"Star extraction error: {str(e)}")
                    hotel_data['stars'] = "0 stars"

                # Extract room type
                try:
                    # Try multiple selectors for room type
                    room_type = None

                    # First try the primary data-testid
                    try:
                        room_element = hotel.find_element(By.CSS_SELECTOR, "[data-testid='recommended-units'] h4")
                        if room_element:
                            room_type = room_element.text.strip()
                    except:
                        pass

                    # Alternative selector
                    if not room_type:
                        try:
                            room_elements = hotel.find_elements(By.CSS_SELECTOR, "h4[role='link']")
                            for elem in room_elements:
                                text = elem.text.strip()
                                if text and ("room" in text.lower() or "suite" in text.lower() or
                                            "queen" in text.lower() or "king" in text.lower() or
                                            "double" in text.lower() or "single" in text.lower()):
                                    room_type = text
                                    break
                        except:
                            pass

                    # Another approach - look for accommodation type
                    if not room_type:
                        try:
                            type_element = hotel.find_element(By.CSS_SELECTOR, "[data-testid='accommodation-type-name']")
                            room_type = type_element.text.strip()
                        except:
                            pass

                    hotel_data['room_type'] = room_type if room_type else "Room type not available"

                except Exception as e:
                    print(f"Room type extraction error: {str(e)}")
                    hotel_data['room_type'] = "Room type not available"


                hotels_list.append(hotel_data)
                processed_hotels.append(hotel_data)
                self.total_hotels_collected += 1

                # Print information about current hotel
                self.print_hotel_info(hotel_data, len(hotels_list)-1)
                print(f"Successfully extracted data for hotel #{len(hotels_list)}")

                # Break if we've collected 100 hotels
                if len(hotels_list) >= 100:
                    print("Reached target of 100 hotels!")
                    break

            except Exception as e:
                print(f"Error extracting hotel #{idx+1}: {str(e)}")
                continue

        return processed_hotels

    def extract_hotels(self):
        """Collect up to 100 hotel records from the results page currently loaded.

        The page is scrolled, the property cards not yet processed are handed
        to ``extract_hotels_from_page``, and the "Load more results" button is
        clicked to reveal further cards. This repeats until 100 records are
        collected, the button can no longer be found, or the attempt budget is
        used up.

        Returns:
            list: The hotel dicts collected. Empty when no property cards are
            found; if an unexpected error interrupts the loop, the records
            gathered up to that point are returned.
        """
        hotels = []
        target_count = 100
        card_selector = "[data-testid='property-card']"
        print("\nExtracting hotel information from page...")

        try:
            # First check that the page is actually loaded with content
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
            )

            # Wait for hotel cards to load
            try:
                print("Waiting for property cards to load...")
                WebDriverWait(self.driver, 20).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
                )
            except Exception as e:
                print(f" Property cards wait timed out: {e}")
                # The wait failed; fall back to inspecting the raw page source.
                page_source = self.driver.page_source
                if "property-card" not in page_source:
                    print("No property cards found in page source")
                    if "captcha" in page_source.lower():
                        print(" CAPTCHA detected! Booking.com may be blocking the scraper")
                    return []

            hotel_elements = self.driver.find_elements(By.CSS_SELECTOR, card_selector)
            print(f"Found {len(hotel_elements)} hotels initially")

            if len(hotel_elements) == 0:
                print("⚠ No hotels found with primary selector, trying alternatives...")
                # NOTE(review): matches from this fallback only prevent the early
                # return; the loop below still queries the primary selector.
                hotel_elements = self.driver.find_elements(By.CSS_SELECTOR, ".a826ba81c4")
                print(f"Found {len(hotel_elements)} hotels with alternative selector")

                if len(hotel_elements) == 0:
                    error_messages = self.driver.find_elements(By.CSS_SELECTOR, ".fe_banner__message")
                    if error_messages:
                        print(f"Error message on page: {error_messages[0].text}")
                    return []

            attempts = 0
            max_attempts = 10
            # Element ids of cards already handed to the extractor. The page
            # keeps earlier cards in the DOM after "Load more", so without this
            # every round would append the same hotels again.
            seen_card_ids = set()

            while len(hotels) < target_count and attempts < max_attempts:
                # Scroll first so lazily loaded cards are rendered
                self.scroll_to_load()

                current_elements = self.driver.find_elements(By.CSS_SELECTOR, card_selector)
                print(f"Found {len(current_elements)} hotels before processing")

                fresh_elements = [card for card in current_elements if card.id not in seen_card_ids]
                seen_card_ids.update(card.id for card in fresh_elements)
                self.extract_hotels_from_page(fresh_elements, hotels)

                if len(hotels) >= target_count:
                    break

                try:
                    print("Looking for 'Load more results' button...")
                    load_more_button = self._find_load_more_button()

                    if load_more_button:
                        print("Clicking 'Load more results' button...")
                        # Bring the button into view before clicking
                        self.driver.execute_script(
                            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
                            load_more_button)
                        time.sleep(1)

                        # Native click first, then a JavaScript click, then a
                        # PAGE_DOWN key press as a last resort.
                        try:
                            load_more_button.click()
                        except Exception:
                            try:
                                self.driver.execute_script("arguments[0].click();", load_more_button)
                            except Exception as e:
                                print(f"Failed to click button: {e}")
                                self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)

                        print("Waiting for new hotels to load...")
                        time.sleep(5)  # Wait for content to load

                        new_elements = self.driver.find_elements(By.CSS_SELECTOR, card_selector)
                        if len(new_elements) > len(current_elements):
                            print(f"Loaded more hotels! Now have {len(new_elements)} hotels")
                        else:
                            print("No new hotels loaded, might have reached the end")
                            # A fruitless click costs an extra attempt on top of
                            # the per-round increment below.
                            attempts += 1
                    else:
                        print("'Load more results' button not found, might have reached the end")
                        break

                except Exception as e:
                    print(f"⚠ Error trying to load more results: {e}")
                    # An error also costs an extra attempt.
                    attempts += 1

                # Every round consumes one attempt, so the loop always terminates
                attempts += 1

            print(f"\nSuccessfully collected information for {len(hotels)} hotels")
            return hotels

        except Exception as e:
            print(f"Error in hotel extraction: {str(e)}")
            # Hand back whatever was gathered before the failure instead of
            # discarding it (still an empty list if nothing was collected).
            return hotels

    def _find_load_more_button(self):
        """Return the visible "load more" button on the results page, or None."""
        # Only valid CSS is listed here; text matching is done on element.text
        # below and by the XPath fallback.
        candidate_selectors = [
            "button[data-testid='loading-btn']",
            "button.loading-btn",
            # presumably build-specific hashed class names — may need updating
            ".e4adce92df",
            "button.a83ed08757",
            "button.c21c56c305",
        ]
        wanted_phrases = ("load more", "show more", "more results")

        for selector in candidate_selectors:
            try:
                for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                    if not element.is_displayed():
                        continue
                    label = element.text.lower()
                    if any(phrase in label for phrase in wanted_phrases):
                        print(f"Found load more button with selector: {selector}")
                        return element
            except Exception:
                continue

        # Fallback: any displayed button whose text contains "Load more".
        try:
            for element in self.driver.find_elements(By.XPATH, "//button[contains(., 'Load more')]"):
                if element.is_displayed():
                    print("Found load more button with XPATH text search")
                    return element
        except Exception:
            pass

        return None

    def scroll_to_load(self):
        """Scroll the page down in steps so lazily loaded hotel cards get rendered.

        The page height is divided into at least three steps. After each step
        the method sleeps for a random 0.5-1.5 s; on every other step the page
        height is re-read and, if it grew, the step count is raised so the
        extra content is scrolled through as well. The number of steps is
        capped so a page that keeps growing cannot loop forever. A final jump
        to the very bottom follows. Any error is printed, not raised.
        """
        print("Scrolling page to load all hotels...")
        try:
            # Make sure the document has a body before scripting against it
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            total_height = self.driver.execute_script("return document.body.scrollHeight")
            viewport_height = self.driver.execute_script("return window.innerHeight")
            # A zero viewport height raises ZeroDivisionError here, which the
            # handler at the bottom reports.
            scrolls_needed = max(3, int(total_height / viewport_height))

            # Upper bound on steps: the initial estimate plus room for content
            # that appears while scrolling.
            step_limit = scrolls_needed + 30

            # A while loop (not range) so a raised scrolls_needed takes effect.
            step = 0
            while step < scrolls_needed and step < step_limit:
                scroll_position = int((step + 1) * total_height / scrolls_needed)
                self.driver.execute_script(f"window.scrollTo(0, {scroll_position});")

                # Small random pause between steps
                time.sleep(random.uniform(0.5, 1.5))

                # Every other step, pick up height added by dynamic content
                if step % 2 == 0:
                    new_height = self.driver.execute_script("return document.body.scrollHeight")
                    if new_height > total_height:
                        total_height = new_height
                        scrolls_needed = max(scrolls_needed, int(total_height / viewport_height))

                # Near the bottom, report whether a load-more button exists
                if step >= scrolls_needed - 2:
                    try:
                        load_buttons = self.driver.find_elements(By.XPATH, "//button[contains(., 'Load more')]")
                        if load_buttons:
                            print("Found load more button while scrolling")
                    except Exception:
                        pass

                step += 1

            # Final scroll to the very bottom
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

        except Exception as e:
            print(f"Scrolling operation failed: {e}")
    def save_to_csv(self, data, filename):
        """Write the hotel records to ``filename`` as a CSV file.

        The file is written without an index column, encoded as 'utf-8-sig'.
        Failures are reported with a printed message rather than raised.

        Args:
            data: iterable of record dicts; each is shallow-copied before use.
            filename: destination path handed to ``DataFrame.to_csv``.
        """
        try:
            # Work on copies so the caller's dicts are never touched.
            rows = [record.copy() for record in data]
            frame = pd.DataFrame(rows)
            frame.to_csv(filename, index=False, encoding='utf-8-sig')

            print(f"\n Saved {len(rows)} records to {filename}")
            column_names = frame.columns.tolist()
            print(f"Saved columns: {', '.join(column_names)}")
        except Exception as e:
            print(f"Error saving file: {str(e)}")

    def run_scraping(self):
        """Run every generated search, collect the hotels and save them to CSV.

        For each date combination returned by ``generate_dates`` the search URL
        is built and loaded, hotels are read with ``extract_hotels`` and each
        hotel record is updated with the date information of that search.

        Saving behaviour:
        - a partial CSV is written whenever at least 50 new records have been
          collected since the previous partial save;
        - an emergency CSV is written when a search raises and some data has
          already been collected;
        - the final CSV is written once after the loop.

        An exception in one search is printed and the loop continues with the
        next date combination.
        """
        partial_save_step = 50
        all_data = []
        # Number of records already covered by the most recent partial save.
        # Comparing against this (instead of ``len(all_data) % 50 == 0``) means
        # the save no longer depends on the total landing exactly on a multiple
        # of 50, and never fires while nothing has been collected.
        saved_count = 0
        dates = self.generate_dates()

        print(f"\nStarting data collection for {len(dates)} searches...")

        for i, date_info in enumerate(dates, 1):
            print(f"\n{'='*60}")
            print(f"Search {i}/{len(dates)}")
            print(f"Check-in date: {date_info['check_in']}")
            print(f"Check-out date: {date_info['check_out']}")
            print(f"TTT: {date_info['ttt']} days")
            print(f"LOS: {date_info['los']} nights")

            try:
                search_url = self._build_search_url(date_info)
                print(f"Loading URL: {search_url}")

                self._load_search_page(search_url)

                # After the page load, extract the hotel data
                hotels = self.extract_hotels()

                # Add date information to each hotel
                for hotel in hotels:
                    hotel.update(date_info)

                all_data.extend(hotels)

                # Partial data save
                if len(all_data) - saved_count >= partial_save_step:
                    self.save_to_csv(all_data, f'nyc_hotels_data_partial_{len(all_data)}.csv')
                    saved_count = len(all_data)

                # NOTE(review): total_hotels_collected is only read here; presumably
                # it is maintained by extract_hotels - confirm.
                print(f"\nTotal hotels collected: {self.total_hotels_collected}")

                # Add a longer random delay between searches
                delay = random.uniform(5, 10)
                print(f"Waiting {delay:.1f} seconds before next search...")
                time.sleep(delay)

            except Exception as e:
                print(f"Error in search #{i}: {str(e)}")
                # Save data collected so far
                if all_data:
                    self.save_to_csv(all_data, 'nyc_hotels_emergency_save.csv')
                # Wait before trying the next search
                time.sleep(5)

        # Save all data at the end
        self.save_to_csv(all_data, 'nyc_hotels_data_final.csv')
        print(f"\nData collection completed! Total hotels collected: {self.total_hotels_collected}")

    def _load_search_page(self, search_url, max_retries=3):
        """Load ``search_url``, retrying and finally resetting the browser.

        Each attempt navigates to the URL and waits up to 15 seconds for a
        property card element to be present. A failed attempt that is not the
        last one stops the current page load and pauses 3 seconds. When the
        last attempt fails the browser is replaced and the URL is loaded once
        more via ``_reset_browser_and_load``.

        Raises:
            Exception: whatever the browser reset raised, if the reset fails.
        """
        for attempt in range(max_retries):
            try:
                self.driver.get(search_url)
                # Wait for essential elements to confirm page loaded
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='property-card']"))
                )
                print("Page loaded successfully")
                return
            except Exception as e:
                if attempt < max_retries - 1:
                    print(f"Page load attempt {attempt+1}/{max_retries} failed: {e}")
                    # Try to refresh the browser state. Only ordinary errors are
                    # ignored so an interrupt/exit request is not swallowed.
                    try:
                        self.driver.execute_script("window.stop();")
                    except Exception:
                        pass
                    time.sleep(3)
                else:
                    print(f"Failed to load page after {max_retries} attempts")
                    self._reset_browser_and_load(search_url)

    def _reset_browser_and_load(self, search_url):
        """Quit the current driver, start a fresh headless Chrome and load the URL.

        After the new driver has requested the URL there is a fixed 10 second
        pause; no element wait is performed here.

        Raises:
            Exception: re-raises any error from quitting, creating the driver
                or loading the URL, after printing it.
        """
        try:
            print("Attempting to reset browser...")
            self.driver.quit()
            time.sleep(2)
            # NOTE(review): these options are spelled out here; confirm they
            # match the ones used when the driver is first created.
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument(f'user-agent={self.get_random_user_agent()}')
            self.driver = webdriver.Chrome(options=chrome_options)
            print("Browser reset successful")
            # Try one more time with the new driver
            self.driver.get(search_url)
            time.sleep(10)  # Longer wait after reset
        except Exception as reset_error:
            print(f"Browser reset failed: {reset_error}")
            # Let the caller skip this date combination if we can't recover
            raise

    def _build_search_url(self, date_info):
        """Build search URL for New York City"""
        url = f"{self.base_url}/searchresults.html"
        url += f"?dest_id=20088325&dest_type=city"  # New York City
        url += f"&checkin={date_info['check_in']}"
        url += f"&checkout={date_info['check_out']}"
        url += "&group_adults=2&no_rooms=1&group_children=0"
        url += "&selected_currency=USD"  # Set currency to USD for USA
        url += "&order=popularity"  # Sort by popularity
        url += "&nflt=ht_id%3D204"  # Filter for hotels only
        url += "&rows=50"  # Request more results per page
        return url

    def generate_dates(self):
        """Generate dates for search with all required combinations"""
        dates = []
        snapshot_dates = [datetime.now() + timedelta(days=x) for x in [0, 30, 60]]

        # Full set of 450 combinations (3 snapshots × 30 TTT values × 5 LOS values)
        for snapshot in snapshot_dates:
            for ttt in range(1, 31):  # TTT from 1 to 30
                for los in range(1, 6):  # LOS from 1 to 5
                    check_in = snapshot + timedelta(days=ttt)
                    check_out = check_in + timedelta(days=los)
                    dates.append({
                        'snapshot_date': snapshot.strftime('%Y-%m-%d'),
                        'check_in': check_in.strftime('%Y-%m-%d'),
                        'check_out': check_out.strftime('%Y-%m-%d'),
                        'ttt': ttt,
                        'los': los
                    })

        return dates

    def __del__(self):
        """Clean up resources"""
        try:
            if hasattr(self, 'driver'):
                self.driver.quit()
                print("\n Browser closed successfully")
        except:
            pass

if __name__ == "__main__":
    import signal
    import sys
    import traceback

    scraper = None
    # Handler that was active before ours; restored in ``finally`` so the
    # custom handler does not stay installed after this run finishes.
    previous_handler = None

    def signal_handler(sig, frame):
        """On SIGINT: quit the scraper's browser if there is one, then exit with status 0."""
        print("\nReceived interrupt signal, shutting down gracefully...")
        if scraper and hasattr(scraper, 'driver'):
            try:
                scraper.driver.quit()
            except Exception:
                # Best-effort: the process is exiting anyway
                pass
        print("Process terminated by user")
        sys.exit(0)

    try:
        # Register signal handler for CTRL+C
        previous_handler = signal.signal(signal.SIGINT, signal_handler)

        print("\nStarting Booking.com data collection process for New York hotels...")
        scraper = BookingScraper()
        scraper.run_scraping()
    except KeyboardInterrupt:
        print("\nProcess interrupted by keyboard")
    except Exception as e:
        print(f"\nCritical error: {str(e)}")
        print(traceback.format_exc())
    finally:
        # signal.signal returns None when the earlier handler was not installed
        # from Python; in that case there is nothing that can be restored.
        if previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)
        if scraper:
            try:
                scraper.__del__()
            except Exception:
                print("Issues during cleanup")
            print("\nCollection process finished")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Rating: Scored 8.6
8.6
Excellent
356 reviews
Stars: 4 stars
Address: Manhattan, New York
Distance: Distance not available
Room Type: King Studio
Review Count: 356 review
──────────────────────────────────────────────────
Successfully extracted data for hotel #28
Found review count in rating text: 2,693 review
Estimated 3 stars based on numeric rating: 7.4

Hotel #29:
──────────────────────────────────────────────────
Name: TRYP by Wyndham New York City Times Square - Midtown
Price: $890
Rating: Scored 7.4
7.4
Good
2,693 reviews
Stars: 3 stars
Address: Hell's Kitchen, New York
Distance: Distance not available
Room Type: Queen Room
Review Count: 2,693 review
──────────────────────────────────────────────────
Successfully extracted data for hotel #29
Found review count in rating text: 3,611 review
Estimated 3 stars based on numeric rating: 7.8

Hotel #30:
──────────────────────────────────────────────────
Name: Holiday Inn E