In [7]:
%pip install pandas numpy selenium webdriver-manager openpyxl


Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import datetime
import random
import time
import os
import json
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager


The `EnhancedExpediaScraper` class automates hotel data extraction from Expedia using Selenium while mimicking human behavior to reduce the chance of being flagged as a bot. It uses randomized user agents, human-like scrolling and delays, CAPTCHA detection (with a prompt to solve the challenge manually), and session cookies that are saved to and reloaded from disk. The scraper iterates through check-in/check-out date combinations, loads hotel listings dynamically, extracts key details (name, star rating, price, review score and description, location, cancellation policy), and saves the data to a CSV file after each search.

In [None]:
class EnhancedExpediaScraper:
    def __init__(self):
        self.base_url = "https://www.expedia.com"
        self.user_agents = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
        ]
        self.setup_chrome_options()
        
    def setup_chrome_options(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--start-maximized")
        self.chrome_options.add_argument("--window-size=1920,1080")
        self.chrome_options.add_argument("--disable-notifications")
        self.chrome_options.add_argument("--disable-infobars")

        
        # Use a random user agent
        user_agent = random.choice(self.user_agents)
        self.chrome_options.add_argument(f"--user-agent={user_agent}")
        
        # Avoid detection
        self.chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        self.chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.chrome_options.add_experimental_option('useAutomationExtension', False)
        
        # Add preferences to appear more human-like
        prefs = {
            "profile.default_content_setting_values.notifications": 2,
            "credentials_enable_service": False,
            "profile.password_manager_enabled": False
        }
        self.chrome_options.add_experimental_option("prefs", prefs)

    def start_driver(self):
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=self.chrome_options)
        
        # Mask webdriver to avoid detection
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        
        # Additional masking
        driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]})")
        driver.execute_script("Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']})")
        
        return driver
    
    def human_like_delay(self, min_seconds=1, max_seconds=5):
        """Add a random delay to simulate human behavior."""
        delay = random.uniform(min_seconds, max_seconds)
        time.sleep(delay)
        return delay
    
    def human_like_scroll(self, driver, scroll_count=5):
        """Scroll in a human-like pattern."""
        actions = ActionChains(driver)
        
        # Random initial scroll position
        initial_scroll = random.randint(300, 700)
        driver.execute_script(f"window.scrollTo(0, {initial_scroll});")
        self.human_like_delay(0.5, 2)
        
        # Perform random scrolls
        for _ in range(scroll_count):
            # Random scroll amount
            scroll_amount = random.randint(200, 800)
            
            # Occasionally scroll up instead of down
            if random.random() < 0.2:
                scroll_amount = -scroll_amount
                
            # Execute scroll
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            
            # Random pause between scrolls
            self.human_like_delay(0.5, 2)
            
            # Occasionally move the mouse
            if random.random() < 0.3:
                x = random.randint(100, 1000)
                y = random.randint(100, 600)
                actions.move_by_offset(x, y).perform()
                self.human_like_delay(0.2, 1)
    
    def is_captcha_page(self, driver):
        """Check if the current page is a CAPTCHA challenge page."""
        try:
            # Look for common CAPTCHA indicators
            captcha_indicators = [
                "captcha", "bot check", "human verification", 
                "security check", "show us your human side", 
                "bot or not", "verify you're human"
            ]
            
            page_source = driver.page_source.lower()
            page_title = driver.title.lower()
            
            for indicator in captcha_indicators:
                if indicator in page_source or indicator in page_title:
                    print(f"CAPTCHA detected: '{indicator}' found on page")
                    return True
                    
            # Check for specific CAPTCHA elements
            captcha_elements = driver.find_elements(By.CSS_SELECTOR, 
                "#ARKOSE-CHALLENGE, #DATADOME-CHALLENGE, #captcha, .captcha, .g-recaptcha")
            
            if captcha_elements:
                print(f"CAPTCHA element detected: {len(captcha_elements)} elements found")
                return True
                
            return False
        except Exception as e:
            print(f"Error checking for CAPTCHA: {e}")
            return False
    
    def handle_captcha(self, driver):
        """Handle CAPTCHA detection with guidance."""
        print("\n=== CAPTCHA DETECTED ===")
        print("The website has detected automated access and is showing a CAPTCHA challenge.")
        
        # Save a screenshot for manual inspection
        try:
            driver.save_screenshot("expedia_captcha.png")
            print("Screenshot saved as 'expedia_captcha.png'")
        except Exception as e:
            print(f"Error saving screenshot: {e}")
        
        # Prompt for manual CAPTCHA solving
        print("\nWould you like to manually solve the CAPTCHA? (y/n)")
        user_input = input()
        
        if user_input.lower() == 'y':
            print("Please solve the CAPTCHA in the browser window.")
            print("Press Enter when you've completed the CAPTCHA...")
            input()
            
            # Check if CAPTCHA is still present
            if self.is_captcha_page(driver):
                print("CAPTCHA is still present. Unable to proceed.")
                return False
            else:
                print("CAPTCHA solved successfully!")
                return True
        else:
            print("Skipping manual CAPTCHA solving.")
            return False
    
    def wait_for_element(self, driver, by, value, timeout=10):
        try:
            element = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((by, value))
            )
            return element
        except TimeoutException:
            print(f"Timeout waiting for element: {value}")
            return None

    def get_ny_hotels_url(self, checkin_date, checkout_date):
        # Format dates for Expedia URL (MM/DD/YYYY)
        checkin_formatted = datetime.datetime.strptime(checkin_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        checkout_formatted = datetime.datetime.strptime(checkout_date, "%Y-%m-%d").strftime("%m/%d/%Y")
        
        # Construct the URL for New York hotels
        url = f"{self.base_url}/Hotel-Search?destination=New%20York,%20New%20York,%20United%20States%20of%20America&startDate={checkin_formatted}&endDate={checkout_formatted}&rooms=1&adults=2&children=0"
        return url

    def accept_cookies(self, driver):
        """Accept cookies if prompted."""
        try:
            cookie_buttons = driver.find_elements(By.CSS_SELECTOR, "button#onetrust-accept-btn-handler, button[data-stid='button-accept']")
            if cookie_buttons:
                # Add a human-like delay before clicking
                self.human_like_delay(1, 3)
                
                # Move mouse to the button before clicking
                actions = ActionChains(driver)
                actions.move_to_element(cookie_buttons[0]).perform()
                self.human_like_delay(0.5, 1.5)
                
                cookie_buttons[0].click()
                print("Accepted cookies")
                self.human_like_delay(1, 2)
            else:
                print("No cookie banner found or already accepted")
        except Exception as e:
            print(f"Error accepting cookies: {e}")
    
    def scroll_to_load_hotels(self, driver, min_hotels=100, max_scrolls=30):
        """
        Scroll down the page to load more hotels.
        """
        print(f"Scrolling to load at least {min_hotels} hotels...")
        
        # Initial count of hotels
        hotels = driver.find_elements(By.CSS_SELECTOR, "div[data-stid='property-listing'], li[data-stid='property-listing']")
        if not hotels:
            hotels = driver.find_elements(By.CSS_SELECTOR, "div.uitk-card:has(h3), div.uitk-card:has(div[data-stid='price-lockup'])")
        
        initial_count = len(hotels)
        print(f"Initial hotel count: {initial_count}")
        
        # Scroll until we have enough hotels or reach max_scrolls
        scroll_count = 0
        last_count = initial_count
        consecutive_no_change = 0
        
        while len(hotels) < min_hotels and scroll_count < max_scrolls:
            # Perform a human-like scroll
            self.human_like_scroll(driver, scroll_count=3)
            
            # Add a longer delay every few scrolls to allow content to load
            if scroll_count % 3 == 0:
                self.human_like_delay(3, 5)
            else:
                self.human_like_delay(1, 3)
            
            # Check if we need to click "Show more" button
            try:
                show_more_buttons = driver.find_elements(By.CSS_SELECTOR, 
                    "button.uitk-button, button.uitk-card-link, a.uitk-card-link")
                
                for button in show_more_buttons:
                    if button.is_displayed() and button.is_enabled():
                        button_text = button.text.lower()
                        if "show more" in button_text or "load more" in button_text or "next" in button_text:
                            print(f"Clicking '{button.text}' button...")
                            button.click()
                            self.human_like_delay(3, 5)
                            break
            except Exception as e:
                print(f"Error clicking 'Show more' button: {e}")
            
            # Re-count hotels
            hotels = driver.find_elements(By.CSS_SELECTOR, "div[data-stid='property-listing'], li[data-stid='property-listing']")
            if not hotels:
                hotels = driver.find_elements(By.CSS_SELECTOR, "div.uitk-card:has(h3), div.uitk-card:has(div[data-stid='price-lockup'])")
            
            current_count = len(hotels)
            print(f"Scroll {scroll_count+1}/{max_scrolls}: Found {current_count} hotels")
            
            # Check if the count has changed
            if current_count == last_count:
                consecutive_no_change += 1
                # If no change for 5 consecutive scrolls, try a different approach
                if consecutive_no_change >= 5:
                    print("No new hotels loaded after multiple scrolls. Trying different approach...")
                    
                    # Try scrolling to the bottom of the page
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    self.human_like_delay(3, 5)
                    
                    # Try clicking any "Next page" buttons
                    try:
                        next_buttons = driver.find_elements(By.CSS_SELECTOR, 
                            "button[data-stid='show-more-results'], button.pagination-next, a.pagination-next, nav button, nav a")
                        
                        for button in next_buttons:
                            if button.is_displayed() and button.is_enabled():
                                button_text = button.text.lower()
                                if "next" in button_text or "more" in button_text or ">" in button_text:
                                    print(f"Clicking '{button.text}' button...")
                                    button.click()
                                    self.human_like_delay(5, 8)
                                    break
                    except Exception as e:
                        print(f"Error clicking 'Next page' button: {e}")
                    
                    # Reset counter if we've tried alternative approaches
                    consecutive_no_change = 0
            else:
                consecutive_no_change = 0
            
            last_count = current_count
            scroll_count += 1
        
        # Final count
        hotels = driver.find_elements(By.CSS_SELECTOR, "div[data-stid='property-listing'], li[data-stid='property-listing']")
        if not hotels:
            hotels = driver.find_elements(By.CSS_SELECTOR, "div.uitk-card:has(h3), div.uitk-card:has(div[data-stid='price-lockup'])")
        
        final_count = len(hotels)
        print(f"Finished scrolling. Loaded {final_count} hotels (target: {min_hotels})")
        
        return final_count

    def extract_hotel_data(self, hotel_element, snapshot_date, ttt, los, checkin_date, checkout_date):
        hotel_data = {
            "snapshot_date": snapshot_date,
            "ttt": ttt,
            "los": los,
            "checkin_date": checkin_date,
            "checkout_date": checkout_date
        }
        
        try:
            # Hotel name
            try:
                name_element = hotel_element.find_element(By.CSS_SELECTOR, "h3.uitk-heading, h2.uitk-heading, div.uitk-heading")
                hotel_data["name"] = name_element.text
            except:
                hotel_data["name"] = "N/A"
            
            # Hotel stars
            try:
                # Method 1: Look for star rating elements
                stars_elements = hotel_element.find_elements(By.CSS_SELECTOR, "span.uitk-rating-star, span.uitk-star-rating, span[data-stid='content-hotel-stars']")
                if stars_elements:
                    hotel_data["stars"] = len(stars_elements)
                else:
                    # Method 2: Look for text with star rating
                    star_text_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div.uitk-text, span.uitk-text, div[data-stid='content-hotel-badge']")
                    for element in star_text_elements:
                        text = element.text.lower()
                        if "star" in text and any(str(i) in text for i in range(1, 6)):
                            # Extract the number from text like "4-star hotel"
                            match = re.search(r'(\d+)[- ]star', text)
                            if match:
                                hotel_data["stars"] = int(match.group(1))
                                break
                
                # Method 3: Look for aria-label attributes that might contain star information
                if "stars" not in hotel_data or not hotel_data["stars"]:
                    elements_with_aria = hotel_element.find_elements(By.CSS_SELECTOR, "[aria-label]")
                    for element in elements_with_aria:
                        aria_text = element.get_attribute("aria-label").lower()
                        if "star" in aria_text and any(str(i) in aria_text for i in range(1, 6)):
                            match = re.search(r'(\d+)[- ]star', aria_text)
                            if match:
                                hotel_data["stars"] = int(match.group(1))
                                break
            except:
                hotel_data["stars"] = None
            
            # Price
            try:
                # Method 1: Look for price elements with specific data attributes
                price_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div[data-stid='price-lockup'], span[data-stid='price-lockup']")
                if price_elements:
                    price_text = price_elements[0].text
                    # Extract price using regex
                    price_match = re.search(r'\$(\d+(?:,\d+)?(?:\.\d+)?)', price_text)
                    if price_match:
                        price_str = price_match.group(1).replace(',', '')
                        hotel_data["price"] = float(price_str)
                else:
                    # Method 2: Look for any element containing a dollar sign
                    all_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div.uitk-text, span.uitk-text, div.uitk-price-lockup")
                    for element in all_elements:
                        text = element.text
                        if '$' in text:
                            price_match = re.search(r'\$(\d+(?:,\d+)?(?:\.\d+)?)', text)
                            if price_match:
                                price_str = price_match.group(1).replace(',', '')
                                hotel_data["price"] = float(price_str)
                                break
            except:
                hotel_data["price"] = None
            
            # Rating score
            try:
                # Method 1: Look for review score elements
                rating_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div[data-stid='property-review-score'] span.uitk-text-emphasis-theme, span[data-stid='content-hotel-reviews-rating']")
                if rating_elements:
                    score_text = rating_elements[0].text
                    if score_text and score_text.replace('.', '', 1).isdigit():
                        hotel_data["rating_score"] = float(score_text)
                else:
                    # Method 2: Look for any element that might contain a rating score (typically a number out of 10 or 5)
                    all_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div.uitk-text, span.uitk-text, div.uitk-rating")
                    for element in all_elements:
                        text = element.text
                        if text and text.replace('.', '', 1).isdigit() and len(text) <= 4:
                            score = float(text)
                            if 0 <= score <= 10:  # Validate it's likely a rating
                                hotel_data["rating_score"] = score
                                break
                
                # Method 3: Check for aria-label attributes that might contain rating information
                if "rating_score" not in hotel_data or not hotel_data["rating_score"]:
                    elements_with_aria = hotel_element.find_elements(By.CSS_SELECTOR, "[aria-label]")
                    for element in elements_with_aria:
                        aria_text = element.get_attribute("aria-label").lower()
                        if "rating" in aria_text or "score" in aria_text or "out of" in aria_text:
                            # Look for patterns like "4.2 out of 5" or "rating: 8.5"
                            match = re.search(r'(\d+\.\d+)(?:\s*\/|\s+out\s+of\s+)?(?:\s*\d+)?', aria_text)
                            if match:
                                hotel_data["rating_score"] = float(match.group(1))
                                break
            except:
                hotel_data["rating_score"] = None
            
            # Rating description (good/excellent)
            try:
                # Look for elements that might contain rating descriptions
                rating_desc_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div[data-stid='property-review-score'] span:not(.uitk-text-emphasis-theme), span[data-stid='content-hotel-reviews-qualifier']")
                if rating_desc_elements:
                    for element in rating_desc_elements:
                        text = element.text.lower()
                        if any(word in text for word in ["good", "excellent", "fair", "poor", "wonderful", "very good"]):
                            hotel_data["rating_description"] = element.text
                            break
                
                # If not found, try a broader search
                if "rating_description" not in hotel_data or not hotel_data["rating_description"]:
                    all_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div.uitk-text, span.uitk-text, div.uitk-rating-qualifier")
                    for element in all_elements:
                        text = element.text.lower()
                        if any(word in text for word in ["good", "excellent", "fair", "poor", "wonderful", "very good"]):
                            hotel_data["rating_description"] = element.text
                            break
            except:
                hotel_data["rating_description"] = None
            
            # Location description
            try:
                # Method 1: Look for location elements with specific data attributes
                location_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div[data-stid='content-hotel-neighborhood'], span[data-stid='content-hotel-neighborhood']")
                if location_elements:
                    hotel_data["location"] = location_elements[0].text
                else:
                    # Method 2: Look for elements that might contain location information
                    location_candidates = hotel_element.find_elements(By.CSS_SELECTOR, "div.uitk-text.uitk-type-300, span.uitk-text, div.uitk-text-default-theme")
                    for element in location_candidates:
                        text = element.text.lower()
                        # Location often mentions distance, area names, or landmarks
                        if any(word in text for word in ["miles from", "km from", "downtown", "center", "near", "located in", "manhattan", "brooklyn", "queens", "bronx", "staten island"]):
                            hotel_data["location"] = element.text
                            break
                
                # Method 3: Check for aria-label attributes that might contain location information
                if "location" not in hotel_data or not hotel_data["location"]:
                    elements_with_aria = hotel_element.find_elements(By.CSS_SELECTOR, "[aria-label]")
                    for element in elements_with_aria:
                        aria_text = element.get_attribute("aria-label").lower()
                        if "location" in aria_text or "situated" in aria_text or "miles from" in aria_text:
                            hotel_data["location"] = aria_text
                            break
            except:
                hotel_data["location"] = None
            
            # Cancellation policy
            try:
                # Look for elements that might contain cancellation policy information
                all_elements = hotel_element.find_elements(By.CSS_SELECTOR, "div.uitk-text, span.uitk-text, div.uitk-badge, div[data-stid='content-hotel-cancellation']")
                for element in all_elements:
                    text = element.text.lower()
                    if "refundable" in text or "free cancellation" in text or "cancel for free" in text:
                        hotel_data["cancellation_policy"] = element.text
                        break
            except:
                hotel_data["cancellation_policy"] = None

            return hotel_data
            
        except Exception as e:
            print(f"Error extracting hotel data: {e}")
            return hotel_data

    def visit_homepage_first(self, driver):
        """Visit the homepage first to appear more like a regular user."""
        try:
            print("Visiting Expedia homepage first...")
            driver.get(self.base_url)
            
            # Accept cookies if prompted
            self.accept_cookies(driver)
            
            # Simulate human browsing behavior
            self.human_like_delay(2, 5)
            self.human_like_scroll(driver, random.randint(2, 4))
            
            # Click on random elements occasionally
            try:
                nav_elements = driver.find_elements(By.CSS_SELECTOR, "nav a, .uitk-tab-anchor")
                if nav_elements and random.random() < 0.5:
                    random_element = random.choice(nav_elements)
                    actions = ActionChains(driver)
                    actions.move_to_element(random_element).perform()
                    self.human_like_delay(0.5, 1.5)

            except:
                pass
            
            print("Finished visiting homepage")
            return True
        except Exception as e:
            print(f"Error visiting homepage: {e}")
            return False

    def save_cookies(self, driver, filename="expedia_cookies.json"):
        """Save cookies for future sessions."""
        try:
            cookies = driver.get_cookies()
            with open(filename, "w") as f:
                json.dump(cookies, f)
            print(f"Cookies saved to {filename}")
        except Exception as e:
            print(f"Error saving cookies: {e}")

    def load_cookies(self, driver, filename="expedia_cookies.json"):
        """Load cookies from a previous session."""
        try:
            if os.path.exists(filename):
                with open(filename, "r") as f:
                    cookies = json.load(f)
                
                # Visit the domain first
                driver.get(self.base_url)
                
                # Add the cookies
                for cookie in cookies:
                    try:
                        driver.add_cookie(cookie)
                    except:
                        pass
                
                print(f"Cookies loaded from {filename}")
                return True
            else:
                print(f"Cookie file {filename} not found")
                return False
        except Exception as e:
            print(f"Error loading cookies: {e}")
            return False

    def scrape_expedia(self, ttt_values=range(1, 31), los_values=range(1, 6), snapshot_dates=None, target_hotels_per_date=150):

        if snapshot_dates is None:
            # If no snapshot dates provided, use today
            snapshot_dates = [datetime.datetime.now().strftime("%Y-%m-%d")]
        
        all_data = []
        hotels_per_date = {date: 0 for date in snapshot_dates}
        driver = self.start_driver()
        
        try:
            # First, try to load cookies from a previous session
            self.load_cookies(driver)
            
            # Visit the homepage first to appear more like a regular user
            self.visit_homepage_first(driver)
            
            # Save cookies after visiting homepage
            self.save_cookies(driver)
            
            for snapshot_date in snapshot_dates:
                # Skip if we already have enough hotels for this date
                if hotels_per_date[snapshot_date] >= target_hotels_per_date:
                    print(f"Already collected {hotels_per_date[snapshot_date]} hotels for {snapshot_date}, skipping...")
                    continue
                    
                print(f"\nScraping for snapshot date: {snapshot_date}")
                print(f"Current hotel count: {hotels_per_date[snapshot_date]}/{target_hotels_per_date}")
                
                for ttt in ttt_values:
                    # Skip if we already have enough hotels for this date
                    if hotels_per_date[snapshot_date] >= target_hotels_per_date:
                        print(f"Reached target of {target_hotels_per_date} hotels for {snapshot_date}, moving to next date...")
                        break
                        
                    for los in los_values:
                        # Skip if we already have enough hotels for this date
                        if hotels_per_date[snapshot_date] >= target_hotels_per_date:
                            break
                            
                        try:
                            # Calculate check-in and check-out dates
                            snapshot_date_obj = datetime.datetime.strptime(snapshot_date, "%Y-%m-%d")
                            checkin_date_obj = snapshot_date_obj + datetime.timedelta(days=ttt)
                            checkout_date_obj = checkin_date_obj + datetime.timedelta(days=los)
                            
                            checkin_date = checkin_date_obj.strftime("%Y-%m-%d")
                            checkout_date = checkout_date_obj.strftime("%Y-%m-%d")
                            
                            # Construct URL and navigate to it
                            url = self.get_ny_hotels_url(checkin_date, checkout_date)
                            print(f"Navigating to: {url}")
                            driver.get(url)
                            
                            # Check for CAPTCHA
                            if self.is_captcha_page(driver):
                                captcha_solved = self.handle_captcha(driver)
                                if not captcha_solved:
                                    print("Stopping scraping due to CAPTCHA")
                                    return all_data
                            
                            # Accept cookies if prompted
                            self.accept_cookies(driver)
                            
                            # Wait for the search results to load with human-like delay
                            self.human_like_delay(3, 7)
                            
                            # Calculate how many more hotels we need for this date
                            hotels_needed = target_hotels_per_date - hotels_per_date[snapshot_date]
                            
                            # Scroll to load more hotels in a human-like manner
                            # Target slightly more than we need to account for duplicates
                            hotels_loaded = self.scroll_to_load_hotels(driver, min_hotels=min(150, hotels_needed + 20))
                            
                            # Extract hotel data
                            hotels = driver.find_elements(By.CSS_SELECTOR, "div[data-stid='property-listing'], li[data-stid='property-listing']")
                            
                            if not hotels:
                                print("No hotels found, trying alternative selectors...")
                                hotels = driver.find_elements(By.CSS_SELECTOR, "div.uitk-card:has(h3), div.uitk-card:has(div[data-stid='price-lockup'])")
                            
                            print(f"Found {len(hotels)} hotels")
                            
                            # Limit the number of hotels to process to avoid exceeding our target
                            hotels_to_process = hotels[:min(len(hotels), hotels_needed + 10)]
                            
                            for hotel in hotels_to_process:
                                # Skip if we already have enough hotels for this date
                                if hotels_per_date[snapshot_date] >= target_hotels_per_date:
                                    break
                                    
                                hotel_data = self.extract_hotel_data(
                                    hotel,
                                    snapshot_date,
                                    ttt,
                                    los,
                                    checkin_date,
                                    checkout_date
                                )
                                
                                # Add to our results and update counter
                                all_data.append(hotel_data)
                                hotels_per_date[snapshot_date] += 1
                                
                                # Print progress every 10 hotels
                                if hotels_per_date[snapshot_date] % 10 == 0:
                                    print(f"Progress: {hotels_per_date[snapshot_date]}/{target_hotels_per_date} hotels for {snapshot_date}")
                            
                            # Save the current results to CSV after each search
                            if all_data:
                                temp_df = pd.DataFrame(all_data)
                                temp_csv = f"expedia_hotels_partial_{snapshot_date}.csv"
                                temp_df.to_csv(temp_csv, index=False)
                                print(f"Partial results saved to {temp_csv}")
                            
                            # If we've reached our target for this date, break out of the los loop
                            if hotels_per_date[snapshot_date] >= target_hotels_per_date:
                                print(f"Reached target of {target_hotels_per_date} hotels for {snapshot_date}")
                                break
                            
                            # Human-like delay between requests to avoid detection
                            self.human_like_delay(5, 15)
                            
                        except Exception as e:
                            print(f"Error scraping TTT={ttt}, LOS={los}: {e}")
                            # Continue with the next combination
                
                # Print progress after completing a snapshot date
                print(f"Completed scraping for {snapshot_date}: {hotels_per_date[snapshot_date]}/{target_hotels_per_date} hotels")
                
        except Exception as e:
            print(f"Error during scraping: {e}")
        
        finally:
            # Save cookies before quitting
            self.save_cookies(driver)
            driver.quit()
        
        # Print final summary
        print("\nFinal results summary:")
        for date, count in hotels_per_date.items():
            print(f"- {date}: {count}/{target_hotels_per_date} hotels")
        
        return all_data

In [7]:
%pip install --upgrade pandas openpyxl xlrd



Note: you may need to restart the kernel to use updated packages.


This code reads every CSV file in a specified folder, merges their contents into a single DataFrame, and saves the result to an Excel file. First, the code checks whether the folder contains any CSV files; if none are found, it prints a message and stops. Otherwise, each CSV file is read into its own DataFrame and collected in a list, and the list is combined into a single DataFrame with pd.concat. Finally, the merged DataFrame is written to a new Excel file without the row index (index=False).

In [16]:
import os
import pandas as pd

def merge_csv_to_excel(folder_path, output_file):
    """Merge every CSV file found directly in ``folder_path`` into one Excel file.

    Args:
        folder_path: Directory that is scanned (non-recursively) for CSV files.
        output_file: Path of the Excel workbook to write.

    Returns:
        None. Progress and the outcome are reported with ``print``.

    Files are matched on a case-insensitive ``.csv`` extension and processed in
    sorted name order, so the row order of the merged workbook is reproducible.
    Directories whose name ends in ``.csv`` are ignored, and CSV files with no
    content are skipped with a message instead of aborting the merge.
    """
    # os.listdir returns names in arbitrary order, so sort for a stable result.
    all_files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith('.csv') and os.path.isfile(os.path.join(folder_path, f))
    )

    if not all_files:
        print("No CSV files found in the directory.")
        return

    dataframes = []
    for file in all_files:
        file_path = os.path.join(folder_path, file)
        try:
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; it contributes no rows.
            print(f"Skipping empty CSV file: {file_path}")
            continue
        dataframes.append(df)

    if not dataframes:
        # pd.concat raises on an empty list, so stop here with a clear message.
        print("No readable CSV data found in the directory.")
        return

    # Stack all frames row-wise; columns are aligned by name
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save to Excel (index=False leaves the row index out of the file)
    merged_df.to_excel(output_file, index=False)
    print(f"Merged CSV files saved to {output_file}")

# Example usage
folder_path = r"C:\Users\Noa\OneDrive\Desktop\ex"  # Update with the path to your folder
output_file = "merged_output.xlsx"  # Name of the output file
# Merge every CSV in folder_path and write the combined rows to output_file
merge_csv_to_excel(folder_path, output_file)

Merged CSV files saved to merged_output.xlsx


The code performs human-like scrolling with random mouse movements and delays between actions to simulate natural user behavior. It scrolls the page randomly in different directions, occasionally changes the scroll direction, and moves the mouse to different areas on the screen. Each action is followed by a random delay to avoid appearing "robotic."

In [None]:
def human_like_scroll(self, driver, scroll_count=5):
    """Scroll in a human-like pattern while keeping cursor movement inside the viewport.

    Args:
        self: Object providing ``human_like_delay(min, max)``, which is called
            after every scroll and cursor move.
        driver: Selenium WebDriver controlling the page to scroll.
        scroll_count: Number of random scroll steps performed after the
            initial scroll.

    Returns:
        None.

    Cursor targets are sent as absolute viewport coordinates. A relative
    ``move_by_offset`` would add each random position to the pointer's current
    position, so repeated moves would drift past the page edge and raise an
    out-of-bounds error.
    """
    edge_margin = 50  # keep the cursor this many pixels away from the edges

    # Prefer the real viewport size: the outer window size also counts browser
    # toolbars and borders, which the pointer cannot be moved onto.
    viewport = driver.execute_script("return [window.innerWidth, window.innerHeight];")
    if isinstance(viewport, (list, tuple)) and len(viewport) == 2 and all(viewport):
        max_width, max_height = int(viewport[0]), int(viewport[1])
    else:
        window_size = driver.get_window_size()
        max_width, max_height = window_size["width"], window_size["height"]

    # Clamp the ranges so random.randint never receives an empty interval on
    # very small windows.
    min_x = min(edge_margin, max_width // 2)
    max_x = max(min_x, max_width - edge_margin)
    min_y = min(edge_margin, max_height // 2)
    max_y = max(min_y, max_height - edge_margin)

    # Perform an initial scroll with a random value between 300 and 700 pixels
    initial_scroll = random.randint(300, 700)
    driver.execute_script(f"window.scrollTo(0, {initial_scroll});")
    self.human_like_delay(0.5, 2)

    # Perform scrolling for the specified number of times (scroll_count)
    for _ in range(scroll_count):
        scroll_amount = random.randint(200, 800)

        # Occasionally (20% chance), reverse the direction of the scroll
        if random.random() < 0.2:
            scroll_amount = -scroll_amount
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        self.human_like_delay(0.5, 2)

        # Occasionally (30% chance), simulate random cursor movement for more human-like behavior
        if random.random() < 0.3:
            safe_x = random.randint(min_x, max_x)
            safe_y = random.randint(min_y, max_y)

            # A fresh chain per move avoids replaying earlier queued actions;
            # move_to_location targets a viewport position, not an offset.
            pointer_move = ActionChains(driver)
            pointer_move.w3c_actions.pointer_action.move_to_location(safe_x, safe_y)
            pointer_move.perform()

            self.human_like_delay(0.2, 1)

In [None]:
import pandas as pd
import os
import datetime


def log_search_progress(search_count, total_searches, snapshot_date, ttt, los, checkin_date, checkout_date):
    """Print a framed banner describing the search that is about to run.

    Args:
        search_count: Position of this search in the overall run.
        total_searches: Total number of searches planned.
        snapshot_date: Snapshot date the search belongs to.
        ttt: Days before check-in.
        los: Nights of stay.
        checkin_date: Check-in date shown in the banner.
        checkout_date: Check-out date shown in the banner.
    """
    separator = "=" * 60
    banner = [
        separator,
        f"[{search_count}/{total_searches}] Searching...",
        f"Snapshot Date: {snapshot_date}",
        f"TTT (Days Before Check-in): {ttt}",
        f"LOS (Nights of Stay): {los}",
        f"Check-in Date: {checkin_date}",
        f"Check-out Date: {checkout_date}",
        separator,
    ]
    # One print call; the joined text matches eight consecutive prints
    print("\n".join(banner))

# Build the scraper once; the same instance is used for every search below
scraper = EnhancedExpediaScraper()

print("\nStarting Enhanced Expedia Scraper for specific dates...\n")

# Snapshot dates to scrape (17-19 March 2025)
snapshot_dates = [f"2025-03-{day}" for day in (17, 18, 19)]

# TTT = days before check-in (1..30), LOS = nights of stay (1..5)
ttt_values = range(1, 31)
los_values = range(1, 6)

# Report the parameters used for this run, one per line
print(
    "Scraping with parameters:",
    f"- Snapshot dates: {snapshot_dates}",
    f"- TTT values: {list(ttt_values)}",
    f"- LOS values: {list(los_values)}",
    sep="\n",
)

# Per-date hotel target and the number of (date, TTT, LOS) combinations to run
target_hotels_per_date = 100
total_searches = len(snapshot_dates) * len(ttt_values) * len(los_values)  # 3 * 30 * 5 = 450

# Running counter plus the two accumulators filled by the search loop
search_count = 0
all_results, all_searches = [], []

# Run one search per (snapshot date, TTT, LOS) combination.
# Each search writes its own CSV, so an interrupted run can be resumed:
# combinations whose CSV already exists are not scraped again, but their saved
# rows are reloaded so the combined workbook still covers them.
try:
    # Loop through each snapshot date
    for snapshot_date in snapshot_dates:
        print(f"\nStarting searches for snapshot date: {snapshot_date}\n")

        # Parsed once per snapshot date; every TTT/LOS combination offsets from it
        snapshot_ts = pd.to_datetime(snapshot_date)

        # Loop through each TTT and LOS value to generate different combinations
        for ttt in ttt_values:
            for los in los_values:
                search_count += 1

                # Calculate check-in and check-out dates based on TTT and LOS
                checkin_date = snapshot_ts + pd.Timedelta(days=ttt)
                checkout_date = checkin_date + pd.Timedelta(days=los)
                checkin_date_str = checkin_date.strftime("%Y-%m-%d")
                checkout_date_str = checkout_date.strftime("%Y-%m-%d")

                # Generate a filename for storing the results of this search
                partial_filename = f"expedia_scraped_hotels_{snapshot_date}_TTT{ttt}_LOS{los}.csv"
                if os.path.exists(partial_filename):
                    print(f"Skipping search for {snapshot_date}, TTT={ttt}, LOS={los} (already exists)")

                    # Reload the saved rows; otherwise a resumed run would write
                    # a combined workbook that silently omits finished searches.
                    try:
                        cached_results = pd.read_csv(partial_filename).to_dict("records")
                    except (OSError, ValueError) as e:
                        # ValueError covers pandas' empty-file and parser errors
                        print(f"Could not reload {partial_filename}: {e}")
                        cached_results = []

                    if cached_results:
                        all_results.extend(cached_results)
                        all_searches.append(cached_results[0])
                    continue

                log_search_progress(search_count, total_searches, snapshot_date, ttt, los, checkin_date_str, checkout_date_str)

                try:
                    # Call the scraper to get results for this set of parameters
                    partial_results = scraper.scrape_expedia(
                        ttt_values=[ttt],
                        los_values=[los],
                        snapshot_dates=[snapshot_date],
                        target_hotels_per_date=target_hotels_per_date
                    )

                    # If no results are found, log it and substitute a placeholder
                    # record so the search still gets a CSV and is not retried
                    if not partial_results:
                        print(f"No results found for {snapshot_date}, TTT={ttt}, LOS={los}. Saving empty record.")
                        partial_results = [{
                            "snapshot_date": snapshot_date,
                            "ttt": ttt,
                            "los": los,
                            "checkin_date": checkin_date_str,
                            "checkout_date": checkout_date_str,
                            "name": None,
                            "price": None,
                            "stars": None,
                            "rating_score": None,
                            "location": None
                        }]

                    # Save the partial results to a CSV file
                    df_partial = pd.DataFrame(partial_results)
                    df_partial.to_csv(partial_filename, index=False, encoding="utf-8")
                    print(f"Saved results in {partial_filename}")

                    # all_results keeps every row; all_searches keeps one
                    # representative row (the first) per search
                    all_results.extend(partial_results)
                    all_searches.append(partial_results[0])

                except Exception as e:
                    print(f"Error during search for {snapshot_date}, TTT={ttt}, LOS={los}: {e}")

                    # If an error occurs, log the error and save an error record
                    error_result = {
                        "snapshot_date": snapshot_date,
                        "ttt": ttt,
                        "los": los,
                        "checkin_date": checkin_date_str,
                        "checkout_date": checkout_date_str,
                        "name": "ERROR",
                        "price": None,
                        "stars": None,
                        "rating_score": None,
                        "location": None
                    }
                    all_searches.append(error_result)

                    # The error file uses a different name than partial_filename,
                    # so the existence check above will retry this search next run
                    error_filename = f"expedia_scraped_hotels_{snapshot_date}_TTT{ttt}_LOS{los}_error.csv"
                    pd.DataFrame([error_result]).to_csv(error_filename, index=False)
                    print(f"Saved error record: {error_filename}")

    # Save the combined per-search records to a final Excel file
    # NOTE(review): this writes all_searches (one row per search), not
    # all_results (every hotel row) - confirm that is the intended content.
    df_all = pd.DataFrame(all_searches)
    final_filename = "expedia_scraped_hotels_combined.xlsx"
    df_all.to_excel(final_filename, index=False, engine="openpyxl")
    print(f"\nFinal combined results saved to {final_filename}")

except Exception as e:
    print(f"Error during scraping: {e}")
    # NOTE(review): no sample data generator is invoked in this cell.
    print("Using sample data generator instead...")
    # Keep df_all defined so the preview below shows what was collected
    # instead of raising NameError after an early failure.
    df_all = pd.DataFrame(all_searches)

# Display the first few rows of the combined results
df_all.head()


Starting Enhanced Expedia Scraper for specific dates...

Scraping with parameters:
- Snapshot dates: ['2025-03-17', '2025-03-18', '2025-03-19']
- TTT values: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
- LOS values: [1, 2, 3, 4, 5]

Starting searches for snapshot date: 2025-03-17

Skipping search for 2025-03-17, TTT=1, LOS=1 (already exists)
Skipping search for 2025-03-17, TTT=1, LOS=2 (already exists)
Skipping search for 2025-03-17, TTT=1, LOS=3 (already exists)
Skipping search for 2025-03-17, TTT=1, LOS=4 (already exists)
Skipping search for 2025-03-17, TTT=1, LOS=5 (already exists)
Skipping search for 2025-03-17, TTT=2, LOS=1 (already exists)
Skipping search for 2025-03-17, TTT=2, LOS=2 (already exists)
Skipping search for 2025-03-17, TTT=2, LOS=3 (already exists)
Skipping search for 2025-03-17, TTT=2, LOS=4 (already exists)
Skipping search for 2025-03-17, TTT=2, LOS=5 (already exists)
Skipping search for 2025-03-1