# In [None]:
# Saves only the product names and image links (category: chips).
import json
import csv
import time
from datetime import datetime, timezone, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import logging
import os
import random
import atexit
# Removed requests import as it's no longer directly used for image download
# Removed urlsplit import as it was used with requests

# --- Configuration ---
# --- GOOGLE CHROME SPECIFIC PATHS ---
# User data directory (profile) of the Google Chrome instance the script attaches to
GOOGLE_CHROME_PROFILE_PATH = r"L:\temp\scrapellm\chrome_profile"
# Remote debugging port of that Chrome/Chromium instance (9222 is the customary choice)
GOOGLE_CHROME_DEBUGGING_PORT = 9222

# Root folder for the log file and the JSON data; every other output path hangs off it
BASE_OUTPUT_DIR = r"D:\Docs\data"
OUTPUT_JSON_FILE = os.path.join(BASE_OUTPUT_DIR, "snacksKSA.json")
LOG_FILE = os.path.join(BASE_OUTPUT_DIR, "automation_log.log")
# SCREENSHOT_DIR and IMAGE_SAVE_DIR are no longer written to by this version.
# They are still defined (and created below) only to keep the existing
# directory layout unchanged.
SCREENSHOT_DIR = os.path.join(BASE_OUTPUT_DIR, "screenshots")
IMAGE_SAVE_DIR = os.path.join(BASE_OUTPUT_DIR, "product_images")


# Make sure every output directory exists before anything tries to write into it
for _output_dir in (BASE_OUTPUT_DIR, SCREENSHOT_DIR, IMAGE_SAVE_DIR):
    os.makedirs(_output_dir, exist_ok=True)


# --- Logging Setup ---
# Every record goes to both the log file and the console.
# The file handler is opened as UTF-8 explicitly: scraped product names are
# logged verbatim and may contain non-ASCII text, which the platform default
# encoding (for example cp1252 on Windows) cannot always encode.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Module-level handle to the active WebDriver so the atexit hook can reach it.
global_driver = None

def cleanup_driver():
    """
    Quits the WebDriver stored in `global_driver` (if any) and clears the reference.

    Errors raised by quit() are logged rather than propagated. The reference is
    cleared in every case, so a failed quit() does not leave a dead session
    behind for a later call to retry.
    """
    global global_driver
    if not global_driver:
        return
    try:
        logger.info("Attempting to close WebDriver gracefully via atexit.")
        global_driver.quit()
        logger.info("WebDriver closed successfully by atexit handler.")
    except Exception as e:
        logger.error(f"Error closing WebDriver via atexit handler: {e}")
    finally:
        # Drop the handle even when quit() failed; the session is unusable either way.
        global_driver = None

# Run the cleanup automatically when the interpreter exits.
atexit.register(cleanup_driver)

def initialize_driver():
    """
    Attaches a Selenium ChromeDriver session to an already running Google Chrome.

    The session is stored in the module-level `global_driver` and also returned.
    Any failure is logged at CRITICAL level (with traceback) and re-raised so the
    script stops when no driver is available.
    """
    global global_driver
    try:
        options = ChromeOptions()

        # Attach to the browser that is listening on the remote debugging port
        debugger_address = f"localhost:{GOOGLE_CHROME_DEBUGGING_PORT}"
        options.add_experimental_option("debuggerAddress", debugger_address)

        # Pass along the user data directory (profile path)
        profile_argument = f"--user-data-dir={GOOGLE_CHROME_PROFILE_PATH}"
        options.add_argument(profile_argument)

        global_driver = webdriver.Chrome(options=options)
        global_driver.maximize_window()

        # Short randomized pause so the browser can settle before first use
        settle_seconds = random.uniform(1.0, 3.0)
        time.sleep(settle_seconds)

        logger.info("Selenium ChromeDriver (Google Chrome) initialized successfully and connected to existing instance.")
        return global_driver
    except Exception as exc:
        logger.critical(
            f"Failed to initialize Google Chrome WebDriver. "
            f"Ensure Google Chrome is running with remote debugging enabled "
            f"(--remote-debugging-port={GOOGLE_CHROME_DEBUGGING_PORT} "
            f"--user-data-dir=\"{GOOGLE_CHROME_PROFILE_PATH}\"), "
            f"and no other Chrome instances are using this port or profile. "
            f"Error: {exc}",
            exc_info=True,
        )
        raise  # Propagate: the script cannot continue without a driver

# --- SIMPLIFIED save_image_locally function to ONLY return the image URL ---
def save_image_locally(driver, image_url, product_name):
    """
    Despite its name, nothing is downloaded or written to disk here.
    The given image_url is logged together with product_name and handed back
    unchanged so it can be stored in the JSON output. `driver` is accepted
    but not used.
    """
    message = f"Using image URL directly for '{product_name}': {image_url}"
    logger.info(message)
    # The URL string itself is what ends up in the JSON record
    return image_url

# --- scrape_carrefour_products and its helpers (rows -> cards -> name + image URL) ---
def _outer_html_snippet(element, limit=500):
    """
    Returns at most `limit` characters of the element's outerHTML for log messages.

    Diagnostics must never abort scraping, so a failed lookup (for example on a
    stale element) is described in the returned text instead of being raised.
    """
    try:
        html = element.get_attribute('outerHTML')
    except Exception as e:
        return f"<outerHTML unavailable: {e}>"
    # get_attribute may return None; treat that as an empty snippet
    return (html or "")[:limit]

def _read_product_card(card_element):
    """
    Returns (product_name, image_url) read from a single product card element.

    Each lookup waits up to 2 seconds and raises TimeoutException when the node
    does not appear; other Selenium errors propagate to the caller unchanged.
    """
    # Use a short WebDriverWait for elements *within* the card, as the card element is already present
    wait_in_card = WebDriverWait(card_element, 2)

    # Product Name:
    name_element = wait_in_card.until(
        EC.presence_of_element_located((By.XPATH, ".//div[contains(@class, 'line-clamp-2')]/span"))
    )
    product_name = name_element.text.strip()

    # Product Image:
    image_element = wait_in_card.until(
        EC.presence_of_element_located((By.XPATH, ".//div[contains(@class, 'relative')]/a/div/img"))
    )
    image_url = image_element.get_attribute("src")

    return product_name, image_url

def scrape_carrefour_products(driver, category="chips"):
    """
    Scrapes product names and image URLs from the Carrefour page currently
    loaded in `driver`.

    Args:
        driver: Selenium WebDriver showing the category listing page.
        category: Value stored under the "category" key of every record.
            Defaults to "chips", the value that used to be hard-coded.

    Returns:
        A list of dicts with the keys "source", "link", "category", "name" and
        "image". Selenium failures are logged rather than raised, so the list
        may be empty or partial.
    """
    logger.info("Starting product scraping with provided working XPaths and collecting image URLs.")

    products_data = []

    # NOTE(review): absolute XPath plus utility-class matching; both depend on
    # the current page layout and will need updating if the site markup changes.
    main_product_list_container_xpath = "/html/body/main/div/div[2]/div[2]/div/div[2]"

    product_row_xpath = f"{main_product_list_container_xpath}//div[contains(@class, 'mb-lg') and contains(@class, 'flex') and contains(@class, 'w-fit') and contains(@class, 'justify-start') and contains(@class, 'gap-4')]"

    product_card_in_row_xpath = ".//div[contains(@class, 'relative') and contains(@class, 'w-[134px]') and contains(@class, 'flex') and contains(@class, 'justify-between')]"

    try:
        # 1. Wait for the main product list container to be present.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, main_product_list_container_xpath))
        )
        logger.info(f"Main product list container located using: {main_product_list_container_xpath}")

        # 2. Find all product rows.
        # until() only ever returns a truthy value, so an empty match list ends
        # in a TimeoutException (handled below) rather than an empty result here.
        logger.info(f"Attempting to find all product rows using XPath: {product_row_xpath}")
        product_rows = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.XPATH, product_row_xpath))
        )
        total_rows = len(product_rows)
        logger.info(f"Found {total_rows} product rows on the page.")

        # The page is not navigated while scraping, so read the URL once
        # instead of paying a WebDriver round trip per product.
        page_url = driver.current_url

        # 3. Iterate through each row and find product cards within them
        for row_index, row_element in enumerate(product_rows):
            row_number = row_index + 1
            logger.info(f"Processing row {row_number}/{total_rows}")

            # Find all individual product cards within the current row
            try:
                cards_in_row = WebDriverWait(row_element, 5).until(
                    EC.presence_of_all_elements_located((By.XPATH, product_card_in_row_xpath))
                )
                logger.debug(f"Found {len(cards_in_row)} product cards in row {row_number}.")
            except TimeoutException:
                logger.warning(f"No product cards found within row {row_number} after 5 seconds. Skipping this row. Row outerHTML (first 500 chars): {_outer_html_snippet(row_element)}...")
                continue
            except StaleElementReferenceException as e:
                # A re-rendered row must only cost that row, not the remaining ones.
                logger.warning(f"Row {row_number} became stale before its product cards could be read. Skipping this row. Error: {e}")
                continue

            for card_index, card_element in enumerate(cards_in_row):
                card_number = card_index + 1

                try:
                    logger.debug(f"Scraping product card {card_number} in row {row_number}.")
                    product_name, image_url = _read_product_card(card_element)
                except (TimeoutException, NoSuchElementException, StaleElementReferenceException) as e:
                    logger.warning(f"Element not found or stale for product in Row {row_number}, Card {card_number}. Error: {e}")
                    # Fetching outerHTML is a WebDriver call; skip it unless DEBUG output is wanted.
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug(f"Card {card_number} outer HTML (first 500 chars): {_outer_html_snippet(card_element)}...")
                    continue
                except Exception as e:
                    logger.error(f"An unexpected error occurred while processing product in Row {row_number}, Card {card_number}: {e}", exc_info=True)
                    continue

                if not (product_name and image_url):
                    logger.warning(f"Missing name or image URL for product in Row {row_number}, Card {card_number}. Name: '{product_name}', Image URL: '{image_url}'")
                    continue

                logger.info(f"Scraped - Name: '{product_name}', Image URL: '{image_url}' (Row {row_number}, Card {card_number})")
                # save_image_locally just returns the URL string
                final_image_link = save_image_locally(driver, image_url, product_name)

                products_data.append({
                    "source": "carrefour",
                    "link": page_url,  # The category page URL, not a per-product link
                    "category": category,
                    "name": product_name,
                    "image": final_image_link  # The image URL string
                })

    except (TimeoutException, NoSuchElementException) as e:
        logger.critical(f"CRITICAL: Failed to find main product content area or any product rows. Error: {e}", exc_info=True)
    except Exception as e:
        logger.critical(f"An unhandled critical error occurred during the overall product scraping process: {e}", exc_info=True)

    return products_data

def _scroll_to_page_bottom(driver, max_scroll_attempts=15):
    """
    Scrolls to the bottom of the page repeatedly so that all products get loaded.

    Stops as soon as a scroll leaves document.body.scrollHeight unchanged, or
    after `max_scroll_attempts` scrolls that each grew the page (a warning is
    logged in that case).
    """
    logger.info("Starting scroll to ensure all products are loaded...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    for scroll_attempt in range(1, max_scroll_attempts + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Randomized pause so newly requested content has time to render
        time.sleep(random.uniform(2, 4))
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            logger.info("Reached bottom of the page (no new content loaded after scroll).")
            return
        last_height = new_height
        logger.info(f"Scrolled. Current height: {new_height}, Scroll attempt: {scroll_attempt}")
    logger.warning(f"Max scroll attempts ({max_scroll_attempts}) reached. Page might not have fully loaded or is infinite.")

def main():
    """
    Runs the scrape end to end: attach to Chrome, open the category page,
    scroll until everything is loaded, scrape the products and save them as JSON.

    The driver is not quit here; `cleanup_driver` (registered with atexit)
    does that when the interpreter exits. All errors are logged, none are raised.
    """
    target_url = "https://www.carrefourksa.com/mafsau/en/c/FKSA1730000?currentPage=0&filter=product_category_level_3_en%3A%27FKSA1730200%27&nextPageOffset=0&pageSize=200&sortBy=relevance"

    try:
        driver = initialize_driver()

        if driver.current_url != target_url:
            logger.info(f"Current URL ({driver.current_url}) does not match target. Navigating to: {target_url}")
            driver.get(target_url)
            logger.info(f"Navigated to: {target_url}")
            time.sleep(random.uniform(3, 5))
        else:
            logger.info(f"Already on the target URL: {target_url}. Skipping navigation.")
            time.sleep(random.uniform(1, 2))

        _scroll_to_page_bottom(driver)
        time.sleep(random.uniform(1, 2))

        scraped_products = scrape_carrefour_products(driver)

        if not scraped_products:
            # An empty list is also what a failed scrape returns; do not let it
            # replace results saved by an earlier successful run.
            logger.warning(f"No products were scraped; leaving {OUTPUT_JSON_FILE} untouched so earlier results are not overwritten with an empty list.")
            return

        try:
            with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
                json.dump(scraped_products, f, ensure_ascii=False, indent=4)
            logger.info(f"Scraped data saved to {OUTPUT_JSON_FILE}")
            logger.info(f"Total products scraped: {len(scraped_products)}")
        except OSError as e:
            logger.error(f"Error saving data to JSON file: {e}")

    except Exception as e:
        logger.critical(f"Script encountered a critical error in main execution: {e}", exc_info=True)

if __name__ == "__main__":
    main()