## Using similar logic to our Sainsbury's scraper, we extract the product title, price, link and category (a new label) for each result of our search query

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Helper: reject cookies

def reject_cookies(driver):
    """Dismiss the cookie banner by clicking its "reject all" button.

    Waits up to 7 seconds for the element with id
    ``onetrust-reject-all-handler`` to become clickable, then clicks it. If
    the button does not become clickable within that time, a message is
    printed and the function returns normally so the caller can carry on.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
    """
    reject_button = (By.ID, "onetrust-reject-all-handler")
    try:
        # Wait for clickability rather than mere DOM presence: an element can
        # be present before it is visible and enabled, and clicking it in that
        # state raises an exception that the handler below does not cover.
        WebDriverWait(driver, 7).until(
            EC.element_to_be_clickable(reject_button)
        ).click()
    except (TimeoutException, NoSuchElementException):
        print("Cookie banner not found or already rejected.")


In [None]:
def scroll_to_bottom(driver, pause=1.5, max_attempts=20):
    """Scroll down repeatedly until the page height stops growing.

    Each pass scrolls to the bottom, sleeps for ``pause`` seconds, waits up to
    5 seconds for an element with class ``sc-filq44-0`` to be present, and
    then compares the document height with the previous one. The loop ends
    when the height did not change or when ``max_attempts`` height increases
    have already been counted. Afterwards the page is nudged to the top and
    back to the bottom once more.

    Args:
        driver: Selenium WebDriver showing the page to scroll.
        pause: Seconds to sleep after each scroll to the bottom.
        max_attempts: Number of height increases to follow before the loop
            gives up on the next pass.
    """
    height_script = "return document.body.scrollHeight"
    bottom_script = "window.scrollTo(0, document.body.scrollHeight);"

    last_height = driver.execute_script(height_script)
    attempts = 0
    while True:
        driver.execute_script(bottom_script)
        time.sleep(pause)
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "sc-filq44-0"))
            )
        except TimeoutException:
            # Best effort: a missing element only means the wait timed out,
            # so carry on to the height check. Any other error (e.g. a dead
            # browser session) is no longer hidden and propagates.
            pass
        new_height = driver.execute_script(height_script)
        if new_height == last_height or attempts >= max_attempts:
            break
        last_height = new_height
        attempts += 1

    # Small scroll up and down to trigger lazy loading
    driver.execute_script("window.scrollTo(0, 100);")
    time.sleep(1)
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(1)
    driver.execute_script(bottom_script)
    time.sleep(2)


In [None]:
def extract_product_data(element, current_category):
    """Read one listing element as either a category header or a product.

    Args:
        element: Selenium WebElement for a single entry of the results grid.
        current_category: Category label in effect before this element.

    Returns:
        A ``(product, category)`` tuple:

        * ``(None, header_text)`` when the element holds a non-empty category
          header (breadcrumb text);
        * ``(dict, current_category)`` with keys ``category``, ``title``,
          ``price`` and ``link`` when both a title and a price were found;
        * ``(None, current_category)`` otherwise.

        Any exception raised while querying the element is swallowed and
        treated as "value not found".
    """

    def read_first(xpath, getter):
        # Apply `getter` to the first descendant matching `xpath`; yield None
        # when nothing matches or when the lookup/read raises.
        try:
            matches = element.find_elements(By.XPATH, xpath)
            return getter(matches[0]) if matches else None
        except Exception:
            return None

    def stripped_text(node):
        return node.text.strip()

    # A category header takes priority over any product details.
    heading = read_first(
        './/div[@class="outer-header-container"]//span[@data-test="breadcrumb-text"]',
        stripped_text,
    )
    if heading:
        return None, heading

    name = read_first('.//h3[@data-test="fop-title"]', stripped_text)
    href = read_first(
        './/a[@data-test="fop-product-link"]',
        lambda node: node.get_attribute('href'),
    )
    cost = read_first('.//span[@data-test="fop-price"]', stripped_text)

    # Title and price are both required; the link is optional.
    if not (name and cost):
        return None, current_category

    record = {
        'category': current_category,
        'title': name,
        'price': cost,
        'link': href,
    }
    return record, current_category


In [None]:
# Start URL for Morrisons search (edit as needed)
start_url = "https://groceries.morrisons.com/search?q=bread"

all_products = []
current_category = "No Category"

driver = webdriver.Chrome()
# Close the browser even if loading, scrolling or extraction raises, so a
# failed run does not leave a Chrome process behind.
try:
    driver.get(start_url)
    reject_cookies(driver)
    scroll_to_bottom(driver)

    product_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "sc-filq44-0")]')

    # Elements are visited in the order find_elements returned them: a header
    # element updates the running category, and products that follow are
    # labelled with it.
    for element in product_elements:
        product, new_category = extract_product_data(element, current_category)
        if new_category != current_category:
            current_category = new_category
            continue  # skip category header elements
        if product:
            all_products.append(product)
finally:
    driver.quit()

# Convert to DataFrame
morrisons_df = pd.DataFrame(all_products, columns=["category", "title", "price", "link"])


In [None]:
# Notebook cell: evaluate the DataFrame as the last expression to show the scraped rows.
morrisons_df