In [12]:
import os
import sqlite3
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Ensure the necessary libraries are installed

In [None]:
def is_related_marker(element):
    """
    Tell whether ``element`` is the 'See more related results' marker.

    An element qualifies only when its ``class`` attribute is exactly
    'ln-u-margin-top*3 ln-u-display-2' and its text contains
    'See more related results'.

    Args:
        element: object exposing ``get_attribute(name)`` and ``text``.

    Returns:
        bool: True for the marker; False otherwise, including when reading
        the attribute or the text raises any exception.
    """
    marker_classes = "ln-u-margin-top*3 ln-u-display-2"
    marker_phrase = "See more related results"
    try:
        # Compare the class first so the text is only read for candidates.
        if element.get_attribute("class") != marker_classes:
            return False
        return marker_phrase in element.text
    except Exception:
        return False

def is_sponsored(product_element):
    """
    Tell whether a product tile is flagged as sponsored.

    Looks inside ``product_element`` for a descendant matching
    ``[data-testid='product-header']`` and checks its text for 'Sponsored'.

    Args:
        product_element: Selenium WebElement to search within.

    Returns:
        bool: True if the header text contains 'Sponsored'; False if it does
        not or if no header element is found.
    """
    header_selector = "[data-testid='product-header']"
    try:
        header_text = product_element.find_element(By.CSS_SELECTOR, header_selector).text
    except NoSuchElementException:
        # No header at all means there is nothing marking it as sponsored.
        return False
    return "Sponsored" in header_text

def is_in_extra_row(product_element):
    """
    Returns True if the product is inside an element with class 'pt-extra-row'.

    The product element itself and every ancestor are checked with a single
    XPath query on the ``ancestor-or-self`` axis. Elements that have no
    ``class`` attribute simply do not match, so they cannot stop the search
    before a real 'pt-extra-row' ancestor is reached.

    Args:
        product_element: Selenium WebElement for the product to test.

    Returns:
        bool: True when the element or one of its ancestors carries the
        'pt-extra-row' class token; False when none does, or when the lookup
        fails with a WebDriverException.
    """
    # Padding the normalised class list with spaces makes this a whole-token
    # match: 'pt-extra-row' matches, 'pt-extra-row-foo' does not.
    extra_row_xpath = (
        "ancestor-or-self::*"
        "[contains(concat(' ', normalize-space(@class), ' '), ' pt-extra-row ')]"
    )
    try:
        return bool(product_element.find_elements(By.XPATH, extra_row_xpath))
    except WebDriverException:
        # Best effort: if the element cannot be queried, treat it as not
        # being in an extra row rather than aborting the scrape.
        return False

def get_items(driver):
    """
    Scrape the products shown on the currently loaded search-results page.

    Waits up to 10 seconds for an element with class 'pt__link' to be present,
    then reads every element with class 'pt'. Products for which
    ``is_in_extra_row`` is true are skipped.

    Each product gets a label:
      * "Sponsored" if ``is_sponsored`` is true for it;
      * "Related" if it comes after the 'See more related results' marker in
        document order;
      * "" otherwise.

    Args:
        driver: Selenium WebDriver positioned on a search-results page.

    Returns:
        list[dict]: one dict per product with the keys "Product Name",
        "Price" and "Label". Empty if the results did not load in time.
    """
    products = []
    try:
        # Wait for the search results to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pt__link"))
        )
    except TimeoutException:
        print("Search results did not load in time.")
        return products

    product_elements = driver.find_elements(By.CLASS_NAME, "pt")

    # Narrow the marker search with XPath so that is_related_marker (which
    # costs WebDriver round trips) runs on a handful of candidates instead of
    # every node in the page. The class string mirrors the one checked by
    # is_related_marker, which remains the final arbiter.
    marker_candidates = driver.find_elements(
        By.XPATH, "//*[@class='ln-u-margin-top*3 ln-u-display-2']"
    )
    related_marker = next(
        (candidate for candidate in marker_candidates if is_related_marker(candidate)),
        None,
    )

    # Document-order position of every element, keyed by WebElement id.
    # Only built when a marker exists, since it is only needed for "Related".
    position_by_id = {}
    related_marker_index = None
    if related_marker is not None:
        for position, el in enumerate(driver.find_elements(By.XPATH, "//*")):
            position_by_id.setdefault(el.id, position)
        related_marker_index = position_by_id.get(related_marker.id)
    related_marker_found = related_marker_index is not None

    for indx, product in enumerate(product_elements):
        try:
            # Skip products in pt-extra-row
            if is_in_extra_row(product):
                continue
            product_name = product.find_element(By.CLASS_NAME, "pt__link").get_attribute("title")
            product_price = product.find_element(By.CLASS_NAME, "pt__cost__retail-price").text
            if is_sponsored(product):
                label = "Sponsored"
            # Products missing from the snapshot get position -1, so they are
            # never labelled as related.
            elif related_marker_found and position_by_id.get(product.id, -1) > related_marker_index:
                label = "Related"
            else:
                label = ""
            products.append({
                "Product Name": product_name,
                "Price": product_price,
                "Label": label
            })
        except NoSuchElementException as e:
            print(f"Error retrieving product details for index {indx}: {e}")
            continue

    return products

In [14]:
def reject_cookies(driver):
    """
    Click the cookie banner's "reject all" button if it shows up.

    Waits up to 7 seconds for the element with id
    'onetrust-reject-all-handler' to become clickable (visible and enabled)
    and clicks it. Waiting for clickability rather than mere presence avoids
    clicking a button that is in the DOM but not yet interactable.

    Args:
        driver: Selenium WebDriver with the target page loaded.

    Returns:
        None. If the button does not become clickable in time, a message is
        printed and the function returns normally.
    """
    try:
        reject_button = WebDriverWait(driver, 7).until(
            EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
        )
        reject_button.click()
    except (TimeoutException, NoSuchElementException):
        print("Cookie banner not found or already rejected.")

In [15]:
def element_exists(driver, by, value, timeout=5):
    """
    Report whether an element matching the locator appears within ``timeout``.

    Args:
        driver: Selenium WebDriver to query.
        by: locator strategy (for example ``By.CLASS_NAME``).
        value: locator value used with ``by``.
        timeout: maximum number of seconds to wait; defaults to 5.

    Returns:
        bool: True once a matching element is present in the DOM, False if
        the wait times out first.
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    try:
        waiter.until(EC.presence_of_element_located(locator))
    except TimeoutException:
        return False
    return True

Try locating the search box by its element ID.

In [16]:
# Open the site, run one search and collect the products from every result page.
url = "https://www.sainsburys.co.uk"

driver = webdriver.Chrome()
driver.get(url)

reject_cookies(driver)
search = driver.find_element(By.ID, "term")
search.clear()  # Clear the search field if needed

query = "pasta sauce"

for char in query:
    search.send_keys(char)  # Type each character into the search field
    time.sleep(0.2)  # Pause 0.2 seconds between keystrokes
search.send_keys(Keys.RETURN)   # Press Enter to submit the search

all_items = []

if element_exists(driver, By.CLASS_NAME, "ln-c-pagination__list"):
    # The text of the "last" pagination link is read as the page count.
    num_pages = int(driver.find_element(By.CSS_SELECTOR, "[rel='last'].ln-c-pagination__link").text)
    for i in range(1, num_pages + 1):
        all_items.extend(get_items(driver))
        # Only move on while there is a further page; clicking "next" after
        # the last page would fail or waste a wait.
        if i < num_pages:
            next_button = driver.find_element(By.CSS_SELECTOR, "[rel='next'].ln-c-pagination__link")
            next_button.click()
            time.sleep(4)  # Give the next page time to render

else:
    results_title = driver.find_element(By.CSS_SELECTOR, "[data-testid='search-results-title']").text
    # Look for a standalone "0" token rather than the character "0", so that
    # counts such as 10 or 20 are not mistaken for "no results".
    # NOTE(review): the exact wording of the title is not visible here - confirm.
    reports_zero_results = any(
        token.strip(".,:;!?()[]'\"") == "0" for token in results_title.split()
    )
    if reports_zero_results:
        print("There are not results for this search")
    else:
        all_items.extend(get_items(driver))

all_items_df = pd.DataFrame(all_items, columns=['Product Name', 'Price', 'Label'])

In [18]:
# Persist the scraped products to a local SQLite database, replacing any
# existing 'products' table.
db_path = 'sainsburys_products.db'
conn = sqlite3.connect(db_path)
try:
    rows_written = all_items_df.to_sql('products', conn, if_exists='replace', index=False)
finally:
    # Always release the connection, even if the write fails, so re-running
    # this cell does not accumulate open handles on the database file.
    conn.close()
rows_written  # shown as the cell output

171