## Using Selenium to scrape product data from Sainsbury's

In [1]:
import time
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import sqlite3
import os
from fake_useragent import UserAgent
# Ensure the necessary libraries are installed

In [2]:
# this function checks if the element is the "see more related results" marker - Sainsbury's also returns some results that do not exactly match the search term
# returns true if the element is a related marker, false otherwise

def is_related_marker(element):
    """Tell whether ``element`` is the "See more related results" marker.

    Returns True only when the element's ``class`` attribute is exactly
    ``"ln-u-margin-top*3 ln-u-display-2"`` and its text contains
    "See more related results". Any exception raised while reading the
    element is swallowed and reported as False.
    """
    marker_class = "ln-u-margin-top*3 ln-u-display-2"
    marker_text = "See more related results"
    try:
        # the text is only read once the class has matched
        if element.get_attribute("class") != marker_class:
            return False
        return marker_text in element.text
    except Exception:
        return False

In [3]:
# this function checks if a product is sponsored by looking for a specific header
# returns true if the product is sponsored, false otherwise

def is_sponsored(product_element):
    """Tell whether a product tile is marked as sponsored.

    Looks inside ``product_element`` for a descendant matching
    ``[data-testid='product-header']`` and returns True when that header's
    text contains "Sponsored". Returns False when no such header exists.
    """
    header_selector = "[data-testid='product-header']"
    try:
        header_text = product_element.find_element(By.CSS_SELECTOR, header_selector).text
    except NoSuchElementException:
        # no header on this tile
        return False
    return "Sponsored" in header_text

In [4]:
# this function checks if a product is in an extra row by moving up the DOM tree
# returns true if the parent class is pt-extra-row, false otherwise

def is_in_extra_row(product_element):
    """Report whether ``product_element`` sits inside a ``pt-extra-row`` container.

    Returns True when the element itself or any of its ancestors carries the
    class token ``pt-extra-row``, and False otherwise. Any exception raised
    by the lookup is treated as "not in an extra row" and yields False.

    The check is a single XPath query on the ``ancestor-or-self`` axis
    instead of climbing the tree one parent at a time. That avoids two
    WebDriver round trips per ancestor, and no longer relies on an exception
    at the document root to end the walk.
    """
    # token-wise class match: padding with spaces keeps "pt-extra-row" from
    # matching longer class names that merely contain it
    extra_row_xpath = (
        "ancestor-or-self::*"
        "[contains(concat(' ', normalize-space(@class), ' '), ' pt-extra-row ')]"
    )
    try:
        return bool(product_element.find_elements(By.XPATH, extra_row_xpath))
    except Exception:
        # best-effort check, same as before: a failed lookup means False
        return False

In [5]:
# gets all product items from the current page, skipping extra rows, and labels sponsored and related products
# after the "see more related results" marker, all products are labeled as related

def get_items(driver):
    """Collect the products shown on the current search results page.

    Waits up to 10 seconds for an element with class ``pt__link`` to be
    present. If none appears, prints a message and returns an empty list.

    Products (elements with class ``pt``) are visited in document order.
    Products inside an extra row are skipped. Each remaining product becomes
    a dict with the keys ``"Product Name"``, ``"Price"`` and ``"Label"``.
    The label is ``"Sponsored"`` for sponsored products, ``"Related"`` for
    non-sponsored products that come after the "see more related results"
    marker, and ``""`` otherwise. Products missing a name or price element
    are reported with a printed message and skipped.

    Returns:
        list of dict, one per retained product.
    """
    products = []
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pt__link"))
        )
    except TimeoutException:
        print("search results did not load in time.")
        return products

    # set membership is O(1); WebElement hashes and compares by element id
    product_elements = set(driver.find_elements(By.CLASS_NAME, "pt"))

    # One query returns the products and the possible related markers in
    # document order, instead of fetching every node on the page ("//*") and
    # making a WebDriver round trip for each one.
    # NOTE: ".ln-u-display-2" must stay a superset of what is_related_marker
    # accepts (it requires class == "ln-u-margin-top*3 ln-u-display-2").
    candidates = driver.find_elements(By.CSS_SELECTOR, ".pt, .ln-u-display-2")

    # flag to indicate if we've passed the related marker
    after_related_marker = False

    for el in candidates:
        if el not in product_elements:
            # not a product, so it can only matter as the related marker
            if is_related_marker(el):
                after_related_marker = True
            continue

        try:
            if is_in_extra_row(el):
                continue
            product_name = el.find_element(By.CLASS_NAME, "pt__link").get_attribute("title")
            product_price = el.find_element(By.CLASS_NAME, "pt__cost__retail-price").text
            # "Sponsored" takes precedence over "Related"
            if is_sponsored(el):
                label = "Sponsored"
            elif after_related_marker:
                label = "Related"
            else:
                label = ""
        except NoSuchElementException as e:
            print(f"Error retrieving product details: {e}")
            continue

        products.append({
            "Product Name": product_name,
            "Price": product_price,
            "Label": label,
        })

    return products

In [6]:
# this function rejects cookies by clicking the reject button if it appears
# does nothing if the banner is not found or already rejected

def reject_cookies(driver):
    """Click the cookie banner's "reject all" button if it shows up.

    Waits up to 7 seconds for the element with id
    ``onetrust-reject-all-handler`` to become clickable, then clicks it.
    If that does not happen in time, prints a message and returns without
    raising.

    The wait is for a clickable element, not mere presence in the DOM.
    A button that is present but not yet visible or enabled would otherwise
    make ``click()`` raise an exception this function does not catch.
    """
    try:
        WebDriverWait(driver, 7).until(
            EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
        ).click()
    except (TimeoutException, NoSuchElementException):
        print("Cookie banner not found or already rejected.")

In [7]:
# this function checks if an element exists on the page within a timeout
# returns true if the element is found, false otherwise

def element_exists(driver, by, value, timeout=5):
    """Wait for an element to be present and report whether it appeared.

    Args:
        driver: the WebDriver to query.
        by: locator strategy (e.g. ``By.ID``).
        value: locator value used with ``by``.
        timeout: maximum number of seconds to wait (default 5).

    Returns:
        True if an element matching ``(by, value)`` became present within
        ``timeout`` seconds, False if the wait timed out.
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    try:
        waiter.until(EC.presence_of_element_located(locator))
    except TimeoutException:
        return False
    return True

In [9]:
import re  # used only for the zero-results check further down

# Search sainsburys.co.uk for `query`, walk through every results page and
# collect the products into `all_items_df`.
# NOTE: the browser is left open after this cell; call driver.quit() when done.
url = "https://www.sainsburys.co.uk"

driver = webdriver.Chrome()
driver.get(url)

reject_cookies(driver)
search = driver.find_element(By.ID, "term")
search.clear()  # Clear the search field if needed

query = "banana"

for char in query:
    search.send_keys(char)  # Type each character into the search field
    time.sleep(0.2)  # Pause 0.2 seconds between keystrokes
search.send_keys(Keys.RETURN)   # Press Enter to submit the search

all_items = []

if element_exists(driver, By.CLASS_NAME, "ln-c-pagination__list"):
    # the "last" pagination link's text is the total number of pages
    num_pages = int(driver.find_element(By.CSS_SELECTOR, "[rel='last'].ln-c-pagination__link").text)
    for page in range(1, num_pages + 1):
        all_items.extend(get_items(driver))
        if page == num_pages:
            # last page scraped: there is no further page to move to, so do
            # not look for / click "next" (and skip the pointless sleep)
            break
        next_button = driver.find_element(By.CSS_SELECTOR, "[rel='next'].ln-c-pagination__link")
        next_button.click()
        time.sleep(4)  # give the next page time to load

else:
    results_title = driver.find_element(By.CSS_SELECTOR, "[data-testid='search-results-title']").text
    # Only a standalone 0 counts as "no results": a plain substring test
    # would also fire for counts such as 10 or 20.
    # NOTE(review): assumes the title contains the result count; a query that
    # itself contains a lone "0" could still trip this - confirm the format.
    if re.search(r"(?<!\d)0(?!\d)", results_title):
        print("There are not results for this search")
    else:
        all_items.extend(get_items(driver))

all_items_df = pd.DataFrame(all_items, columns=['Product Name', 'Price', 'Label'])

In [None]:
# show the scraped products (columns: Product Name, Price, Label)
all_items_df

Unnamed: 0,Product Name,Price,Label
0,Organix Banana Soft Oaty Bars 12+ Months 6x23g,£3.30,Sponsored
1,Yazoo Banana Milk Drink 400ml,£1.50,Sponsored
2,Sainsbury's Fairtrade Bananas Loose,90p / kg,
3,Sainsbury's Fairtrade Bananas x5,78p,
4,Sainsbury's Fairtrade Bananas x8,£1.25,
...,...,...,...
144,Milupa Banana Bedtime 4-6+ Months 125g,£3.00,Related
145,Umberto Giannini Banana Coconut Detangler Leav...,£10.00,Related
146,Colourburst Tools Banana Shaper Orange,£2.00,Related
147,blu Pod Banana Ice Vape Pods 20mg/ml,£6.00,Related
