In [1]:
import time
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Listing page to scrape (the on-offer page with the gluten-free filter applied)
product_url = 'https://www.ocado.com/on-offer?filters=gluten-free-19993'

# Fake user agent generator; the per-product requests further down read from it
user_agent = UserAgent()

# Create a new Chrome instance and make sure it is shut down again, even if
# loading or scrolling fails, so no browser process is left behind
driver = webdriver.Chrome()
try:
    # Open the listing page
    driver.get(product_url)

    # Wait until the page body is visible
    WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//body')))

    # Scroll down in 100-pixel steps until the bottom is reached. The page
    # height is re-read after every step so that content which lengthens the
    # page while scrolling is still scrolled through.
    scrolled = 0
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    while scrolled < scroll_height:
        driver.execute_script("window.scrollBy(0, 100);")
        scrolled += 100

        # Short pause between steps to keep the scrolling gradual
        time.sleep(0.05)

        scroll_height = driver.execute_script("return document.body.scrollHeight")

    # Get the HTML content of the fully scrolled page
    html = driver.page_source
finally:
    driver.quit()

# Parse the HTML content
soup = BeautifulSoup(html, 'lxml')

# Find the fop-contentWrapper divs nested inside a "fops fops-regular fops-shelf"
# div. find_all() has no `parent` filter (unknown keyword arguments are matched
# against HTML attributes), so a CSS descendant selector is used instead.
# If no such shelf container is present, fall back to every fop-contentWrapper
# in the document rather than returning nothing.
fop_content_wrappers = (
    soup.select('div.fops.fops-regular.fops-shelf div.fop-contentWrapper')
    or soup.find_all('div', class_='fop-contentWrapper')
)

# Extract URLs from the links inside each wrapper; href=True skips anchors
# without an href attribute, which would otherwise raise KeyError
hrefs = [
    link['href']
    for fop_content_wrapper in fop_content_wrappers
    for link in fop_content_wrapper.find_all('a', href=True)
]

# Print the extracted URLs
print(len(hrefs))
print(hrefs)

800
['/products/m-s-british-outdoor-bred-pigs-in-blankets-530550011', '/offers/clearance-save-half-price-was-4-50-237491983', '/products/ocean-spray-wholeberry-cranberry-sauce-65503011', '/offers/save-75p-was-1-75-237091335', '/products/m-s-collection-pigs-in-blankets-533223011', '/offers/clearance-save-half-price-was-5-50-237492010', '/products/ambrosia-devon-custard-10767011', '/offers/buy-any-2-for-2-236789547', '/products/diet-coke-28806011', '/offers/buy-any-2-for-16-236003085', '/products/diet-coke-16201011', '/offers/buy-any-2-for-3-50-235439171', '/products/tyrrells-furrows-sea-salted-sharing-crisps-60363011', '/offers/now-1-75-was-2-75-235540883', '/products/pipers-anglesey-sea-salt-sharing-bag-crisps-467842011', '/offers/now-2-was-3-233420190', '/products/shloer-white-grape-sparkling-juice-drink-39114011', '/offers/half-price-was-2-65-235209423', '/products/m-s-4-aberdeen-angus-burgers-505187011', '/offers/buy-any-3-for-12-237146355', '/products/coca-cola-original-taste-67468

In [2]:
def remove_offers(array):
    """Return a new list with every entry of ``array`` that does not start with "/offers/".

    The relative order of the remaining entries is preserved and the input
    is left unmodified.
    """
    return [entry for entry in array if not entry.startswith("/offers/")]

# Drop the "/offers/" links, then show how many links remain followed by the links themselves
filtered_array = remove_offers(hrefs)
print(len(filtered_array), filtered_array, sep="\n")


405
['/products/m-s-british-outdoor-bred-pigs-in-blankets-530550011', '/products/ocean-spray-wholeberry-cranberry-sauce-65503011', '/products/m-s-collection-pigs-in-blankets-533223011', '/products/ambrosia-devon-custard-10767011', '/products/diet-coke-28806011', '/products/diet-coke-16201011', '/products/tyrrells-furrows-sea-salted-sharing-crisps-60363011', '/products/pipers-anglesey-sea-salt-sharing-bag-crisps-467842011', '/products/shloer-white-grape-sparkling-juice-drink-39114011', '/products/m-s-4-aberdeen-angus-burgers-505187011', '/products/coca-cola-original-taste-67468011', '/products/m-s-british-pork-sage-onion-stuffing-530561011', '/products/stokes-cranberry-sauce-70103011', '/products/heinz-baked-beans-12279011', '/products/alpro-almond-no-sugars-long-life-drink-77181011', '/products/pipers-lye-cross-cheddar-onion-sharing-bag-crisps-467832011', '/products/coca-cola-original-taste-26267011', '/products/edwards-traditional-pork-sausage-meat-480025011', '/products/shloer-rose-s

In [3]:
base_url = 'https://www.ocado.com/'
counter = 0  # total number of reviews written across all products

# At most this many reviews are saved per product
MAX_REVIEWS_PER_PRODUCT = 2
# Seconds to wait for a product page before giving up on it
REQUEST_TIMEOUT = 30
# Output file that the reviews are appended to
REVIEWS_FILE = 'ocadoReview.txt'


def _parse_review(review_wrapper):
    """Extract (rating, heading, text, date) from one review element.

    Each field falls back to 'N/A' when the corresponding tag (or, for the
    <meta> tags, its ``content`` attribute) is missing.
    """
    rating_meta = review_wrapper.find('meta', itemprop='ratingValue')
    rating = rating_meta.get('content', 'N/A') if rating_meta else 'N/A'

    review_heading_element = review_wrapper.find('h6')
    review_heading = review_heading_element.text if review_heading_element else 'N/A'

    review_text_element = review_wrapper.find('p')
    review_text = review_text_element.text if review_text_element else 'N/A'

    date_meta = review_wrapper.find('meta', itemprop='datePublished')
    date = date_meta.get('content', 'N/A') if date_meta else 'N/A'

    return rating, review_heading, review_text, date


# Append mode creates the file when it does not exist yet, so no
# FileNotFoundError fallback is needed. Note that re-running this cell adds
# to whatever the file already contains.
with open(REVIEWS_FILE, 'a', encoding='utf-8') as f:
    for product_path in filtered_array:
        # Join base URL and path with exactly one slash between them
        page_url = f"{base_url.rstrip('/')}/{product_path.lstrip('/')}"

        # Send the request with a random Chrome user agent string; a timeout
        # and exception handling keep one bad page from stalling or aborting
        # the whole crawl
        try:
            response = requests.get(
                page_url,
                headers={'User-Agent': user_agent.chrome},
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print('Error fetching product page:', page_url, exc)
            continue

        if response.status_code != 200:
            print('Error fetching product page:', response.status_code, page_url)
            continue

        # Parse the HTML content
        product_soup = BeautifulSoup(response.content, 'lxml')

        # Find product_name (the first <h1> on the page)
        product_name_element = product_soup.find('h1')
        if not product_name_element:
            print('Error: Product name not found on page:', page_url)
            continue
        product_name = product_name_element.text

        # Find reviews; `limit` stops the search once enough have been found
        review_wrappers = product_soup.find_all(
            'div',
            class_='gn-card bop-reviews__review',
            limit=MAX_REVIEWS_PER_PRODUCT,
        )

        for review_wrapper in review_wrappers:
            rating, review_heading, review_text, date = _parse_review(review_wrapper)

            # Write the review to the file
            f.write(f"Product Name: {product_name}\n")
            f.write(f"Review Rating: {rating}\n")
            f.write(f"Review Heading: {review_heading}\n")
            f.write(f"Review Text: {review_text}\n")
            f.write(f"Review Date: {date}\n")
            f.write("------------------------------------\n")

            counter += 1

In [4]:
# Total number of reviews written to the output file by the scraping loop above
print(counter)

797
