In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
from webdriver_manager.chrome import ChromeDriverManager
import time
from random import randint
import json

## 1. Extracting Products from category pages

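- We can extract all the products of each subcategory in a single pass, processing the subcategories one after another.
- To load the full listing, however, we have to scroll to the bottom of the page and click the button that loads the next batch of products (the "Show more" button) until it no longer appears.
- Even so, some attributes of the furniture, such as the packages and the weight, are not available on the category pages, so we will miss them in this extraction.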

In [12]:
def _load_or_create_json_list(file_path):
    """Return the JSON content of `file_path`; if the file is absent, create it holding `[]`."""
    if os.path.isfile(file_path):
        with open(file_path, encoding="utf-8") as f:
            return json.load(f)
    with open(file_path, 'w', encoding="utf-8") as f:
        json.dump([], f)
    return []


# Folder holding both the input category list and every file this notebook writes
output_path = 'output'
output_folder = output_path

# Categories to scrape, read from ikea_product_categories.json
with open(os.path.join(output_path, 'ikea_product_categories.json'), encoding="utf-8") as f:
    target_categories = json.load(f)

# extracted_categories.json keeps track of the categories we have already extracted
extracted_categories = _load_or_create_json_list(
    os.path.join(output_path, 'extracted_categories.json')
)

# request_records.json holds the log of the actions performed (kept in call_record)
call_record = _load_or_create_json_list(
    os.path.join(output_path, 'request_records.json')
)

In [6]:
# Launch a Chrome session driven by the chromedriver binary that webdriver_manager resolves.
# NOTE(review): per webdriver_manager's documentation, install() fetches (and caches) a driver
# matching the local Chrome, so no manual download should be needed - confirm for your version.
path = os.path.abspath(os.getcwd())  # absolute path of the current working folder
chromedriver_path = ChromeDriverManager().install()
# `Service` (imported from selenium.webdriver.chrome.service) is the class that
# `webdriver.ChromeService` aliases
cService = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=cService)

In [7]:
def log_action(action, result, elapsed_time=None):
    """
    Builds a log entry describing one action performed by the scraper
    Inputs:
        action (str): Description of the action performed
        result (str): Outcome of the action
        elapsed_time (float, optional): Duration of the action in seconds
    Outputs:
        dict with keys 'timestamp' (current local time as 'YYYY-MM-DD HH:MM:SS'), 'action',
        'result' and 'time_taken' (elapsed_time formatted with two decimals, or "N/A" when
        no elapsed_time was given)
    """
    if elapsed_time is None:
        time_taken = "N/A"
    else:
        time_taken = f"{elapsed_time:.2f}"

    return {
        "timestamp": time.strftime('%Y-%m-%d %H:%M:%S'),
        "action": action,
        "result": result,
        "time_taken": time_taken,
    }

In [8]:
def _click_show_more_until_gone(driver):
    """Clicks the "Show more" button repeatedly; returns the log entries of every step performed."""
    step_log = []
    button_locator = (By.CSS_SELECTOR, 'a.plp-btn.plp-btn--small.plp-btn--secondary .plp-btn__label')
    while True:
        # `action` always names the step in progress so a failure is logged against the right step
        action = "Waiting for 'Show more' button"
        try:
            start_time = time.time()
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(button_locator)
            )
            step_log.append(log_action(action, "Success", time.time() - start_time))

            action = "Scrolling to 'Show more' button"
            start_time = time.time()
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            step_log.append(log_action(action, "Success", time.time() - start_time))

            action = "Clicking 'Show more' button"
            start_time = time.time()
            driver.execute_script("arguments[0].click();", show_more_button)
            step_log.append(log_action(action, "Success", time.time() - start_time))

            # Wait for the next set of products to load
            time.sleep(randint(3, 7))  # Random timing, those intervals worked for me
        except Exception as e:
            # Any failure (typically the wait timing out because the button is gone) ends the loop
            step_log.append(log_action(action, f"No more 'Show more' buttons or an error occurred: {e}"))
            break
    return step_log


def _parse_product_card(product):
    """Builds the product dictionary from one product card; raises if any expected element is missing."""
    url = product.find('a', class_='plp-price-link-wrapper link')["href"]
    name = product.find('span', class_='notranslate plp-price-module__product-name').get_text(strip=True)
    # The description text has the form "<category>[, <measures> <units>...]"
    description_parts = product.find('span', class_='plp-price-module__description').get_text(strip=True).split(',')
    category = description_parts[0]
    if len(description_parts) > 1:
        measures = description_parts[1].strip().split(' ')[0]
    else:
        measures = None
    price_section = product.find('span', {'aria-hidden': 'true'})
    price = ''.join([elem.get_text() for elem in price_section.find('span')]).replace(",€", "€")
    return {
        'url': url,
        'name': name,
        'category': category,
        'measures': measures,
        'price': price
    }


def _read_json_items(file_path):
    """Returns every item stored in file_path, accepting one JSON array or several concatenated ones."""
    if not os.path.isfile(file_path):
        return []
    with open(file_path, encoding='utf-8') as f:
        text = f.read()

    decoder = json.JSONDecoder()
    items = []
    position = 0
    while True:
        # raw_decode does not accept leading whitespace, so skip it by hand
        while position < len(text) and text[position].isspace():
            position += 1
        if position >= len(text):
            break
        value, position = decoder.raw_decode(text, position)
        if isinstance(value, list):
            items.extend(value)
        else:
            items.append(value)
    return items


def extract_all_category_products(driver, target_category):
    """
    Extracts all products from a given category URL by scrolling down and clicking the "Show more" button until all products are loaded
    Inputs:
        driver (webdriver): Selenium WebDriver object
        target_category (dict): Dictionary with keys 'href' and 'text' for the category URL and name respectively.
            It is modified in place: a 'timestamp' key is added when products were extracted
    Outputs:
        None
    Raises:
        json.JSONDecodeError: if an existing ikea_products.json cannot be parsed (the file is left untouched)
    Modified files:
        - request_records.json: Rewritten with the global call_record plus the new log entries for each action performed
        - ikea_products.json: Rewritten as ONE json array holding the products already stored plus the new ones.
          Files written previously in append mode (several arrays one after another) are read and merged
        - extracted_categories.json: Appends the target_category extracted with a new 'timestamp' key as long as products extracted were greater than 0
    """
    target_url = target_category["href"]
    driver.get(target_url)

    # Click the "Show more" button until it's no longer available
    new_requests = _click_show_more_until_gone(driver)

    # We add new requests to the call_record but before it we set the url value for each new request
    for request in new_requests:
        request["url"] = target_url

    call_record.extend(new_requests)

    # We save the call_record to the request_records.json file
    with open(os.path.join(output_path, 'request_records.json'), 'w', encoding="utf-8") as f:
        json.dump(call_record, f, indent=4)

    # Use BeautifulSoup to extract product details once we loaded all inventory
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    products_list = []
    skipped_products = 0
    for product in soup.find_all('div', class_='plp-fragment-wrapper'):
        try:
            products_list.append(_parse_product_card(product))
        except Exception:
            # Best effort: a product with any info missing is skipped, but counted so it can be reported
            skipped_products += 1

    if skipped_products:
        print(f"Skipped {skipped_products} product(s) with missing info in category {target_category['text']}")

    if len(products_list) == 0:
        print(f"No products found for category {target_category['text']}")
        return

    # Save product details to a json file in output folder use utf-8 encoding.
    # Dumping a new list in append mode would leave several arrays in one file, which is not valid
    # JSON, so we merge with the stored products and rewrite the whole file as a single array
    output_file = os.path.join(output_folder, 'ikea_products.json')
    all_products = _read_json_items(output_file)
    all_products.extend(products_list)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_products, f, ensure_ascii=False, indent=4)

    # To make sure we do not extract the same category again we add it to the extracted_categories list
    # We update target_category with current timestamp and add it to the extracted_categories list
    target_category["timestamp"] = time.strftime('%Y-%m-%d %H:%M:%S')
    extracted_categories.append(target_category)

    # We save the extracted_categories list to the extracted_categories.json file
    with open(os.path.join(output_path, 'extracted_categories.json'), 'w', encoding="utf-8") as f:
        json.dump(extracted_categories, f, indent=4)

In [None]:
# Names of the categories already extracted in previous runs; those are skipped
extracted_categories_texts = [category["text"] for category in extracted_categories]
for target_category in target_categories:
    if target_category["text"] in extracted_categories_texts:
        print(f"Category {target_category['text']} already extracted, skipping...")
        continue

    # extract_all_category_products takes (driver, target_category): the driver must be passed too
    extract_all_category_products(driver, target_category)

    # We reload extracted_categories and refresh the names, so a category listed twice in
    # target_categories is not extracted a second time
    with open(os.path.join(output_path, 'extracted_categories.json'), encoding="utf-8") as f:
        extracted_categories = json.load(f)
    extracted_categories_texts = [category["text"] for category in extracted_categories]