In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import os
from webdriver_manager.chrome import ChromeDriverManager
import time

## 1. Extracting Categories

In [72]:
# You have to download first a ChromeDriver which matches your Chrome version
# I installed it in my current workfolder as I am only gonna do this for this project
path = os.path.abspath(os.getcwd())
# Setup Chrome WebDriver
cService = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=cService)

# Navigate to the IKEA website
driver.get('https://www.ikea.com/es/')

# Wait for the page to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'hnf-carousel__tabs-navigation-products'))
)

# We expand the window to fit full screen
driver.maximize_window()


In [73]:
# Check for and handle the cookie consent banner
try:
    # Wait for the cookie consent banner to be visible
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "onetrust-banner-sdk"))
    )
    # Click the accept button
    accept_button = driver.find_element(By.ID, "onetrust-accept-btn-handler")
    accept_button.click()
except:
    # If the banner does not appear, handle the exception (it might not be there)
    print("No cookie consent banner found.")

# We zoom out to see the whole page
#driver.execute_script("document.body.style.zoom='50%'")

In [74]:
# Find the div and select the first link element
product_tab = driver.find_element(By.ID, 'hnf-carousel__tabs-navigation-products')
all_links = product_tab.find_elements(By.TAG_NAME, 'a')

In [78]:
visited_links = set()  # To keep track of visited links
ikea_product_categories = []
n_visited_links = 0
failed_links = []

In [79]:
for link in all_links:
    print("extracting categories from: ", link.text)
    try:
        link.click()
    except:
        print("Could not click on the link number ", str(n_visited_links))
        failed_links.append(link)
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.hnf-dropdown__columns a, .hnf-dropdown__column a'))
        )
    except:
        print("No categories found for this link")

    # Use BeautifulSoup to parse the updated page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all <a> elements within elements of class 'hnf-dropdown__columns'
    links = soup.select('.hnf-dropdown__columns a, .hnf-dropdown__column a')
    link_data = [{'href': link['href'], 'text': link.get_text(strip=True)} for link in links if link.has_attr('href')]
    
    ikea_product_categories.extend(link_data)
    
    time.sleep(2)
    n_visited_links += 1

extracting categories from:  Almacenaje y organización
extracting categories from:  Muebles
extracting categories from:  Productos de jardín y terraza
extracting categories from:  Macetas, plantas y jardinería
extracting categories from:  Muebles de baño y accesorios
extracting categories from:  Cocinas y electrodomésticos
extracting categories from:  Menaje para cocinar, comer y beber
extracting categories from:  Textiles del hogar
extracting categories from:  Decoración y espejos
extracting categories from:  Iluminación
Could not click on the link number  9
extracting categories from:  Bebés, niños y niñas
extracting categories from:  Camas y colchones
Could not click on the link number  11
extracting categories from:  Alfombras, felpudos y suelos
extracting categories from:  Colada y limpieza
Could not click on the link number  13
extracting categories from:  Accesorios para mascotas
extracting categories from:  Hogar inteligente y tecnología
Could not click on the link number  15
e

In [None]:
# We manually select the arrow to show the categories which failed

In [80]:
for link in failed_links:
    print("extracting categories from: ", link.text)
    try:
        link.click()
    except:
        print(f"Could not click on the link fo {link.text}")
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.hnf-dropdown__columns a, .hnf-dropdown__column a'))
        )
    except:
        print("No categories found for this link")

    # Use BeautifulSoup to parse the updated page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all <a> elements within elements of class 'hnf-dropdown__columns'
    links = soup.select('.hnf-dropdown__columns a, .hnf-dropdown__column a')
    link_data = [{'href': link['href'], 'text': link.get_text(strip=True)} for link in links if link.has_attr('href')]
    
    ikea_product_categories.extend(link_data)
    
    time.sleep(2)
    n_visited_links += 1

extracting categories from:  Iluminación
extracting categories from:  Camas y colchones
extracting categories from:  Colada y limpieza
extracting categories from:  Hogar inteligente y tecnología
extracting categories from:  Alimentos suecos y bebidas


In [84]:
len(ikea_product_categories)

214

In [83]:
#Save product categories to a json file in output folder use utf-8 encoding
import json
output_folder = os.path.join(path, 'output')
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, 'ikea_product_categories.json')
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(ikea_product_categories, f, ensure_ascii=False, indent=4)

In [85]:
driver.quit()

## 2. Extracting Subcategories

- We can extract all elements from subcategories at once, one after another
- Although we have to navigate to the bottom of the page to click on the next page arrow
- Still, we will miss some attributes from the furniture, such as the packages and weight.

In [52]:
from random import randint

In [None]:
# Loop to click the "Show more" button until it's no longer available
while True:
    try:
        # Wait for the button to be clickable
        show_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.plp-btn.plp-btn--small.plp-btn--secondary .plp-btn__label'))
        )
        # Scroll to the button and click it
        driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
        driver.execute_script("arguments[0].click();", show_more_button)
        # Wait for the next set of products to load
        time.sleep(randint(3, 7)) # Random timing, those intervals worked for me
    except Exception as e:
        print("No more 'Show more' buttons or an error occurred:", e)
        break

In [83]:
# Use BeautifulSoup to extract product details once we loaded all inventory
soup = BeautifulSoup(driver.page_source, 'html.parser')
products = soup.find_all('div', class_='plp-fragment-wrapper')
products_list = []
for product in products:
    try:
        url = product.find('a', class_='plp-price-link-wrapper link')["href"]
        name = product.find('span', class_='notranslate plp-price-module__product-name').get_text(strip=True)
        category_measures = product.find('span', class_='plp-price-module__description').get_text(strip=True)
        category = category_measures.split(',')[0]
        if len(category_measures.split(',')) > 1:
            measures_units = category_measures.split(',')[1].strip()
            measures = measures_units.split(' ')[0]
            units = measures_units.split(' ')[1]
        else:
            measures = None
            units = None
        price_section = product.find('span', {'aria-hidden': 'true'})
        price = ''.join([elem.get_text() for elem in price_section.find('span')]).replace(",€", "€")
    except AttributeError:
        continue  # Skips product if any info is missing

    product_data = {
        'url': url,
        'name': name,
        'category': category,
        'measures': measures,
        'price': price
    }

    products_list.append(product_data)


In [85]:
# Save product details to a json file in output folder use utf-8 encoding
output_file = os.path.join(output_folder, 'ikea_products.json')
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(products_list, f, ensure_ascii=False, indent=4)

In [87]:
driver.quit()

## 3. Extracting product specific attributes (number of packages it is made of, weight and dimensions of the package)

- We should not need selenium for this, we can just make requests as long as we have a cookie.
- We may get blocked soon because they have Akamai protecion implemented 
- May need to generate access cookies tokens manually and paste them here
- Can just query the endpoint /es/es/p/product-name-productnumber (just the url on ikea_products.json)

In [100]:
import requests
from bs4 import BeautifulSoup
import json

# Target URL
# We will have to iterate through this
url = 'https://www.ikea.com/es/es/p/malm-comoda-6-cajones-blanco-60403584/'

# Custom headers based on your specification
headers = {
    'Host': 'www.ikea.com',
    'Sec-Ch-Ua': '"Not-A.Brand";v="99", "Chromium";v="124"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'es-ES,es;q=0.9',
    'Connection': 'close'
}

# Custom cookies from your example
cookies = {
    'guest': 'eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6ImVxSFFLR3duR3hfV3dJZkx0RGpaeDA5MTUzS2xSam5fVE1nVUlMYlJ5RncifQ.eyJpc3MiOiJodHRwczovL2FwaS5pbmdrYS5pa2VhLmNvbS9ndWVzdCIsInN1YiI6ImIwMDJmYTBkLTkwMjEtNGQ2My04YzlkLTJhZDNlZjM0ZjE0YiIsInJldGFpbFVuaXQiOiJlcyIsImlhdCI6MTcxNTQ1MTY0MCwiZXhwIjoxNzE4MDQzNjQwfQ.NS0sPhnYbArE-750pTzE4_5I6wsCRgZxlKa0Cfyf0Z4bKi5jsfvcFWRS88jz6d0O1z6wLmfN5XN0tseCMzouaeLW_jmxGQ5qQF1_9J9uJny7R3t37Ku_lT41Psbu1ymQra2cLOZWwbungu41bNClWd3p3k4NaOXWx2fcivZrFUo'
}

# Make the GET request
# We may need to remove headers, or iterate using different profiles
response = requests.get(url, headers=headers, cookies=cookies)

# Check the response
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # extracting JSON object from a script tag within the HTML
    product_div = soup.find('div', class_='pip-product__subgrid product-pip js-product-pip')
    if product_div:
        # Extract the 'data-hydration-props' attribute, which contains JSON
        json_data_attr = product_div['data-hydration-props']
        
        # Convert HTML entities and escaped quotes to normal form
        json_data_attr = json_data_attr.replace('&quot;', '"')
        
        # Load string into a JSON object
        data = json.loads(json_data_attr)

        product_dimensions_json = data["productInformationSection"]["dimensionProps"]["packaging"]
        # We have to extract the dimensions from the JSON object because one object may have more than one package
        # We will likely need to extract other attributes we may need in the future as well
        
else:
    print("Failed to retrieve the page, status code:", response.status_code)


{'id': 'measurements-packaging',
 'title': 'Embalaje',
 'contentProps': {'packages': [{'name': 'MALM',
    'typeName': 'Cómoda 6 cajones',
    'itemNo': '60403584',
    'articleNumber': {'label': 'Número de artículo', 'value': '604.035.84'},
    'measurements': [[{'label': 'Ancho',
       'type': 'width',
       'text': '50 cm',
       'value': 50},
      {'label': 'Alto', 'type': 'height', 'text': '8 cm', 'value': 8},
      {'label': 'Largo', 'type': 'length', 'text': '169 cm', 'value': 169},
      {'label': 'Peso', 'type': 'weight', 'text': '35.55 kg', 'value': 35.55}],
     [{'label': 'Ancho', 'type': 'width', 'text': '46 cm', 'value': 46},
      {'label': 'Alto', 'type': 'height', 'text': '10 cm', 'value': 10},
      {'label': 'Largo', 'type': 'length', 'text': '84 cm', 'value': 84},
      {'label': 'Peso', 'type': 'weight', 'text': '17.45 kg', 'value': 17.45}],
     [{'label': 'Ancho', 'type': 'width', 'text': '46 cm', 'value': 46},
      {'label': 'Alto', 'type': 'height', 'text'