In [1]:
import json
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## 1. Extracting Categories

In [72]:
# Absolute path of the current working folder; the export cell at the end of
# the notebook builds its output directory from it.
path = os.path.abspath(os.getcwd())

# Start Chrome. ChromeDriverManager().install() supplies the path of the
# chromedriver executable, so no manual driver download is needed here
# (per webdriver_manager's documented behaviour -- confirm for your version).
cService = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=cService)

# Open the Spanish IKEA storefront.
driver.get('https://www.ikea.com/es/')

# Block (up to 10 s) until the products tab strip is present in the DOM;
# the later cells look this element up by the same id.
products_nav_present = EC.presence_of_element_located(
    (By.ID, 'hnf-carousel__tabs-navigation-products')
)
WebDriverWait(driver, timeout=10).until(products_nav_present)

# Maximise the browser window.
driver.maximize_window()


In [73]:
# Dismiss the OneTrust cookie consent banner if it shows up.
#
# We wait for the accept button itself to become clickable (rather than for
# the banner container to merely exist in the DOM) so the click is not
# attempted while the banner is still animating in.
# Only Selenium errors are handled: a timeout means the banner never appeared,
# any other WebDriver error means it appeared but could not be dismissed.
# Non-Selenium errors (e.g. KeyboardInterrupt, NameError) propagate.
try:
    accept_button = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    )
    accept_button.click()
except TimeoutException:
    # The banner is optional; its absence is not an error.
    print("No cookie consent banner found.")
except WebDriverException as exc:
    # Best effort: report the problem and let the notebook continue.
    print(f"Could not dismiss the cookie consent banner: {exc}")

# We zoom out to see the whole page
#driver.execute_script("document.body.style.zoom='50%'")

In [74]:
# Look up the products tab strip by its id, then collect *every* <a> element
# nested inside it (presumably one anchor per product tab).
product_tab = driver.find_element(by=By.ID, value='hnf-carousel__tabs-navigation-products')
all_links = product_tab.find_elements(by=By.TAG_NAME, value='a')

In [78]:
# Shared state for the scraping cells below. Re-run this cell to start from a
# clean slate.
ikea_product_categories, failed_links = [], []  # scraped {'href', 'text'} records / tabs whose click failed
n_visited_links = 0      # running count of tabs processed so far
visited_links = set()    # set for keeping track of links already seen

In [79]:
# CSS selector matching every category anchor inside a navigation dropdown.
DROPDOWN_LINK_SELECTOR = '.hnf-dropdown__columns a, .hnf-dropdown__column a'

# Click each product tab in turn and harvest the category links from the
# dropdown it opens.
#   - Tabs that cannot be clicked are queued in `failed_links` for the retry
#     cell and are NOT scraped: the page source would still reflect whatever
#     was open before, yielding stale/duplicate records.
#   - `visited_links` remembers every href already stored, so a category is
#     recorded once even if it shows up in several dropdowns or the cell is
#     re-run.
for link in all_links:
    print("extracting categories from: ", link.text)
    try:
        link.click()
    except WebDriverException:
        # n_visited_links is still the 0-based index of this tab here.
        print("Could not click on the link number ", str(n_visited_links))
        failed_links.append(link)
        n_visited_links += 1
        continue

    try:
        # Give the dropdown up to 5 s to render its links.
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, DROPDOWN_LINK_SELECTOR))
        )
    except TimeoutException:
        print("No categories found for this link")

    # Parse a snapshot of the rendered page with BeautifulSoup.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for anchor in soup.select(DROPDOWN_LINK_SELECTOR):
        href = anchor.get('href')
        # Skip anchors without an href and hrefs that were already recorded.
        if href is None or href in visited_links:
            continue
        visited_links.add(href)
        ikea_product_categories.append({'href': href, 'text': anchor.get_text(strip=True)})

    # Short pause between tabs before clicking the next one.
    time.sleep(2)
    n_visited_links += 1

extracting categories from:  Almacenaje y organización
extracting categories from:  Muebles
extracting categories from:  Productos de jardín y terraza
extracting categories from:  Macetas, plantas y jardinería
extracting categories from:  Muebles de baño y accesorios
extracting categories from:  Cocinas y electrodomésticos
extracting categories from:  Menaje para cocinar, comer y beber
extracting categories from:  Textiles del hogar
extracting categories from:  Decoración y espejos
extracting categories from:  Iluminación
Could not click on the link number  9
extracting categories from:  Bebés, niños y niñas
extracting categories from:  Camas y colchones
Could not click on the link number  11
extracting categories from:  Alfombras, felpudos y suelos
extracting categories from:  Colada y limpieza
Could not click on the link number  13
extracting categories from:  Accesorios para mascotas
extracting categories from:  Hogar inteligente y tecnología
Could not click on the link number  15
e

In [None]:
# Manual step: in the browser window, click the arrow so that the tabs which failed above become visible, then run the next cell to retry them.

In [80]:
# CSS selector matching every category anchor inside a navigation dropdown.
DROPDOWN_LINK_SELECTOR = '.hnf-dropdown__columns a, .hnf-dropdown__column a'

# Retry the tabs whose click failed in the first pass.
#   - A tab that still cannot be clicked is reported and skipped instead of
#     scraping a page that still reflects whatever was open before.
#   - Hrefs already present in `visited_links` are not stored again, so this
#     cell can safely be re-run.
for link in failed_links:
    print("extracting categories from: ", link.text)
    try:
        link.click()
    except WebDriverException:
        print(f"Could not click on the link for {link.text}")
        n_visited_links += 1
        continue

    try:
        # Give the dropdown up to 5 s to render its links.
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, DROPDOWN_LINK_SELECTOR))
        )
    except TimeoutException:
        print("No categories found for this link")

    # Parse a snapshot of the rendered page with BeautifulSoup.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for anchor in soup.select(DROPDOWN_LINK_SELECTOR):
        href = anchor.get('href')
        # Skip anchors without an href and hrefs that were already recorded.
        if href is None or href in visited_links:
            continue
        visited_links.add(href)
        ikea_product_categories.append({'href': href, 'text': anchor.get_text(strip=True)})

    # Short pause between tabs before clicking the next one.
    time.sleep(2)
    n_visited_links += 1

extracting categories from:  Iluminación
extracting categories from:  Camas y colchones
extracting categories from:  Colada y limpieza
extracting categories from:  Hogar inteligente y tecnología
extracting categories from:  Alimentos suecos y bebidas


In [83]:
# Save the scraped categories as UTF-8 encoded JSON in <path>/output/.
# (`json` is imported in the notebook's top import cell.)
output_folder = os.path.join(path, 'output')
os.makedirs(output_folder, exist_ok=True)  # create the folder on first run
output_file = os.path.join(output_folder, 'ikea_product_categories.json')
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes accented characters as-is instead of \u escapes.
    json.dump(ikea_product_categories, f, ensure_ascii=False, indent=4)

# Report what was written so an unexpectedly empty export is easy to spot.
print(f"Saved {len(ikea_product_categories)} categories to {output_file}")