### Parameters

In [1]:
# Site root. Each catalog URL in `pages` starts with this prefix.
main_url = 'https://www.bruneau.fr/'

# Catalog listing pages to scrape.
pages = [
    'https://www.bruneau.fr/catalog/bureaux/196105474o-jmbpr',
    'https://www.bruneau.fr/catalog/chaises-bureau/196105475o-jmbpr',
    'https://www.bruneau.fr/catalog/rangements/196105476o-jmbpr',
    'https://www.bruneau.fr/catalog/accessoires/196105477o-jmbpr',
]

# Folder that receives the exported spreadsheet (Windows-style path).
save_path = r'C:\Users\quang\Documents'

# Base delay in seconds, used both for fixed sleeps and explicit-wait timeouts.
time_sleeps = 3

### Import libraries

In [2]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from typing import Union
import urllib.parse
import pandas as pd

In [3]:
# Pair each catalog URL with a display category: strip the site root, then
# title-case the second path segment (e.g. 'chaises-bureau' -> 'Chaises-Bureau').
# Despite its name, `pages_dict` is a list of (category, relative_path) tuples.
pages_dict = [
    (path.split('/')[1].title(), path)
    for path in (full_url.removeprefix(main_url) for full_url in pages)
]
pages_dict

[('Bureaux', 'catalog/bureaux/196105474o-jmbpr'),
 ('Chaises-Bureau', 'catalog/chaises-bureau/196105475o-jmbpr'),
 ('Rangements', 'catalog/rangements/196105476o-jmbpr'),
 ('Accessoires', 'catalog/accessoires/196105477o-jmbpr')]

### Open the main page & close pop-ups

In [4]:
# Chrome options: exclude the 'enable-logging' switch (commonly done to keep
# the driver's console logging out of the output).
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# ChromeDriverManager().install() returns the path of a chromedriver binary,
# which the Service wraps for the browser session.
s = Service(ChromeDriverManager().install())

# Start the browser, maximize it, and open the site root.
driver = webdriver.Chrome(service=s, options=options)
driver.maximize_window()
driver.get(main_url)

In [5]:
# Let the landing page settle, then click the consent banner's accept button
# (id 'onetrust-accept-btn-handler') once it becomes clickable.
time.sleep(time_sleeps)
consent_button = WebDriverWait(driver, time_sleeps).until(
    EC.element_to_be_clickable((By.XPATH, "//button[@id='onetrust-accept-btn-handler']"))
)
# click() returns None, so its result is intentionally not bound to a name.
consent_button.click()

In [6]:
# Wait twice the base delay before looking for the pop-in, then click its
# close button as soon as it is clickable.
time.sleep(2 * time_sleeps)
popin_close_locator = (By.CSS_SELECTOR, "button[class='isg-popin-close isg-button-reset']")
WebDriverWait(driver, time_sleeps).until(EC.element_to_be_clickable(popin_close_locator)).click()

### Get data

In [7]:
def _first_text(tile, xpath):
    """Return the text of the first element under `tile` matching `xpath`, or None."""
    matches = tile.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else None


def _product_link(tile):
    """Return the tile's product href without query string/fragment, or None."""
    anchors = tile.find_elements(By.XPATH, ".//a[contains(@class, 'isg-catalog-product-top')]")
    if not anchors:
        return None
    href = anchors[0].get_attribute('href')
    # get_attribute may return None when the anchor carries no href.
    return href.split('#')[0].split('?')[0] if href else None


def get_products(page_url, products_per_page=48):
    """Scrape every product tile of one catalog category into a DataFrame.

    Loads ``main_url + page_url``, derives the number of listing pages from
    the element whose class contains ``isg-product-list-more`` (the last
    space-separated word of its first text line is parsed as the total
    product count), then visits each page via ``?page=N`` and records one
    row per product tile.

    Args:
        page_url: Category path relative to ``main_url``.
        products_per_page: Divisor used to turn the total product count into
            a page count. Defaults to 48, the value previously hard-coded.

    Returns:
        pandas.DataFrame with columns ``product_name``, ``product_price``,
        ``product_status`` and ``product_url``, one row per tile. A field
        that is absent from a tile is stored as ``None`` so that the four
        columns always stay aligned row by row.

    Raises:
        ValueError: if ``products_per_page`` is smaller than 1.
        selenium.common.exceptions.TimeoutException: if no product tile
            becomes visible on a listing page within ``time_sleeps`` seconds.
    """
    # Local import keeps this cell self-contained; selenium is already a
    # dependency of the notebook.
    from selenium.common.exceptions import WebDriverException

    if products_per_page < 1:
        raise ValueError('products_per_page must be >= 1')

    columns = ['product_name', 'product_price', 'product_status', 'product_url']
    url = f'{main_url}{page_url}'
    driver.get(url)
    time.sleep(time_sleeps)

    try:
        more_blocks = WebDriverWait(driver, time_sleeps).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[contains(@class, 'isg-product-list-more')]"))
        )
        counter_line = more_blocks[0].text.split('\n')[0]
        total_products = int(counter_line.split(' ')[-1])
        # Ceiling division: a partially filled last page still counts.
        total_pages = -(-total_products // products_per_page)
    except (WebDriverException, ValueError, IndexError):
        # Counter missing or unparsable: fall back to a single page. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        total_pages = 1

    records = []
    for page_num in range(1, total_pages + 1):
        # Page 1 is already loaded by the initial driver.get above.
        if page_num != 1:
            driver.get(f'{url}?page={page_num}')
            time.sleep(time_sleeps)
        tiles = WebDriverWait(driver, time_sleeps).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[contains(@class, 'isg-product-list-item pure-u-1')]"))
        )
        for tile in tiles:
            # One record per tile keeps name/price/status/url of the same
            # product on the same row even when a field is missing.
            records.append({
                'product_name': _first_text(tile, ".//p[contains(@class, 'isg-catalog-product-title')]"),
                'product_price': _first_text(tile, ".//span[contains(@class, 'isg-price-value')]"),
                'product_status': _first_text(tile, ".//span[contains(@class, 'isg-product-stock')]"),
                'product_url': _product_link(tile),
            })

    return pd.DataFrame(records, columns=columns)

In [8]:
# Scrape each category and tag its rows with the category label, then stack
# the per-category frames into one table with a fresh 0..n-1 index.
product_datas = [
    get_products(page_url).assign(category=category)
    for category, page_url in pages_dict
]
all_data = pd.concat(product_datas, axis=0, ignore_index=True)
all_data

Unnamed: 0,product_name,product_price,product_status,product_url,category
0,Bureau Biblioffice L 140cm 9 cases,"295,00 € ht",En stock,https://www.bruneau.fr/product/bureau-biblioff...,Bureaux
1,Bureau Biblioffice L 140cm 6 cases,"259,00 € ht",En stock,https://www.bruneau.fr/product/bureau-biblioff...,Bureaux
2,Bureau Yale L 114 cm avec étagère chêne,"129,00 € ht",En stock,https://www.bruneau.fr/product/bureau-yale-l-1...,Bureaux
3,Bureau Designy,"149,00 € ht",En stock,https://www.bruneau.fr/product/bureau-designy/...,Bureaux
4,Bureau Horizon chêne clair L 134 cm piètement ...,"275,00 € ht",En stock,https://www.bruneau.fr/product/bureau-horizon-...,Bureaux
...,...,...,...,...,...
122,Plante artificielle d’intérieur haie de Bambou...,"625,00 € ht",Arrivage,https://www.bruneau.fr/product/plante-artifici...,Accessoires
123,Bras support écran LX - Ergotron,"215,00 € ht",Arrivage,https://www.bruneau.fr/product/bras-support-ec...,Accessoires
124,Chargeur induction sans fil - technologie QI,"62,95 € ht",En stock,https://www.bruneau.fr/product/chargeur-induct...,Accessoires
125,Multiprise encastrable avec couvercle coulissa...,"73,50 € ht",En stock,https://www.bruneau.fr/product/multiprise-enca...,Accessoires


In [9]:
# Write the combined table to an Excel workbook inside save_path
# (the DataFrame index is written too, since to_excel's default is kept).
output_file = f'{save_path}\\all_data.xlsx'
all_data.to_excel(output_file)