In [14]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# Configuration
BASE_URL = "https://www.olfastory.com"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
HEADERS = {"User-Agent": USER_AGENT}

# Target categories (based on provided HTML and previous context)
CATEGORIES = ["/parfum/homme", "/parfum/femme", "/parfums/mixte"]


In [7]:
# List to store all perfume data
perfumes_data = []

# Function to extract details from a perfume's main page
def extract_perfume_details(perfume_url):
    try:
        response = requests.get(perfume_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Erreur {response.status_code} pour {perfume_url}")
            return {
                "concentration": "Inconnu",
                "facettes": "Inconnu",
                "date_creation": "Inconnu",
                "parfumeurs": "Inconnu",
                "description_title": "Inconnu",
                "description_text": "Inconnu"
            }

        soup = BeautifulSoup(response.text, "html.parser")
        details = {}

        # Find the details section
        details_section = soup.find("div", class_="c-list__group")
        if details_section:
            # Concentration
            concentration_tag = details_section.find("div", class_="c-list__label", string="Concentration")
            if concentration_tag:
                concentration = concentration_tag.find_next("div", class_="c-list__content").find("li")
                details["concentration"] = concentration.text.strip() if concentration else "Inconnu"
            else:
                details["concentration"] = "Inconnu"

            # Facettes
            facettes_tag = details_section.find("div", class_="c-list__label", string="Facettes")
            if facettes_tag:
                facettes = [li.text.strip() for li in facettes_tag.find_next("div", class_="c-list__content").find_all("li")]
                details["facettes"] = ";".join(facettes) if facettes else "Inconnu"
            else:
                details["facettes"] = "Inconnu"

            # Date de création
            date_tag = details_section.find("div", class_="c-list__label", string="Date de création")
            if date_tag:
                date = date_tag.find_next("div", class_="c-list__content").find("li")
                details["date_creation"] = date.text.strip() if date else "Inconnu"
            else:
                details["date_creation"] = "Inconnu"

            # Parfumeurs
            parfumeurs_tag = details_section.find("div", class_="c-list__label", string="Parfumeurs")
            if parfumeurs_tag:
                parfumeurs = [li.text.strip() for li in parfumeurs_tag.find_next("div", class_="c-list__content").find_all("li")]
                details["parfumeurs"] = ";".join(parfumeurs) if parfumeurs else "Inconnu"
            else:
                details["parfumeurs"] = "Inconnu"
        else:
            details.update({
                "concentration": "Inconnu",
                "facettes": "Inconnu",
                "date_creation": "Inconnu",
                "parfumeurs": "Inconnu"
            })

        # Description (title and text)
        content_section = soup.find("div", class_="c-tab__content")
        if content_section:
            # Try h2 or h3 for title
            title_tag = content_section.find(["h2", "h3"])
            details["description_title"] = title_tag.text.strip() if title_tag else "Inconnu"

            # Find the description text (first <p> after title or within content)
            description_tag = content_section.find("p", recursive=True)
            details["description_text"] = description_tag.text.strip() if description_tag else "Inconnu"
        else:
            details["description_title"] = "Inconnu"
            details["description_text"] = "Inconnu"

        return details

    except Exception as e:
        print(f"Erreur lors de l'accès à {perfume_url}: {e}")
        return {
            "concentration": "Inconnu",
            "facettes": "Inconnu",
            "date_creation": "Inconnu",
            "parfumeurs": "Inconnu",
            "description_title": "Inconnu",
            "description_text": "Inconnu"
        }

# Function to extract composition from a perfume's composition page
def extract_composition(composition_url):
    try:
        response = requests.get(composition_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Erreur {response.status_code} pour {composition_url}")
            return {
                "notes_tete": "Inconnu",
                "notes_coeur": "Inconnu",
                "notes_fond": "Inconnu"
            }

        soup = BeautifulSoup(response.text, "html.parser")
        composition = {}

        # Find all pyramid areas
        pyramid_areas = soup.find_all("div", class_="c-pyramide__area")
        for area in pyramid_areas:
            label = area.find("div", class_="c-pyramide__label")
            if label:
                label_text = label.text.strip()
                notes = [li.text.strip() for li in area.find_all("li")]
                notes_text = ";".join(notes) if notes else "Inconnu"
                if "Notes de tête" in label_text:
                    composition["notes_tete"] = notes_text
                elif "Notes de coeur" in label_text:
                    composition["notes_coeur"] = notes_text
                elif "Notes de fond" in label_text:
                    composition["notes_fond"] = notes_text

        # Ensure all fields are present
        composition.setdefault("notes_tete", "Inconnu")
        composition.setdefault("notes_coeur", "Inconnu")
        composition.setdefault("notes_fond", "Inconnu")

        return composition

    except Exception as e:
        print(f"Erreur lors de l'accès à {composition_url}: {e}")
        return {
            "notes_tete": "Inconnu",
            "notes_coeur": "Inconnu",
            "notes_fond": "Inconnu"
        }

# Function to extract perfumes from a category page
def extract_perfumes(category_url, category_name, page_number):
    try:
        # Construct URL
        if page_number == 0:
            url = category_url
        else:
            url = f"{category_url}?page={page_number - 1}"

        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Erreur {response.status_code} pour {url}")
            return False, []

        soup = BeautifulSoup(response.text, "html.parser")
        perfume_cards = soup.find_all("div", class_="c-product__card")
        if not perfume_cards:
            print(f"Aucun parfum trouvé pour {url}")
            return False, []

        print(f"\n=== Parfums de la catégorie : {url} (Page {page_number}) ===\n")
        page_perfumes = []

        for card in perfume_cards:
            perfume = {}

            # Name and Perfume URL
            name_tag = card.find("div", class_="c-product__title").find("a") if card.find("div", class_="c-product__title") else None
            perfume["name"] = name_tag.text.strip() if name_tag else "Inconnu"
            perfume["perfume_url"] = name_tag["href"] if name_tag and "href" in name_tag.attrs else "Inconnu"

            # Brand
            brand_tag = card.find("span", class_="c-product__brand")
            perfume["brand"] = brand_tag.text.strip() if brand_tag else "Inconnu"

            # Type and Year
            details_tag = card.find("div", class_="c-product__details")
            if details_tag:
                details_text = details_tag.text.strip()
                details_parts = details_text.split(" - ")
                perfume["type"] = details_parts[0] if len(details_parts) > 0 else "Inconnu"
                perfume["year"] = details_parts[1] if len(details_parts) > 1 else "Inconnu"
            else:
                perfume["type"] = "Inconnu"
                perfume["year"] = "Inconnu"

            perfume["gender"] = category_name.capitalize()

            # Olfactory Family
            family_tag = card.find("div", class_="c-product__family").find("span") if card.find("div", class_="c-product__family") else None
            perfume["family"] = family_tag.text.strip() if family_tag else "Inconnu"

            # Image URL
            img_tag = card.find("img", class_="lazyload")
            perfume["image_url"] = img_tag["data-src"] if img_tag and "data-src" in img_tag.attrs else "Inconnu"

            # Fetch additional details from perfume page
            if perfume["perfume_url"] != "Inconnu":
                details = extract_perfume_details(perfume["perfume_url"])
                perfume.update(details)

                # Fetch composition from composition page
                composition_url = f"{perfume['perfume_url']}/composition/#o"
                composition = extract_composition(composition_url)
                perfume.update(composition)
            else:
                perfume.update({
                    "concentration": "Inconnu",
                    "facettes": "Inconnu",
                    "date_creation": "Inconnu",
                    "parfumeurs": "Inconnu",
                    "description_title": "Inconnu",
                    "description_text": "Inconnu",
                    "notes_tete": "Inconnu",
                    "notes_coeur": "Inconnu",
                    "notes_fond": "Inconnu"
                })

            # Print perfume data
            print("Parfum trouvé :")
            print(f"  Nom : {perfume['name']}")
            print(f"  URL : {perfume['perfume_url']}")
            print(f"  Marque : {perfume['brand']}")
            print(f"  Type : {perfume['type']}")
            print(f"  Année : {perfume['year']}")
            print(f"  Genre : {perfume['gender']}")
            print(f"  Famille olfactive : {perfume['family']}")
            print(f"  URL de l'image : {perfume['image_url']}")
            print(f"  Concentration : {perfume['concentration']}")
            print(f"  Facettes : {perfume['facettes']}")
            print(f"  Date de création : {perfume['date_creation']}")
            print(f"  Parfumeurs : {perfume['parfumeurs']}")
            print(f"  Titre de description : {perfume['description_title']}")
            print(f"  Texte de description : {perfume['description_text']}")
            print(f"  Notes de tête : {perfume['notes_tete']}")
            print(f"  Notes de cœur : {perfume['notes_coeur']}")
            print(f"  Notes de fond : {perfume['notes_fond']}")
            print("-" * 50)

            page_perfumes.append(perfume)
            time.sleep(1)  # Delay for individual perfume page requests

        # Check for next page
        next_page = soup.find("li", class_="pager-next")  # Adapt selector if needed
        has_next_page = next_page and "disabled" not in next_page.get("class", [])

        return has_next_page, page_perfumes

    except Exception as e:
        print(f"Erreur lors de l'accès à {url}: {e}")
        return False, []

# Main scraping logic
try:
    response = requests.get(BASE_URL, headers=HEADERS)
    if response.status_code != 200:
        print(f"Erreur {response.status_code} pour {BASE_URL}")
        exit()

    soup = BeautifulSoup(response.text, "html.parser")
    category_links = soup.find_all("li", class_="js-equalizer__item c-nav__nosubmenu")
    found_categories = []

    for li in category_links:
        a_tag = li.find("a", href=True)
        if a_tag and a_tag["href"] in CATEGORIES:
            category_href = a_tag["href"]
            found_categories.append(category_href)

    print("\nCatégories trouvées dans le menu de navigation :")
    if found_categories:
        for cat in found_categories:
            print(f"- {cat}")
    else:
        print("Aucune catégorie correspondante trouvée. Utilisation des catégories prédéfinies.")

    categories_to_scrape = found_categories if found_categories else CATEGORIES

    for category in categories_to_scrape:
        full_url = BASE_URL + category
        category_name = category.split("/")[-1]
        page_number = 0

        while True:
            print(f"\nScraping {full_url} (Page {page_number})")
            has_next_page, page_perfumes = extract_perfumes(full_url, category_name, page_number)
            perfumes_data.extend(page_perfumes)

            if not has_next_page:
                break

            page_number += 1
            time.sleep(0.5)  # Delay for category page requests

    # Create DataFrame
    df = pd.DataFrame(perfumes_data)
    if not df.empty:
        columns = [
            "name", "perfume_url", "brand", "type", "year", "gender", "family", "image_url",
            "concentration", "facettes", "date_creation", "parfumeurs",
            "description_title", "description_text",
            "notes_tete", "notes_coeur", "notes_fond"
        ]
        df = df[columns]
        # Save to CSV
        df.to_csv("parfums.csv", index=False, encoding="utf-8")
        # Save to Excel
        df.to_excel("parfums.xlsx", index=False, engine="openpyxl")
        print(f"\nBase de données sauvegardée dans 'parfums.csv' et 'parfums.xlsx' avec {len(df)} parfums.")
    else:
        print("\nAucun parfum extrait. Vérifiez les sélecteurs ou le contenu dynamique.")

except Exception as e:
    print(f"Erreur lors de l'accès à {BASE_URL}: {e}")


Catégories trouvées dans le menu de navigation :
- /parfum/homme
- /parfum/femme
- /parfums/mixte

Scraping https://www.olfastory.com/parfum/homme (Page 0)

=== Parfums de la catégorie : https://www.olfastory.com/parfum/homme (Page 0) ===

Parfum trouvé :
  Nom : Acqua di Giò Profondo
  URL : https://www.olfastory.com/parfum/acqua-di-gio-profondo-0
  Marque : Armani
  Type : Eau de toilette
  Année : 2025
  Genre : Homme
  Famille olfactive : Aromatique
  URL de l'image : https://www.olfastory.com/sites/www.olfastory.com/files/styles/300x300/public/acqua-di-gio-profondo-eau-de-toilette.jpg?itok=N2WigyFu
  Concentration : Eau de toilette
  Facettes : Marine
  Date de création : 2025
  Parfumeurs : Alberto Morillas
  Titre de description : Présentation du parfum
  Texte de description : Présentation du parfumImmersion dans le bleu profondSortie printemps 2025, cette Eau de Toilette revisite la signature marine d’Armani : plus aérienne que l’édition Parfum, elle retranscrit l’instant où 

In [24]:
df.head()

Unnamed: 0,name,perfume_url,brand,type,year,gender,family,image_url,concentration,facettes,date_creation,parfumeurs,description_title,description_text,notes_tete,notes_coeur,notes_fond
0,Acqua di Giò Profondo,https://www.olfastory.com/parfum/acqua-di-gio-...,Armani,Eau de toilette,2025,Homme,Aromatique,https://www.olfastory.com/sites/www.olfastory....,Eau de toilette,Marine,2025,Alberto Morillas,Présentation du parfum,Présentation du parfumImmersion dans le bleu p...,Cardamome;Citron;Mandarine;Pamplemousse,Cyprès;Lavandin;Note marine;Sauge sclarée,Musc;Patchouli
1,L.12.12 Silver Grey,https://www.olfastory.com/parfum/l1212-silver-...,Lacoste,Eau de parfum,2025,Homme,Fougère,https://www.olfastory.com/sites/www.olfastory....,Eau de parfum,Boisée,2025,Bruno Jovanovic,Présentation du parfum Lacoste L.12.12 Silver ...,Présentation du parfum Lacoste L.12.12 Silver ...,Encens;Mandarine,Géranium;Lavande,Ambroxan;Vétiver
2,Original,https://www.olfastory.com/parfum/original,Lacoste,Extrait,2025,Homme,Fougère,https://www.olfastory.com/sites/www.olfastory....,Extrait,Ambrée;Epicée,2025,Inconnu,Lacoste Original Parfum – Carte d’identité,Lacoste Original Parfum – Carte d’identitéLe c...,Bergamote;Cardamome;Poivre,Lavande;Lavandin;Sauge sclarée,Fève tonka;Patchouli;Santal
3,A*Men Stellar,https://www.olfastory.com/parfum/amen-stellar,Thierry Mugler,Eau de parfum,2025,Homme,Boisée,https://www.olfastory.com/sites/www.olfastory....,Eau de parfum,Aromatique;Gourmande,2025,Jacques Huclier;Louise Turner,A*Men Stellar : la nouvelle détonation gourman...,A*Men Stellar : la nouvelle détonation gourman...,Bergamote;Lavande,Pistache,Notes Boisées
4,Code Elixir,https://www.olfastory.com/parfum/code-elixir,Armani,Eau de parfum,2025,Homme,Orientale,https://www.olfastory.com/sites/www.olfastory....,Eau de parfum,Ambrée;Cuir;Epicée,2025,Inconnu,Armani Code Elixir : la nouvelle décharge d’in...,Armani Code Elixir : la nouvelle décharge d’in...,Mandarine,Cuir,Fève tonka


In [10]:
df_homme = df[df["gender"].str.capitalize() == "Homme"]
df_femme = df[df["gender"].str.capitalize() == "Femme"]
df_mixte = df[df["gender"].str.capitalize() == "Mixte"]

    # Print summary of filtered data
print(f"Nombre de parfums pour Homme : {len(df_homme)}")
print(f"Nombre de parfums pour Femme : {len(df_femme)}")
print(f"Nombre de parfums pour Mixte : {len(df_mixte)}")

    # Create new Excel file with separate sheets
with pd.ExcelWriter("parfums_par_genre.xlsx", engine="openpyxl") as writer:
    df_homme.to_excel(writer, sheet_name="Homme", index=False)
    df_femme.to_excel(writer, sheet_name="Femme", index=False)
    df_mixte.to_excel(writer, sheet_name="Mixte", index=False)

Nombre de parfums pour Homme : 582
Nombre de parfums pour Femme : 1192
Nombre de parfums pour Mixte : 277


In [11]:
df.to_json("parfums.json", orient="records", indent=2)
df_homme.to_json("parfums_homme.json", orient="records", indent=2)
df_femme.to_json("parfums_femme.json", orient="records", indent=2)
df_mixte.to_json("parfums_mixte.json", orient="records", indent=2)

In [16]:
df.family.unique()

array(['Aromatique', 'Fougère', 'Boisée', 'Orientale', 'Hespéridée',
       'Fleurie', 'Chyprée', ''], dtype=object)

In [17]:
df.columns

Index(['name', 'perfume_url', 'brand', 'type', 'year', 'gender', 'family',
       'image_url', 'concentration', 'facettes', 'date_creation', 'parfumeurs',
       'description_title', 'description_text', 'notes_tete', 'notes_coeur',
       'notes_fond'],
      dtype='object')

In [18]:
unique_scents = set(df[['notes_tete', 'notes_coeur', 'notes_fond']]
                    .stack()  # Stack to combine all values into a single Series
                    .str.split(';', expand=True)  # Split by semicolon
                    .stack()  # Stack again to get individual scents
                    .str.strip()  # Remove any leading/trailing whitespace
                    .unique())  # Get unique values

# Convert to list if needed
unique_scents = list(unique_scents)
print(unique_scents)

['Ambre', "Mousse d'arbre", 'Fleur de Citronnier', 'Banane', 'Baies de genévrier', 'Bigarade', 'Orchidée', 'Cascarille', 'Muscs blancs', 'Gentiane', 'Note aqueuse', "Feuilles d'Ajonc", 'Bergamote', 'Epices', 'Framboise', 'Glycine', 'Champagne Rosé', 'Evernyl', 'Gardénia', 'Gurjan', 'Notes fumées', 'Orange', 'Caramel', 'Mûre', 'Mangue', 'Eau de Rose', 'Tubéreuse', 'Akigalawood®', 'Tabac', 'Oliban', 'Cuir', 'Cannabis', 'Cyclamen', 'Estragon', 'Laurier', 'Pain', 'Poivre', 'Palo Santo', 'Silex', 'Thym', 'Amande amère', 'Immortelle', 'Nashi pear', 'Safran', 'Petitgrain', 'Dahlia', 'Freesia', 'Macchiato', 'Piña Colada', "Fleur d'Oranger", 'Buchu', 'Dragée', 'Cyprès', 'Hédione', 'Cerise', 'Ambroxan', "Cire d'abeille", 'Popcorn', 'Notes de Pluie', 'Poivre Rose', 'Chocolat', 'Baies roses', 'Eau de mer', 'Concombre', 'Praline', 'Ebene', 'Romarin', 'Galbanum', 'Maninka', 'Cypriol', 'Accord aromatique', 'Hibiscus', 'Poudre à canon', 'Bellini', 'Costus', 'Fève tonka', 'Bois', 'Cassie', 'Algue', 'Pi

In [23]:
len(unique_scents)

336

In [21]:
scent_counts = (df[['notes_tete', 'notes_coeur', 'notes_fond']]
                .stack()  # Combine all values into a single Series
                .str.split(';', expand=True)  # Split by semicolon
                .stack()  # Get individual scents
                .str.strip()  # Remove whitespace
                .value_counts()  # Count occurrences of each scent
                )

# Convert to a sorted list of (scent, count) tuples (optional)
sorted_scents = scent_counts.reset_index().values.tolist()
print(sorted_scents)


[['Bergamote', 712], ['Vanille', 690], ['Jasmin', 679], ['Patchouli', 653], ['Rose', 642], ['Muscs blancs', 547], ['Cèdre', 487], ['Mandarine', 443], ['Santal', 440], ['Citron', 383], ['Vétiver', 355], ["Fleur d'Oranger", 347], ['Ambre', 272], ['Iris', 266], ['Fève tonka', 251], ['Musc', 250], ['Lavande', 240], ['Pamplemousse', 198], ['Néroli', 173], ['Orange', 170], ['Bois ambrés', 168], ['Pêche', 163], ['Cardamome', 161], ['Poire', 153], ['Ylang-Ylang', 147], ['Cassis', 145], ['Mousse de chêne', 144], ['Benjoin', 142], ['Géranium', 135], ['Poivre noir', 133], ['Menthe', 131], ['Tubéreuse', 131], ['Note verte', 130], ['Cuir', 125], ['Violette', 123], ['Sauge sclarée', 122], ['Gingembre', 122], ['Muguet', 122], ['Pomme', 120], ['Pivoine', 116], ['Framboise', 111], ['Notes Boisées', 110], ['Baies roses', 108], ['Note marine', 105], ['Cannelle', 103], ['Aldéhydes', 101], ['Caramel', 101], ['Encens', 99], ['Poivre Rose', 94], ['Labdanum', 91], ['Héliotrope', 91], ['Freesia', 88], ['Corian

In [22]:

# Or, print as a DataFrame with scents and counts
print(scent_counts.reset_index(name='count').rename(columns={'index': 'scent'}))

                 scent  count
0            Bergamote    712
1              Vanille    690
2               Jasmin    679
3            Patchouli    653
4                 Rose    642
..                 ...    ...
331  Fleurs de palmier      1
332         Paradisone      1
333          Hortensia      1
334            Calamus      1
335         Cascarille      1

[336 rows x 2 columns]
