In [2]:
import pandas as pd
from collections import Counter
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import numpy as np
from itertools import combinations
import uuid
import os
# Load the perfume dataset from a CSV located next to the notebook.
# `df` is the module-level frame that the helper functions and plots below read.
df = pd.read_csv("./parfums.csv")

In [None]:
# Distinct raw values of the 'type' column (bracket access avoids any clash
# with DataFrame attributes).
df['type'].unique()

In [4]:
def clean_type(row):
    """Return the cleaned perfume type for one row.

    Rules, applied in order:
      * a missing type (None/NaN) is reported as 'Inconnu' instead of the
        literal string 'nan';
      * surrounding whitespace and a single trailing '-' are removed;
      * a value starting with '-' is treated as a misplaced year: when the
        remainder parses as an integer it is written to ``row['year']`` (as a
        string) and the type becomes 'Inconnu'; when it does not parse, the
        type still becomes 'Inconnu' and the year is left untouched;
      * a value that is empty after cleaning is reported as 'Inconnu'.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['type']`` and item assignment to ``row['year']``
        (a dict or a pandas Series both work).

    Returns
    -------
    str
        The cleaned type.

    Notes
    -----
    The year is stored by mutating ``row``. That is only visible to the caller
    when the caller owns the row object; pandas does not support mutating the
    row handed to ``DataFrame.apply``.
    """
    raw_type = row['type']
    if pd.isna(raw_type):
        return 'Inconnu'

    type_value = str(raw_type).strip()

    # Remove trailing '-'
    if type_value.endswith('-'):
        type_value = type_value[:-1].strip()

    # A leading '-' means the cell actually holds "-<year>".
    if type_value.startswith('-'):
        try:
            year = int(type_value[1:].strip())
        except ValueError:
            # Not a parsable year: nothing to move, the type is simply unknown.
            pass
        else:
            row['year'] = str(year)
        return 'Inconnu'

    return type_value or 'Inconnu'

# Apply the cleaning function to the 'type' column AND persist the year it may
# recover. clean_type() writes the recovered year into the row it receives;
# pandas does not support mutating the row passed to DataFrame.apply, so that
# write is not reliably propagated back to `df`. Each row is therefore handed
# over as a plain dict that this cell owns, and both columns are written back.
type_year_rows = [
    {'type': type_cell, 'year': year_cell}
    for type_cell, year_cell in zip(df['type'], df['year'])
]
df['type'] = [clean_type(type_year_row) for type_year_row in type_year_rows]
df['year'] = [type_year_row['year'] for type_year_row in type_year_rows]

In [None]:
# Distinct values of the 'type' column after cleaning.
df['type'].unique()

In [None]:
# Distinct values of the 'year' column.
df['year'].unique()

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# --- Statistical Summary ---
# Prints the row count, the number of distinct values in the main categorical
# columns, the min/max of 'year', and the per-column count of missing cells.
print("=== Statistical Summary ===")
print(f"Total perfumes: {len(df)}")
print(f"Unique brands: {df['brand'].nunique()}")
print(f"Unique types: {df['type'].nunique()}")
print(f"Unique genders: {df['gender'].nunique()}")
print(f"Unique families: {df['family'].nunique()}")
# NOTE(review): min()/max() compare the raw column values; if 'year' holds
# strings the range is lexicographic rather than numeric — confirm the dtype.
print(f"Year range: {df['year'].min()} - {df['year'].max()}")
print("\nMissing values per column:")
print(df.isnull().sum())

# --- Helper Functions ---
def split_and_count(column, data=None):
    """Count the tokens of a semicolon-separated column.

    Every non-missing cell is split on ';'; each token is stripped and
    lower-cased. Empty tokens (produced by a trailing or doubled ';') are
    ignored so they never show up as a '' entry in the counts.

    Parameters
    ----------
    column : str
        Name of the column to tokenise.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences, in order of first appearance.
    """
    frame = df if data is None else data
    counts = Counter()
    for entry in frame[column].dropna():
        # str() keeps the helper usable if a cell was parsed as a number.
        tokens = (token.strip().lower() for token in str(entry).split(';'))
        counts.update(token for token in tokens if token)
    return counts

def get_co_occurrences(column1, column2, data=None):
    """Count how often a value of ``column1`` appears with a value of ``column2``.

    Rows where either column is missing are skipped. Both cells are split on
    ';', stripped and lower-cased; empty tokens are dropped. Only cross-column
    pairs ``(value_from_column1, value_from_column2)`` are counted — pairs of
    two values from the same column are not, so the result really describes
    the relation *between* the two columns.

    Parameters
    ----------
    column1, column2 : str
        Names of the two semicolon-separated columns.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``df``.

    Returns
    -------
    collections.Counter
        ``(value1, value2)`` -> number of rows containing both.
    """
    def split_cell(cell):
        return [token.strip().lower() for token in str(cell).split(';') if token.strip()]

    frame = df if data is None else data
    co_occ = Counter()
    for _, row in frame.dropna(subset=[column1, column2]).iterrows():
        values1 = split_cell(row[column1])
        values2 = split_cell(row[column2])
        for v1 in values1:
            for v2 in values2:
                co_occ[(v1, v2)] += 1
    return co_occ

# --- Olfactory Profile Analysis ---
# Token frequencies for the facettes column and each of the three note columns.
(facettes_counts,
 notes_tete_counts,
 notes_coeur_counts,
 notes_fond_counts) = (
    split_and_count(column_name)
    for column_name in ('facettes', 'notes_tete', 'notes_coeur', 'notes_fond')
)

# Co-occurrence of facettes and family
facettes_family_co = get_co_occurrences('facettes', 'family')
print("\n=== Top 5 Facettes-Family Co-occurrences ===",
      pd.Series(facettes_family_co).nlargest(5),
      sep="\n")


In [None]:
# List the matplotlib style sheet names available in this environment.
plt.style.available

In [None]:

# --- Visualizations ---
# Set style for better visuals
# The style name must be one of the entries of plt.style.available;
# sns.set_palette selects the "husl" palette for subsequent seaborn plots.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# 1. Word Clouds
def generate_wordcloud(counts, title, filename=None):
    """Display a word cloud built from a word -> frequency mapping.

    Parameters
    ----------
    counts : mapping
        Word -> frequency (for example a ``collections.Counter``).
    title : str
        Title drawn above the figure.
    filename : str, optional
        Currently unused: the figure is only shown, never written to disk.
        The parameter is kept (now optional) so existing calls keep working.

    Returns
    -------
    None
    """
    if not counts:
        # WordCloud cannot lay out an empty frequency table; skip instead of
        # aborting the whole cell.
        print(f"Skipping word cloud '{title}': no frequencies to draw.")
        return
    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate_from_frequencies(counts)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


In [None]:

# One word cloud per tokenised column: (frequencies, figure title, file stem).
for wc_counts, wc_title, wc_filename in (
    (facettes_counts, 'Word Cloud of Facettes', 'facettes_wordcloud'),
    (notes_tete_counts, 'Word Cloud of Top Notes', 'notes_tete_wordcloud'),
    (notes_coeur_counts, 'Word Cloud of Heart Notes', 'notes_coeur_wordcloud'),
    (notes_fond_counts, 'Word Cloud of Base Notes', 'notes_fond_wordcloud'),
):
    generate_wordcloud(wc_counts, wc_title, wc_filename)


In [None]:
# Show the facette token counts next to the raw (untokenised) family value counts.
facettes_counts.items(), Counter(df['family'].dropna()).items()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import unicodedata

# Normalize strings to remove accents and standardize case
def normalize_string(s):
    """Lower-case ``s`` and drop its combining accent marks.

    The text is decomposed with Unicode NFD so that an accented letter becomes
    a base letter followed by combining marks (category 'Mn'); those marks are
    then discarded, e.g. 'Épicée' -> 'epicee'.
    """
    decomposed = unicodedata.normalize('NFD', s.lower())
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

# Build the facette x family count matrix from the data itself.
# `facettes_counts` (token counts computed earlier) supplies the row labels and
# the raw 'family' values supply the column labels, so the matrix follows the
# CSV instead of a pasted snapshot of counts that silently goes stale.
family_counts = Counter(df['family'].dropna())

# Create lists of facettes and families (empty tokens are not valid labels)
top_facettes = [facette for facette in facettes_counts.keys() if facette]
top_families = list(family_counts.keys())

# Initialize heatmap data with zeros
heatmap_data = pd.DataFrame(0, index=top_facettes, columns=top_families)

# Accent/case-insensitive lookup from a normalised name to its display label.
# setdefault keeps the first label when two labels normalise identically.
facette_label_by_norm = {}
for facette_label in top_facettes:
    facette_label_by_norm.setdefault(normalize_string(facette_label), facette_label)
family_label_by_norm = {}
for family_label in top_families:
    family_label_by_norm.setdefault(normalize_string(family_label), family_label)

# facettes_family_co maps (facette, family) -> count. Several raw keys can
# normalise to the same cell, so counts are accumulated rather than overwritten.
for (facette, family), count in facettes_family_co.items():
    row_label = facette_label_by_norm.get(normalize_string(facette))
    column_label = family_label_by_norm.get(normalize_string(family))
    if row_label is not None and column_label is not None:
        heatmap_data.loc[row_label, column_label] += count

# Draw the annotated facette x family count matrix on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Heatmap of Facettes Belonging to Olfactory Families')
ax.set_xlabel('Olfactory Family')
ax.set_ylabel('Facette')
fig.tight_layout()
plt.show()

In [None]:

# 3. Stacked Bar Chart: Notes Distribution by Gender and Family
def get_note_distribution(note_column, group_column):
    """Count the overall top-5 notes of ``note_column`` within each group.

    The five most frequent notes are determined once over the whole notebook
    frame ``df`` (via ``split_and_count``); then, for every distinct value of
    ``group_column``, the occurrences of those notes in that group's rows are
    counted.

    Parameters
    ----------
    note_column : str
        Semicolon-separated notes column.
    group_column : str
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group (in order of first appearance), one column per top
        note. Empty (but well-formed) when there are no groups.
    """
    # The ranking does not depend on the group, so compute it once, outside the loop.
    ranked_notes = [note for note, _ in split_and_count(note_column).most_common() if note]
    top_notes = ranked_notes[:5]

    groups = df[group_column].unique()
    rows = []
    for group in groups:
        group_df = df[df[group_column] == group]
        group_counts = dict.fromkeys(top_notes, 0)
        for entry in group_df[note_column].dropna():
            for note in (part.strip().lower() for part in entry.split(';')):
                if note in group_counts:
                    group_counts[note] += 1
        rows.append([group_counts[note] for note in top_notes])
    return pd.DataFrame(rows, index=groups, columns=top_notes)

# Notes by gender
notes_tete_gender = get_note_distribution('notes_tete', 'gender')
# DataFrame.plot creates its own figure when no `ax` is given, so the size is
# passed to it directly; calling plt.figure() first only left an empty figure
# behind and the requested size was not applied to the chart.
ax = notes_tete_gender.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Top Notes Distribution by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.legend(title='Top Notes')
plt.show()

# Notes by family
notes_tete_family = get_note_distribution('notes_tete', 'family')
# DataFrame.plot creates its own figure when no `ax` is given, so the size is
# passed to it directly; calling plt.figure() first only left an empty figure
# behind and the requested size was not applied to the chart.
ax = notes_tete_family.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Top Notes Distribution by Family')
ax.set_xlabel('Olfactory Family')
ax.set_ylabel('Count')
ax.legend(title='Top Notes')
plt.show()

# 4. Line Plot: Trends in Types and Families Over Time
year_type_counts = df.groupby(['year', 'type']).size().unstack().fillna(0)
year_family_counts = df.groupby(['year', 'family']).size().unstack().fillna(0)
# Label slice with string bounds: limits the rows to 2000-2025.
# NOTE(review): this relies on the 'year' index holding strings — confirm.
year_type_counts = year_type_counts.loc["2000":"2025"]
year_family_counts = year_family_counts.loc["2000":"2025"]


def _plot_top_trends(counts, title, legend_title, top_n=5):
    """Plot one line per column for the ``top_n`` columns with the largest total.

    ``counts`` is a year x category frame of counts. Columns are ranked by their
    sum over the displayed years, so the lines really are the most frequent
    categories rather than the first ``top_n`` column labels.
    """
    top_columns = counts.sum().nlargest(top_n).index
    plt.figure(figsize=(12, 6))
    for column in top_columns:
        plt.plot(counts.index, counts[column], marker='o', label=column)
    plt.title(title)
    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.legend(title=legend_title)
    plt.grid(True)
    plt.show()


_plot_top_trends(year_type_counts, 'Perfume Types Over Time (2000-2025)', 'Type')
_plot_top_trends(year_family_counts, 'Olfactory Families Over Time (2000-2025)', 'Family')

# 5. Box Plot: Years by Gender and Concentration
# Elsewhere in this notebook 'year' is handled as text (it is written with
# str(...) during cleaning and sliced with string labels), while a box plot
# needs a numeric y axis. Plot a numeric copy; values that are not numbers
# become NaN and are left out of the boxes. `df` itself is not modified.
boxplot_df = df.assign(year=pd.to_numeric(df['year'], errors='coerce'))
plt.figure(figsize=(12, 6))
sns.boxplot(x='gender', y='year', hue='concentration', data=boxplot_df)
plt.title('Distribution of Perfume Releases by Gender and Concentration')
plt.xlabel('Gender')
plt.ylabel('Year')
plt.legend(title='Concentration')
plt.show()

# The figures in this cell are shown with plt.show() and never written to
# disk, so the closing message must not claim that they were saved.
print("\nVisualizations rendered inline above.")

In [None]:

# --- Enhancement 1: Missing Value Heatmap ---
import missingno as msno
# msno.heatmap opens its own figure unless an `ax` is supplied, so the size is
# passed through its `figsize` argument; a preceding plt.figure() would only
# leave an empty figure behind.
msno.heatmap(df, figsize=(10, 6))
plt.title('Missing Value Heatmap')
plt.show()

# --- Enhancement 2: Bar Plots for Top Facettes and Families ---
# Top 10 Facettes
# Reuse split_and_count so tokens are stripped and lower-cased exactly like in
# the rest of the notebook (' Boisée' and 'boisée' are the same facette).
facette_token_counts = split_and_count('facettes')
facette_token_counts.pop('', None)  # an empty token is not a facette
facettes_top10 = dict(facette_token_counts.most_common(10))
plt.figure(figsize=(10, 5))
plt.bar(list(facettes_top10.keys()), list(facettes_top10.values()), color='skyblue')
plt.title('Top 10 Facettes')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Top 10 Families
# Raw 'family' values (missing cells dropped) ranked by frequency.
family_value_counts = Counter(df['family'].dropna())
families_top10 = dict(family_value_counts.most_common(10))
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(families_top10.keys(), families_top10.values(), color='salmon')
ax.set_title('Top 10 Olfactory Families')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# --- Enhancement 3: Brand Analysis ---
# Top 10 Brands by Perfume Count
# value_counts() is already sorted by descending frequency.
brand_counts = df['brand'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(10, 5))
brand_counts.plot(kind='bar', color='mediumseagreen', ax=ax)
ax.set_title('Top 10 Brands by Number of Perfumes')
ax.set_ylabel('Number of Perfumes')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Distribution of Types for Top 5 Brands
# Rows of the five most frequent brands, counted per (brand, type) and pivoted
# so that each type becomes one stacked segment.
top5_brands = brand_counts.index[:5]
is_top5_brand = df['brand'].isin(top5_brands)
type_brand = (
    df[is_top5_brand]
    .groupby(['brand', 'type'])
    .size()
    .unstack()
    .fillna(0)
)
ax = type_brand.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_title('Distribution of Types for Top 5 Brands')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.legend(title='Type')
plt.tight_layout()
plt.show()

# --- Enhancement 4: Brand Diversity Score ---
# Diversity = number of unique facettes per brand
def _unique_facette_count(facette_cells):
    """Return how many distinct (stripped, lower-cased) facette tokens a brand has."""
    distinct_tokens = set()
    for tokens in facette_cells.dropna().str.split(';'):
        distinct_tokens.update(token.strip().lower() for token in tokens)
    return len(distinct_tokens)


brand_facette_diversity = df.groupby('brand')['facettes'].apply(_unique_facette_count)
brand_facette_diversity = brand_facette_diversity.sort_values(ascending=False).head(10)
fig, ax = plt.subplots(figsize=(10, 5))
brand_facette_diversity.plot(kind='bar', color='orchid', ax=ax)
ax.set_title('Top 10 Brands by Facette Diversity')
ax.set_ylabel('Number of Unique Facettes')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# --- Enhancement 5: (Optional) Interactive Plot Suggestion ---
# These lines only print a Plotly snippet as text; nothing is plotted here.
# The following cell runs the same snippet for real.
print('For interactive plots, consider using Plotly:')
print('import plotly.express as px')
print('fig = px.bar(brand_counts, x=brand_counts.index, y=brand_counts.values, title="Top 10 Brands by Number of Perfumes")')
print('fig.show()')


In [None]:
# Interactive version of the "Top 10 Brands" bar chart.
# `brand_counts` is the top-10 value_counts Series computed in an earlier cell.
import plotly.express as px
fig = px.bar(brand_counts, x=brand_counts.index, y=brand_counts.values, title="Top 10 Brands by Number of Perfumes")
fig.show()

In [None]:
# --- Co-occurrence of Facettes and Notes (all notes combined) ---
from collections import defaultdict

def get_facette_note_cooccurrence(df):
    """Count (facette, note) pairs appearing in the same row.

    For every row, the 'facettes' cell and the three note cells
    ('notes_tete', 'notes_coeur', 'notes_fond') are split on ';', stripped and
    lower-cased. Missing cells contribute no tokens, so no artificial 'nan'
    facette or note is ever counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'facettes', 'notes_tete', 'notes_coeur' and
        'notes_fond'.

    Returns
    -------
    collections.Counter
        ``(facette, note)`` -> number of co-occurrences.
    """
    def split_cell(cell):
        # str(NaN) would be the literal text 'nan'; treat missing as "no tokens".
        if pd.isna(cell):
            return []
        return [token.strip().lower() for token in str(cell).split(';') if token.strip()]

    note_columns = ('notes_tete', 'notes_coeur', 'notes_fond')
    co_occ = Counter()
    for _, row in df.iterrows():
        facettes = split_cell(row['facettes'])
        if not facettes:
            continue
        notes = [note for col in note_columns for note in split_cell(row[col])]
        for f in facettes:
            for n in notes:
                co_occ[(f, n)] += 1
    return co_occ

# Count every (facette, note) pair once, then show the 20 most frequent.
facette_note_co = get_facette_note_cooccurrence(df)
top_facette_note = pd.Series(facette_note_co).nlargest(20)
print("\n=== Top 20 Facette-Note Co-occurrences ===", top_facette_note, sep="\n")

# --- Visualize as Heatmap (all facettes x all notes) ---
# Rankings kept for later use. They order facettes/notes by the number of
# distinct pairs they appear in, not by their total co-occurrence count.
top_facettes = [f for f, _ in Counter([k[0] for k in facette_note_co.keys()]).most_common(10)]
top_notes = [n for n, _ in Counter([k[1] for k in facette_note_co.keys()]).most_common(10)]

# Get all unique facettes and notes
all_facettes = sorted(set([k[0] for k in facette_note_co.keys()]))
all_notes = sorted(set([k[1] for k in facette_note_co.keys()]))

# The plotted matrix covers every facette and every note. (A top-10 matrix used
# to be built first and was immediately overwritten; that dead step is gone.)
# NOTE(review): with many distinct notes an annotated 12x8 heatmap gets very
# crowded — the interactive top-20 view in a later cell is easier to read.
heatmap_data = pd.DataFrame(0, index=all_facettes, columns=all_notes)
for (f, n), count in facette_note_co.items():
    heatmap_data.at[f, n] = count

plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlOrRd', cbar_kws={'label': 'Count'})
plt.title('Heatmap of Facettes and Notes Co-occurrence')
plt.xlabel('Note')
plt.ylabel('Facette')
plt.tight_layout()
plt.show()

In [None]:
# --- Co-occurrence of Family and Notes (all notes combined) ---
from collections import Counter

def get_family_note_cooccurrence(df):
    """Count (family, note) pairs appearing in the same row.

    The 'family' cell is used as a single stripped, lower-cased value. The
    three note cells ('notes_tete', 'notes_coeur', 'notes_fond') are split on
    ';', stripped and lower-cased. Missing cells contribute nothing, so no
    artificial 'nan' note is ever counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'family', 'notes_tete', 'notes_coeur' and
        'notes_fond'.

    Returns
    -------
    collections.Counter
        ``(family, note)`` -> number of co-occurrences.
    """
    def split_cell(cell):
        # str(NaN) would be the literal text 'nan'; treat missing as "no tokens".
        if pd.isna(cell):
            return []
        return [token.strip().lower() for token in str(cell).split(';') if token.strip()]

    note_columns = ('notes_tete', 'notes_coeur', 'notes_fond')
    co_occ = Counter()
    for _, row in df.iterrows():
        if pd.isna(row['family']):
            continue
        fam = str(row['family']).strip().lower()
        if not fam:
            continue
        for col in note_columns:
            for n in split_cell(row[col]):
                co_occ[(fam, n)] += 1
    return co_occ

# Count every (family, note) pair once, then show the 20 most frequent.
family_note_co = get_family_note_cooccurrence(df)
top_family_note = pd.Series(family_note_co).nlargest(20)
print("\n=== Top 20 Family-Note Co-occurrences ===", top_family_note, sep="\n")

# --- Rank families and notes for the family/note heatmap ---
# Each label is ranked by the number of distinct (family, note) keys it appears in.
family_key_counts = Counter(fam for fam, _ in family_note_co)
note_key_counts = Counter(note for _, note in family_note_co)
top_families = [label for label, _ in family_key_counts.most_common(10)]
top_notes_fam = [label for label, _ in note_key_counts.most_common(10)]

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Get all unique families and notes
all_families = sorted(set([k[0] for k in family_note_co.keys()]))
all_notes = sorted(set([k[1] for k in family_note_co.keys()]))

# The plotted matrix covers every family and every note. (A top-10 matrix used
# to be built first and was immediately overwritten; that dead step is gone.)
# NOTE(review): with many distinct notes an annotated 12x8 heatmap gets very
# crowded — consider restricting it to the top families/notes.
heatmap_data_fam = pd.DataFrame(0, index=all_families, columns=all_notes)
for (fam, n), count in family_note_co.items():
    heatmap_data_fam.at[fam, n] = count

plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data_fam, annot=True, fmt='d', cmap='YlGnBu', cbar_kws={'label': 'Count'})
plt.title('Heatmap of Family and Notes Co-occurrence')
plt.xlabel('Note')
plt.ylabel('Family')
plt.tight_layout()
plt.show()

In [None]:
import plotly.express as px

# Interactive facette x note heatmap restricted to 20 facettes and 20 notes.
# Labels are ranked by the number of distinct (facette, note) keys they appear in.
top_facettes = [
    label for label, _ in Counter(f for f, _ in facette_note_co).most_common(20)
]
top_notes = [
    label for label, _ in Counter(n for _, n in facette_note_co).most_common(20)
]

# Zero-filled matrix; only pairs whose facette AND note are both ranked are copied in.
heatmap_data = pd.DataFrame(0, index=top_facettes, columns=top_notes)
for (f, n), count in facette_note_co.items():
    if f not in top_facettes or n not in top_notes:
        continue
    heatmap_data.loc[f, n] = count

# Reset index for Plotly (long-format copy: one row per facette/note pair)
heatmap_data_reset = heatmap_data.reset_index().melt(id_vars='index')
heatmap_data_reset.columns = ['Facette', 'Note', 'Count']

fig = px.imshow(
    heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    labels=dict(x="Note", y="Facette", color="Count"),
    color_continuous_scale="YlOrRd",
    aspect="auto",
)
fig.update_layout(title="Interactive Heatmap of Facettes and Notes Co-occurrence")
fig.show()