In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.figure_factory as ff
import unicodedata
from collections import Counter
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

: 

In [12]:
# 1. Load Data
# Read the perfume dataset into `df`; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('parfums.csv')

# 2. Data Cleaning
def _parse_type(raw_type):
    """Split one raw `type` cell into ``(cleaned_type, recovered_year)``.

    A trailing dash is dropped. A value that (after that) starts with a dash
    carries no usable type, so the type becomes 'Inconnu'; when the text after
    the dash parses as an integer it is returned as the recovered year (as a
    string), otherwise the recovered year is None.
    """
    type_value = str(raw_type).strip()
    if type_value.endswith('-'):
        type_value = type_value[:-1].strip()
    if not type_value.startswith('-'):
        return type_value, None
    try:
        return 'Inconnu', str(int(type_value[1:].strip()))
    except ValueError:
        return 'Inconnu', None


def clean_type(row):
    """Return the cleaned `type` label for one row (anything indexable by 'type').

    The row itself is not modified: under ``DataFrame.apply(axis=1)`` each row
    is a temporary copy, so writing ``row['year']`` there never reached the
    frame. Year recovery is done on the frame directly, below.
    """
    return _parse_type(row['type'])[0]


# Parse every `type` cell once, keeping both the cleaned label and any year found in it.
_parsed_types = pd.DataFrame(
    [_parse_type(value) for value in df['type']],
    index=df.index,
    columns=['type', 'year'],
)

# Write recovered years back into the real `year` column.
_has_recovered_year = _parsed_types['year'].notna()
if _has_recovered_year.any():
    _recovered_years = _parsed_types.loc[_has_recovered_year, 'year']
    if pd.api.types.is_numeric_dtype(df['year']):
        # Keep a numeric column numeric instead of mixing str and numbers.
        _recovered_years = _recovered_years.astype(int)
    df.loc[_has_recovered_year, 'year'] = _recovered_years.to_numpy()

df['type'] = _parsed_types['type'].to_numpy()

def normalize_string(s):
    """Return `s` lower-cased, with accents removed and outer whitespace trimmed.

    Null values (as judged by ``pd.isnull``) become the empty string. Any other
    value is converted with ``str`` first.
    """
    if pd.isnull(s):
        return ''
    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # so dropping the marks leaves the unaccented letter.
    decomposed = unicodedata.normalize('NFD', str(s).lower())
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip()

# Normalise every free-text column, then turn the placeholder strings
# ('inconnu', and the 'nan' produced by astype(str) on missing cells) into real NaN.
text_columns = ['brand', 'type', 'gender', 'family', 'facettes', 'notes_tete', 'notes_coeur', 'notes_fond']
for col in text_columns:
    as_text = df[col].astype(str)
    df[col] = as_text.map(normalize_string)

df.replace(to_replace=['inconnu', 'nan'], value=np.nan, inplace=True)

In [None]:
# Distinct cleaned type labels (bracket access, as in the rest of the notebook).
df['type'].unique()

# 3. Basic Exploration

In [None]:
# Preview the first rows of `df`.
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()


In [None]:
# Summary statistics for every column, not only the numeric ones (include='all').
df.describe(include='all')

In [None]:
# Gather the headline figures first, then print them in one place.
n_perfumes = len(df)
n_brands, n_types, n_genders, n_families = (
    df[name].nunique() for name in ('brand', 'type', 'gender', 'family')
)
first_year, last_year = df['year'].min(), df['year'].max()
missing_per_column = df.isnull().sum()

print(f"Total perfumes: {n_perfumes}")
print(f"Unique brands: {n_brands}")
print(f"Unique types: {n_types}")
print(f"Unique genders: {n_genders}")
print(f"Unique families: {n_families}")
print(f"Year range: {first_year} - {last_year}")
print('\nMissing values per column:')
print(missing_per_column)

# 4. Distributions & Visualizations

In [25]:
# Global plot styling for all following figures: the 'seaborn-v0_8' matplotlib
# style sheet plus seaborn's 'husl' colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

## Perfumes by Year

In [None]:
# Coerce `year` to numbers (unparseable values become NaN) and count perfumes per year.
df['year'] = pd.to_numeric(df['year'], errors='coerce')
perfumes_by_year = df.groupby('year').size()

fig, ax = plt.subplots(figsize=(12, 6))
perfumes_by_year.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Perfumes')
ax.set_title('Number of Perfumes by Year')
fig.tight_layout()
plt.show()

## Perfumes by Gender

In [None]:
# Pie chart of the share of perfumes per gender label.
gender_counts = df['gender'].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
gender_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    colors=['lightcoral', 'lightskyblue', 'lightgreen'],
    ax=ax,
)
ax.set_title('Distribution of Perfumes by Gender')
ax.set_ylabel('')  # hide the column name pandas would otherwise show on the y-axis
fig.tight_layout()
plt.show()

## Perfumes by Brand (Top 20)


In [None]:
# Bar chart of the 20 brands with the most perfumes.
top_brands = df['brand'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(14, 6))
top_brands.plot(kind='bar', ax=ax)
ax.set_title('Top 20 Brands by Number of Perfumes')
ax.set_xlabel('Brand')
ax.set_ylabel('Number of Perfumes')
fig.tight_layout()
plt.show()

## Perfumes by Family


In [None]:
# Bar chart of perfume counts per olfactory family.
family_counts = df['family'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
family_counts.plot(kind='bar', ax=ax)
ax.set_title('Perfumes by Olfactory Family')
ax.set_xlabel('Family')
ax.set_ylabel('Number of Perfumes')
fig.tight_layout()
plt.show()

## Perfumes by Type


In [None]:
# Bar chart of perfume counts per type.
type_counts = df['type'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
type_counts.plot(kind='bar', ax=ax)
ax.set_title('Perfumes by Type')
ax.set_xlabel('Type')
ax.set_ylabel('Number of Perfumes')
fig.tight_layout()
plt.show()

# 5. Olfactory Notes Analysis

In [31]:
def split_and_count(column, data=None):
    """Count how often each ';'-separated value occurs in a column.

    Parameters
    ----------
    column : str
        Name of the column whose cells hold ';'-separated values.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``df``, which keeps
        existing one-argument calls working; pass a frame explicitly to count
        over a subset.

    Returns
    -------
    collections.Counter
        Occurrences of each value after stripping whitespace and lower-casing.
        Missing cells and empty tokens are ignored.
    """
    frame = df if data is None else data
    tokens = (
        piece.strip().lower()
        for entry in frame[column].dropna()
        for piece in entry.split(';')
    )
    return Counter(token for token in tokens if token)

# One Counter per multi-valued column, in the order: facettes, top, heart, base notes.
facettes_counts, notes_tete_counts, notes_coeur_counts, notes_fond_counts = (
    split_and_count(column_name)
    for column_name in ('facettes', 'notes_tete', 'notes_coeur', 'notes_fond')
)

### Word Clouds


In [None]:
# Draw one word cloud per counter; word size follows the value's frequency.
cloud_inputs = [
    (facettes_counts, 'Facettes'),
    (notes_tete_counts, 'Top Notes'),
    (notes_coeur_counts, 'Heart Notes'),
    (notes_fond_counts, 'Base Notes'),
]
for counts, title in cloud_inputs:
    cloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis')
    cloud.generate_from_frequencies(counts)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud of {title}')
    fig.tight_layout()
    plt.show()

### Bar Plots of the Top 20 Facettes and Notes


In [None]:
# One bar chart per counter showing its 20 most frequent values.
bar_inputs = [
    (facettes_counts, 'Facettes'),
    (notes_tete_counts, 'Top Notes'),
    (notes_coeur_counts, 'Heart Notes'),
    (notes_fond_counts, 'Base Notes'),
]
for counts, title in bar_inputs:
    ranked = pd.Series(counts).sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 6))
    ranked.head(20).plot(kind='bar', ax=ax)
    ax.set_title(f'Top 20 {title}')
    ax.set_xlabel(title)
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()

# 6. Co-occurrences


In [33]:
def multi_hot_crosstab(df, row_col, multi_col):
    """Cross-tabulate a categorical column against a ';'-separated multi-value column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    row_col : str
        Column whose values become the rows of the result.
    multi_col : str
        Column whose cells hold ';'-separated values; each distinct value
        becomes one column of the result.

    Returns
    -------
    pandas.DataFrame
        For every `row_col` value, the number of rows of `df` that list each
        `multi_col` value. Rows whose `row_col` is missing are dropped by the
        group-by.
    """
    # Remove blanks around each token so 'rose; jasmin' and 'jasmin;rose'
    # land in the same 'jasmin' column instead of ' jasmin' vs 'jasmin'.
    tokens = df[multi_col].str.replace(r'\s*;\s*', ';', regex=True).str.strip()
    # get_dummies already yields 0/1 indicators, so no extra clamping is needed.
    dummies = tokens.str.get_dummies(sep=';')
    dummies.index = df[row_col]
    return dummies.groupby(level=0).sum()

def plot_interactive_heatmap(pivot_df, row_label, col_label, title, colorscale='YlGnBu'):
    """Show a pivot table as an annotated, interactive plotly heatmap.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Table to draw; its index labels the y-axis and its columns the x-axis.
    row_label : str
        Title for the y-axis (what the rows represent).
    col_label : str
        Title for the x-axis (what the columns represent).
    title : str
        Figure title.
    colorscale : str, optional
        Plotly colour scale name.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    cell_values = pivot_df.values
    fig = ff.create_annotated_heatmap(
        z=cell_values,
        x=list(pivot_df.columns),
        y=list(pivot_df.index),
        colorscale=colorscale,
        showscale=True,
        annotation_text=cell_values.astype(str),
    )
    # The labels were previously accepted but never shown; use them as axis titles.
    fig.update_layout(
        title_text=title,
        title_font_size=22,
        xaxis_title=col_label,
        yaxis_title=row_label,
    )
    fig.show()

### Family vs Facettes


In [None]:
# Rows: olfactory family; columns: facettes.
pivot_fam_fac = multi_hot_crosstab(df, row_col='family', multi_col='facettes')
plot_interactive_heatmap(
    pivot_fam_fac,
    row_label='Family',
    col_label='Facette',
    title='Family vs Facettes',
)


### Family vs Top Notes


In [None]:
# Rows: olfactory family; columns: top notes.
pivot_fam_note_tete = multi_hot_crosstab(df, row_col='family', multi_col='notes_tete')
plot_interactive_heatmap(
    pivot_fam_note_tete,
    row_label='Family',
    col_label='Top Note',
    title='Family vs Top Notes',
)

### Family vs Heart Notes

In [None]:
# Rows: olfactory family; columns: heart notes.
pivot_fam_note_coeur = multi_hot_crosstab(df, row_col='family', multi_col='notes_coeur')
plot_interactive_heatmap(
    pivot_fam_note_coeur,
    row_label='Family',
    col_label='Heart Note',
    title='Family vs Heart Notes',
)


### Family vs Base Notes


In [None]:
# Rows: olfactory family; columns: base notes.
pivot_fam_note_fond = multi_hot_crosstab(df, row_col='family', multi_col='notes_fond')
plot_interactive_heatmap(
    pivot_fam_note_fond,
    row_label='Family',
    col_label='Base Note',
    title='Family vs Base Notes',
)

In [38]:
# 7. Stacked Bar Charts: Notes by Gender and Family
def get_note_distribution(note_column, group_column):
    """Count, per group, how often the five overall most common notes occur.

    Parameters
    ----------
    note_column : str
        Column of the notebook-level ``df`` holding ';'-separated notes.
    group_column : str
        Column of ``df`` whose non-null distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group, one column per note. The notes are the five most
        common values of `note_column` over the whole of ``df``, so every row
        shares the same columns.
    """
    # The reference notes do not depend on the group: compute them once.
    # This also keeps `top_notes` defined when there are no groups at all.
    top_notes = [note for note, _ in split_and_count(note_column).most_common(5)]
    groups = df[group_column].dropna().unique()

    rows = []
    for group in groups:
        group_counts = dict.fromkeys(top_notes, 0)
        group_entries = df.loc[df[group_column] == group, note_column].dropna()
        for entry in group_entries:
            for raw_note in entry.split(';'):
                cleaned = raw_note.strip().lower()
                if cleaned in group_counts:
                    group_counts[cleaned] += 1
        rows.append([group_counts[note] for note in top_notes])
    return pd.DataFrame(rows, index=groups, columns=top_notes)


### Notes by Gender


In [None]:
# Stacked bars: one bar per gender, split by the five most common top notes.
notes_tete_gender = get_note_distribution('notes_tete', 'gender')

ax = notes_tete_gender.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Top Notes Distribution by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.legend(title='Top Notes')
ax.figure.tight_layout()
plt.show()

### Notes by Family

In [None]:
# Stacked bars: one bar per olfactory family, split by the five most common top notes.
notes_tete_family = get_note_distribution('notes_tete', 'family')

ax = notes_tete_family.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Top Notes Distribution by Family')
ax.set_xlabel('Olfactory Family')
ax.set_ylabel('Count')
ax.legend(title='Top Notes')
ax.figure.tight_layout()
plt.show()


# 8. Trends Over Time

In [None]:
# Perfume counts per (year, type) and per (year, family); missing combinations become 0.
year_type_counts = df.groupby(['year', 'type']).size().unstack().fillna(0)
year_family_counts = df.groupby(['year', 'family']).size().unstack().fillna(0)
# Keep only rows with a real year label. No year-range limit is applied here.
year_type_counts = year_type_counts.loc[year_type_counts.index.notnull()]
year_family_counts = year_family_counts.loc[year_family_counts.index.notnull()]

# Plot the five most frequent types overall. The columns are sorted
# alphabetically, so `columns[:5]` would pick an arbitrary subset.
top_types = year_type_counts.sum().nlargest(5).index

plt.figure(figsize=(12, 6))
for column in top_types:
    plt.plot(year_type_counts.index, year_type_counts[column], marker='o', label=column)
plt.title('Perfume Types Over Time')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend(title='Type')
plt.tight_layout()
plt.show()


In [None]:

# Plot the five most frequent families overall. The columns are sorted
# alphabetically, so `columns[:5]` would pick an arbitrary subset.
top_families = year_family_counts.sum().nlargest(5).index

plt.figure(figsize=(12, 6))
for column in top_families:
    plt.plot(year_family_counts.index, year_family_counts[column], marker='o', label=column)
plt.title('Perfume Families Over Time')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend(title='Family')
plt.tight_layout()
plt.show()


# 9. Summary & Insights


In [None]:
# Collect the headline numbers first, then print the recap.
top_families_summary = df['family'].value_counts().head(5).to_dict()
top_facettes_summary = dict(facettes_counts.most_common(5))
top_tete_summary = dict(notes_tete_counts.most_common(5))
top_coeur_summary = dict(notes_coeur_counts.most_common(5))
top_fond_summary = dict(notes_fond_counts.most_common(5))
gender_summary = df['gender'].value_counts().to_dict()
brand_summary = df['brand'].value_counts().head(5).to_dict()
type_summary = df['type'].value_counts().to_dict()
year_low, year_high = df['year'].min(), df['year'].max()

print('\n=== Summary & Insights ===')
print('Most common olfactory families:', top_families_summary)
print('Most common facettes:', top_facettes_summary)
print('Most common top notes:', top_tete_summary)
print('Most common heart notes:', top_coeur_summary)
print('Most common base notes:', top_fond_summary)
print('Gender distribution:', gender_summary)
print('Brand distribution (top 5):', brand_summary)
print('Type distribution:', type_summary)
print('Year range:', year_low, '-', year_high)