In [1]:
import pandas as pd

# Load the perfume dataset from the CSV file into a DataFrame.
df = pd.read_csv("parfums.csv")

In [None]:
# Clean the dataset: some rows carry the year inside the 'type' column.
def update_type_and_year(row):
    """Split a row's raw 'type' value into a cleaned type and a year.

    Args:
        row: One DataFrame row (as passed by ``df.apply(..., axis=1)``) that
            has a 'type' and a 'year' entry.

    Returns:
        pd.Series of two values ``[type, year]``:
        - raw type starting with '- ' (e.g. '- 2019'): the text after the
          prefix is returned as the year and the type becomes 'Inconnu';
        - missing / non-string raw type: type becomes 'Inconnu', the row's
          existing year is kept;
        - otherwise: the type with trailing spaces and dashes removed, and
          the row's existing year.
    """
    raw_type = row['type']

    # Missing values (NaN/None) are not strings, so .startswith() would raise.
    if not isinstance(raw_type, str):
        return pd.Series(['Inconnu', row['year']])

    if raw_type.startswith('- '):
        # Everything after the '- ' prefix is kept as the year (as text).
        return pd.Series(['Inconnu', raw_type[2:]])

    # rstrip(' -') removes any run of trailing spaces and dashes.
    return pd.Series([raw_type.rstrip(' -'), row['year']])
# Run the row-wise cleaner and write both resulting columns back at once
df[['type', 'year']] = df.apply(update_type_and_year, axis='columns')
# Display the distinct type labels that remain after cleaning
df['type'].unique()

In [None]:
# Display the column labels of the DataFrame.
df.columns

In [None]:
import matplotlib.pyplot as plt

# Count how many perfumes fall under each distinct 'year' value
perfumes_by_year = df.groupby('year').size()

# Draw the counts as a bar chart and label it through the returned Axes
perfumes_by_year.plot.bar(color='skyblue').set(
    xlabel='Year',
    ylabel='Number of Perfumes',
    title='Number of Perfumes by Year',
)

# Render the figure
plt.show()

In [None]:
# Tally the perfumes per gender label
perfumes_by_gender = df['gender'].value_counts()

# One pie slice per gender label, annotated with its percentage share
perfumes_by_gender.plot.pie(
    autopct='%1.1f%%',
    colors=['lightcoral', 'lightskyblue', 'lightgreen'],
)

# Title of the chart
plt.title('Distribution of Perfumes by Gender')

# Render the figure
plt.show()

In [None]:
import plotly.express as px

# Count the perfumes per brand first: passing y='name' directly would put the
# perfume names themselves on the y-axis instead of a number of perfumes.
perfumes_by_brand = (
    df['brand']
    .value_counts()
    .rename_axis('brand')
    .reset_index(name='count')
)

# Create a bar plot of the per-brand counts
fig = px.bar(perfumes_by_brand, x='brand', y='count', title='Number of Perfumes by Brand')

# Show the plot
fig.show()

In [7]:
# --- Imports, data loading and 'type' cleaning ---
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import plotly.express as px

# Load data: re-read the raw CSV, replacing any `df` built by earlier cells
df = pd.read_csv('parfums.csv')
def clean_type(row):
    """Return the cleaned perfume type for one DataFrame row.

    The row is only read, never modified: changes made to the row object
    handed out by ``DataFrame.apply`` do not reach the DataFrame, so a year
    embedded in the raw type cannot be stored from here. It has to be written
    to the frame by the caller.

    Args:
        row: One DataFrame row (as passed by ``df.apply(..., axis=1)``) with
            a 'type' entry.

    Returns:
        str: the type with surrounding whitespace and one trailing '-'
        removed, or 'Inconnu' when the raw value is missing, empty after
        cleaning, or starts with '-' (i.e. holds a year instead of a type).
    """
    raw_type = row['type']

    # Missing values would otherwise be stringified to the literal 'nan'.
    if pd.isna(raw_type):
        return 'Inconnu'

    type_value = str(raw_type).strip()

    # Remove a single trailing '-' separator.
    if type_value.endswith('-'):
        type_value = type_value[:-1].strip()

    # A leading '-' means the type part is absent (e.g. '- 2019'); an empty
    # remainder (e.g. a bare '-') carries no type information either.
    if not type_value or type_value.startswith('-'):
        return 'Inconnu'

    return type_value

# A raw 'type' such as '- 2019' holds the year instead of a type. clean_type
# only returns the cleaned type, and changes made to a row inside
# DataFrame.apply never reach df, so copy that 4-digit year into 'year' here,
# before the raw 'type' column is overwritten.
# NOTE(review): the year is stored as text, as the original `str(year)` did;
# confirm this matches the dtype of the existing 'year' column.
_year_in_type = df['type'].astype(str).str.extract(r'^\s*-\s*(\d{4})\s*-?\s*$', expand=False)
df['year'] = _year_in_type.where(_year_in_type.notna(), df['year'])

# Apply the cleaning function to the 'type' column
df['type'] = df.apply(clean_type, axis=1)

In [8]:
def multi_hot_crosstab(df, row_col, multi_col, sep=';'):
    """Count, per value of ``row_col``, how many rows contain each token of ``multi_col``.

    Args:
        df: DataFrame holding both columns.
        row_col: Name of the column whose values become the rows of the result.
        multi_col: Name of a string column holding several tokens per cell,
            joined by ``sep``.
        sep: Token separator used inside ``multi_col`` (default ';').

    Returns:
        DataFrame indexed by the distinct ``row_col`` values, with one column
        per distinct token and the number of matching rows as values.
    """
    # get_dummies yields one indicator column per token; a token repeated in
    # the same cell still counts once. astype(int) keeps the indicators
    # summable integers without the deprecated element-wise applymap.
    indicators = df[multi_col].str.get_dummies(sep=sep).astype(int)
    indicators.index = df[row_col]
    return indicators.groupby(indicators.index).sum()

def describe_heatmap(pivot_df, row_label, col_label, n_most_common=3, n_top_cols=3):
    """
    Prints a markdown-style summary of the most common row-column combinations and top columns overall in a heatmap DataFrame.
    Args:
        pivot_df: DataFrame (rows x columns) with counts
        row_label: str, label for rows (e.g. 'Family')
        col_label: str, label for columns (e.g. 'Facette')
        n_most_common: int, number of most common combinations to show
        n_top_cols: int, number of top columns overall to show
    Returns:
        None; the summary is written to stdout.
    """
    # Materialise the value array once instead of once per cell.
    counts = pivot_df.values
    combos = [
        (row_key, col_key, counts[i, j])
        for i, row_key in enumerate(pivot_df.index)
        for j, col_key in enumerate(pivot_df.columns)
        if counts[i, j] > 0
    ]
    # Highest count first; ties are ordered by row label, then column label.
    combos.sort(key=lambda combo: (-combo[2], str(combo[0]), str(combo[1])))

    print(f"#### Analysis: {row_label} vs {col_label}")
    print(f"The {n_most_common} most frequent {row_label.lower()}–{col_label.lower()} combinations:")
    for row_key, col_key, count in combos[:n_most_common]:
        print(f"- {row_label}: {row_key}, {col_label}: {col_key} (Count: {count})")

    top_cols = pivot_df.sum(axis=0).sort_values(ascending=False).head(n_top_cols)
    # Column labels are not guaranteed to be strings, so convert before joining.
    top_names = ', '.join(map(str, top_cols.index))
    print(f"Top {n_top_cols} {col_label.lower()}s across all {row_label.lower()}s: {top_names}")

In [9]:

# Identity mapping of the gender labels used in the 'gender' column
gender_map = {'Homme': 'Homme', 'Femme': 'Femme', 'Mixte': 'Mixte'}

# Full copy of the data plus one filtered view per gender label
df_all = df.copy()
df_homme, df_femme, df_mixte = (
    df[df['gender'] == label] for label in ('Homme', 'Femme', 'Mixte')
)

# (display name, DataFrame) pairs, in plotting order
gender_dfs = list(zip(
    ('All', 'Homme', 'Femme', 'Mixte'),
    (df_all, df_homme, df_femme, df_mixte),
))

In [10]:
def plot_interactive_heatmap(pivot_df, row_label, col_label, title, colorscale='YlGnBu'):
    """Show an annotated plotly heatmap of a table of values.

    Args:
        pivot_df: DataFrame whose values are drawn as cells; its columns give
            the x labels and its index the y labels.
        row_label: Currently unused by the body.
        col_label: Currently unused by the body.
        title: Text of the figure title.
        colorscale: Colorscale forwarded to plotly (default 'YlGnBu').

    Returns:
        None; the figure is displayed with ``fig.show()``.
    """
    cell_values = pivot_df.values
    heatmap_options = dict(
        z=cell_values,
        x=list(pivot_df.columns),
        y=list(pivot_df.index),
        colorscale=colorscale,
        showscale=True,
        # Each cell is annotated with its own value rendered as text.
        annotation_text=cell_values.astype(str),
    )
    fig = ff.create_annotated_heatmap(**heatmap_options)
    fig.update_layout(title_text=title, title_font_size=22)
    fig.show()


# Names of the colorscales known to plotly express; plot_ds_analysis picks one at random.
named_colorscales = px.colors.named_colorscales()

In [11]:
def plot_ds_analysis(gender_name, gender_df, colorscale=None):
    """Plot and summarise 'family' against facettes and the three note columns.

    For each of the columns 'facettes', 'notes_tete', 'notes_coeur' and
    'notes_fond', a heading is printed, the family-by-token count table is
    built, shown as an interactive heatmap and summarised in text.

    Args:
        gender_name: Label used as the prefix of every figure title.
        gender_df: DataFrame with a 'family' column and the four columns above.
        colorscale: Optional plotly colorscale name shared by all four
            heatmaps. When None (default), one is drawn at random from
            ``named_colorscales``; pass a name for reproducible output.

    Returns:
        None; output is printed and figures are displayed.
    """
    if colorscale is None:
        colorscale = np.random.choice(named_colorscales)

    # (source column, singular label used in the summary, plural used in headings)
    sections = [
        ('facettes', 'Facette', 'Facettes'),
        ('notes_tete', 'Top Note', 'Top Notes'),
        ('notes_coeur', 'Heart Note', 'Heart Notes'),
        ('notes_fond', 'Base Note', 'Base Notes'),
    ]

    for position, (column, col_label, plural) in enumerate(sections):
        heading = f"\n### Family vs {plural} (Interactive Heatmap)"
        # Only the first heading reports which colorscale is in use.
        if position == 0:
            heading += f" with colorscale: {colorscale}"
        print(heading)

        pivot = multi_hot_crosstab(gender_df, 'family', column)
        plot_interactive_heatmap(
            pivot, 'Family', col_label,
            f'{gender_name}: Family vs {plural}',
            colorscale=colorscale,
        )
        describe_heatmap(pivot, 'Family', col_label, n_most_common=3, n_top_cols=3)

In [None]:
# Heatmaps and text summaries for the whole dataset.
plot_ds_analysis('All', df_all)

In [None]:
# Heatmaps and text summaries for the rows whose gender is 'Homme'.
plot_ds_analysis('Hommes', df_homme)

In [None]:
# Heatmaps and text summaries for the rows whose gender is 'Femme'.
plot_ds_analysis('Femmes', df_femme)

In [None]:
# Heatmaps and text summaries for the rows whose gender is 'Mixte'.
plot_ds_analysis('Mixte', df_mixte)