# \[Kaggle\] Perfumes Visualization

SEOYEON CHOI  
2025-09-03

# Reference

-   [data](https://www.kaggle.com/datasets/ayushghawana/perfume-dataset?resource=download)

# Import

In [59]:
import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px

# Data

In [60]:
# Load the perfume CSV, then discard every row whose `target_audience` holds
# the literal text 'Target Audience' (presumably repeated header rows in the
# raw file; confirm against the CSV). The original row labels are kept.
df = (
    pd.read_csv('../../../delete/Perfumes_dataset.csv')
    .query("target_audience!='Target Audience'")
)

-   203번 행: category가 `Fruity Sweet Gourmand`이고 target_audience가 `Unisex`임 (값을 직접 지정)

In [61]:
# Manual correction of the row labelled 203: overwrite both fields in place.
for column, value in {
    "category": "Fruity Sweet Gourmand",
    "target_audience": "Unisex",
}.items():
    df.at[203, column] = value

-   성별 통일

In [62]:
# Collapse the audience label variants onto three canonical values.
# Labels that are not keys of this table become NaN after `.map`.
mapping = {
    **dict.fromkeys(["Female", "Women"], "Female"),
    **dict.fromkeys(["Male", "Men"], "Male"),
    "Unisex": "Unisex",
}

df["target_audience"] = df["target_audience"].map(mapping)

-   category 단어 구별

In [63]:
# Break `category` on single spaces; the first three words become
# `cat1`, `cat2` and `cat3` (missing words are left empty by the split).
split_cols = df["category"].str.split(" ", expand=True)
for position, column in enumerate(("cat1", "cat2", "cat3")):
    df[column] = split_cols[position]

-   longevity 통일

In [64]:
# Canonical longevity levels. Keys are whitespace-free: the raw label is
# stripped before lookup, so variants such as "Strong " or "Light–Medium "
# no longer need to be enumerated one by one (and un-enumerated variants
# such as "Very Strong " are no longer silently turned into NaN).
mapping = {
    "Strong": "Strong",
    "Medium": "Medium",
    "Medium–Strong": "Medium–Strong",
    "Very Strong": "Very Strong",
    "Light": "Light",
    "Light–Medium": "Light–Medium",
    "6–8 hours": "Light–Medium",
}

# Keep only the text before the first ":" in `longevity`, trim surrounding
# whitespace, then translate it to a canonical level. Labels that are still
# unknown after trimming become NaN.
df["longevity_level"] = (
    df["longevity"].str.split(":", expand=True)[0].str.strip().map(mapping)
)
df.head()

-   type 통일

In [65]:
# List the distinct raw values of the `type` column.
df["type"].unique()

array(['edp', 'edt', 'parfum', 'EDP', 'Extrait de Parfum', 'Parfum',
       'EDT', 'Extrait', 'Cologne', 'Alcohol-free', 'Attar',
       'Concentrate', 'Oil'], dtype=object)

| Raw 값 예시                                   | 표준화 값                 | 설명                                                              |
|------------------------------|-----------------|-------------------------|
| `edp`, `EDP`                                  | **EDP (Eau de Parfum)**   | 가장 흔한 형태 (15–20%, 5–8시간 지속)                             |
| `edt`, `EDT`                                  | **EDT (Eau de Toilette)** | 가볍고 산뜻 (5–15%, 3–5시간)                                      |
| `parfum`, `Parfum`                            | **Parfum**                | 진하고 오래감 (20–30%, 8–12시간)                                  |
| `Extrait de Parfum`, `Extrait`                | **Extrait**               | 최고 농도 (30% 이상, 12시간+)                                     |
| `Cologne`                                     | **EDC (Eau de Cologne)**  | 가볍고 빠르게 사라짐 (2–5%, 1–3시간)                              |
| `Alcohol-free`, `Oil`, `Attar`, `Concentrate` | **Oil/Attar**             | 알코올 없는 오일 기반 향수, 지속력 강하지만 확산력은 약할 수 있음 |

In [67]:
# Each standardized type together with the raw spellings that fold into it.
raw_labels_by_type = {
    "EDP": ("edp", "EDP"),
    "EDT": ("edt", "EDT"),
    "Parfum": ("parfum", "Parfum"),
    "Extrait": ("Extrait de Parfum", "Extrait"),
    "EDC": ("Cologne",),
    "Oil/Attar": ("Alcohol-free", "Attar", "Concentrate", "Oil"),
}

# Invert the table into a raw-label -> standardized-label lookup.
mapping = {
    raw: standard
    for standard, raws in raw_labels_by_type.items()
    for raw in raws
}

# Raw values absent from the lookup become NaN.
df["type_standardized"] = df["type"].map(mapping)
df.head()

-   brand pie chart를 plotly로 만들자
-   target_audience pie chart를 plotly로 만들자
-   category별 target 구분
-   type별 longevity 구분

# Brand 상위 20 Ratio

-   개수 카운트

In [68]:
# Number of rows per brand, most frequent first.
brand_counts = df["brand"].value_counts()
# Preview the five most frequent brands.
brand_counts.head(5)

Jean Paul Gaultier    94
paris corner          77
armaf                 70
Al Haramain           43
fragrance world       42
Name: brand, dtype: int64

-   상위 20개

In [69]:
# Labels of the 20 brands with the highest row counts.
top20_brands = brand_counts.nlargest(n=20).index
top20_brands

Index(['Jean Paul Gaultier', 'paris corner', 'armaf', 'Al Haramain',
       'fragrance world', 'Lattafa', 'Azzaro', 'Hugo Boss', 'Giorgio Armani',
       'Afnan', 'Dior', 'Ajmal', 'Hermès', 'Prada', 'Maison Alhambra',
       'Louis Vuitton', 'Creed', 'Victoria's Secret', 'Carolina Herrera',
       'Dolce & Gabbana'],
      dtype='object')

-   재그룹화된 열 합치기

In [70]:
# Keep the brand name for the top-20 brands; every other row is
# relabelled "Others".
is_top_brand = df["brand"].isin(top20_brands)
df["brand_grouped"] = df["brand"].mask(~is_top_brand, "Others")
df.head()

-   pie chart 만들기 위한 데이터셋

In [71]:
# Two-column frame (brand, count) feeding the pie chart.
brand_grouped_counts = (
    df["brand_grouped"]
    .value_counts()
    .rename_axis("brand")
    .reset_index(name="count")
)

In [72]:
# Pie chart of row share per brand group (top 20 plus "Others").
fig = px.pie(
    data_frame=brand_grouped_counts,
    title="Top 20 Brands (Others grouped)",
    names="brand",
    values="count",
    template='seaborn',
    width=600,
    height=600,
)
fig.show()

-   브랜드는 고르게 분포

# 성별 비율

In [73]:
# Two-column frame (target_audience, count) feeding the pie chart.
target_audience_counts = (
    df["target_audience"]
    .value_counts()
    .rename_axis("target_audience")
    .reset_index(name="count")
)
# NOTE: a filter dropping audiences that occur only once was tried here and
# left disabled: target_audience_counts.query('count!=1')

In [74]:
# Pie chart of row share per target audience.
fig = px.pie(
    data_frame=target_audience_counts,
    title="target_audience",
    names="target_audience",
    values="count",
    template='seaborn',
    width=600,
    height=600,
)
fig.show()

-   세 그룹(Unisex, Female, Male)의 비율이 대체로 비슷함

# category 별 타겟 구분

## 1st

In [75]:
# Ten most frequent first category words.
df["cat1"].value_counts().nlargest(n=10)

Woody         260
Floriental    136
Oriental      114
Amber          96
Floral         77
Fresh          61
Aromatic       38
Unknown        38
Citrus         33
Fruity         33
Name: cat1, dtype: int64

-   상위 5개

In [76]:
# Labels of the five most frequent first category words.
top5_cat1 = df["cat1"].value_counts().nlargest(n=5).index
top5_cat1

Index(['Woody', 'Floriental', 'Oriental', 'Amber', 'Floral'], dtype='object')

-   데이터셋에서 그 5개에 해당하는 것만 가져오기

In [77]:
# Rows whose first category word is one of the top five.
df_top5 = df.loc[df["cat1"].isin(top5_cat1)]

-   집계

In [78]:
# Row count for every (target_audience, cat1) pair present in the data.
counts = (
    df_top5
    .groupby(["target_audience", "cat1"])
    .size()
    .reset_index(name="count")
)

-   bar chart

In [79]:
# Grouped bars: one cluster per audience, one bar per first category word.
fig = px.bar(
    data_frame=counts,
    title="Top 5 category(1st) by Target Audience",
    x="target_audience",
    y="count",
    color="cat1",
    barmode="group",
    template='seaborn',
    width=1000,
    height=600,
)
fig.show()

-   남성은 woody향을 내세우고
-   여성은 floriental 향을 내세우는군
-   두 개는 성별로 약간 극단적

## 2nd

In [80]:
# Ten most frequent second category words.
df["cat2"].value_counts().nlargest(n=10)

Spicy       200
Floral      174
Aromatic     94
Fruity       78
Woody        44
Fougere      34
Scent        25
Oriental     25
Aquatic      20
Gourmand     16
Name: cat2, dtype: int64

-   상위 5개

In [81]:
# Labels of the five most frequent second category words.
top5_cat2 = df["cat2"].value_counts().nlargest(n=5).index
top5_cat2

Index(['Spicy', 'Floral', 'Aromatic', 'Fruity', 'Woody'], dtype='object')

-   데이터셋에서 그 5개에 해당하는 것만 가져오기

In [82]:
# Rows whose second category word is one of the top five.
df_top5 = df.loc[df["cat2"].isin(top5_cat2)]

-   집계

In [83]:
# Row count for every (target_audience, cat2) pair present in the data.
counts = (
    df_top5
    .groupby(["target_audience", "cat2"])
    .size()
    .reset_index(name="count")
)

-   bar chart

In [84]:
# Grouped bars: one cluster per audience, one bar per second category word.
fig = px.bar(
    data_frame=counts,
    title="Top 5 category(2nd) by Target Audience",
    x="target_audience",
    y="count",
    color="cat2",
    barmode="group",
    template='seaborn',
    width=1000,
    height=600,
)
fig.show()

-   남성은 spicy향 아주 내세우고
-   여성은 floral향
-   이것도 성별로 극단적

# type 별 구분

In [128]:
# Row count per standardized perfume type.
df["type_standardized"].value_counts()

EDP          764
EDT          161
Parfum        39
Extrait       21
EDC           11
Oil/Attar      7
Name: type_standardized, dtype: int64

In [119]:
# Longevity levels listed from weakest to strongest; used to order the
# legend and the stacked segments.
order = [
    "Light",
    "Light–Medium",
    "Medium",
    "Medium–Strong",
    "Strong",
    "Very Strong",
]

# Row count for every (type_standardized, longevity_level) pair.
by_type = (
    df
    .groupby(["type_standardized", "longevity_level"])
    .size()
    .reset_index(name="count")
)

# Types sorted by their total row count, largest first (x-axis order).
type_order = (
    by_type
    .groupby("type_standardized")["count"]
    .sum()
    .sort_values(ascending=False)
    .index
)

In [116]:
# Fixed colour per longevity level so every chart uses the same palette.
custom_colors = dict(
    zip(
        ("Light", "Light–Medium", "Medium", "Medium–Strong", "Strong", "Very Strong"),
        ("#a6cee3", "#1f78b4", "#33a02c", "#fb9a99", "#e31a1c", "#6a3d9a"),
    )
)

# Stacked bars: one bar per type, one segment per longevity level.
fig = px.bar(
    data_frame=by_type,
    title="Longevity Levels by Type (counts)",
    x="type_standardized",
    y="count",
    color="longevity_level",
    barmode="stack",
    category_orders={
        "type_standardized": list(type_order),
        "longevity_level": order,
    },
    color_discrete_map=custom_colors,  # explicit level -> colour mapping
    template='seaborn',
    width=1000,
    height=800,
)

fig.show()

| Raw 값 예시                                   | 표준화 값                 | 설명                                                              |
|------------------------------|-----------------|-------------------------|
| `edp`, `EDP`                                  | **EDP (Eau de Parfum)**   | 가장 흔한 형태 (15–20%, 5–8시간 지속)                             |
| `edt`, `EDT`                                  | **EDT (Eau de Toilette)** | 가볍고 산뜻 (5–15%, 3–5시간)                                      |
| `parfum`, `Parfum`                            | **Parfum**                | 진하고 오래감 (20–30%, 8–12시간)                                  |
| `Extrait de Parfum`, `Extrait`                | **Extrait**               | 최고 농도 (30% 이상, 12시간+)                                     |
| `Cologne`                                     | **EDC (Eau de Cologne)**  | 가볍고 빠르게 사라짐 (2–5%, 1–3시간)                              |
| `Alcohol-free`, `Oil`, `Attar`, `Concentrate` | **Oil/Attar**             | 알코올 없는 오일 기반 향수, 지속력 강하지만 확산력은 약할 수 있음 |

-   EDP만

In [124]:
# Longevity breakdown restricted to the EDP rows of `by_type`.
edp_rows = by_type.query("type_standardized == 'EDP'")
fig = px.pie(
    data_frame=edp_rows,
    title="EDP",
    names="longevity_level",
    values="count",
    color="longevity_level",            # column that drives the colours
    color_discrete_map=custom_colors,   # level -> colour mapping
    template='seaborn',
    width=600,
    height=600,
)
fig.show()


-   edp는 medium이 많고 그다음 strong

# 성별 longevity level

In [129]:
# Row count per target audience.
df["target_audience"].value_counts()

Unisex    376
Female    331
Male      296
Name: target_audience, dtype: int64

In [125]:
# Aggregate: row count for every (target_audience, longevity_level) pair.
by_aud = (
    df
    .groupby(["target_audience", "longevity_level"])
    .size()
    .reset_index(name="count")
)

# Share of each level within its audience, as a percentage of that
# audience's total count.
audience_totals = by_aud.groupby("target_audience")["count"].transform("sum")
by_aud["percent"] = by_aud["count"] / audience_totals * 100

# Stacked bar chart drawn from the percentages rather than raw counts.
fig = px.bar(
    data_frame=by_aud,
    title="Longevity distribution by Target Audience (%)",
    x="target_audience",
    y="percent",
    color="longevity_level",
    barmode="stack",
    category_orders={
        "longevity_level": [
            "Light",
            "Light–Medium",
            "Medium",
            "Medium–Strong",
            "Strong",
            "Very Strong",
        ]
    },
    color_discrete_map=custom_colors,
    template='seaborn',
    width=800,
    height=600,
)
fig.show()
