In [None]:
# Add the parent directory to the import path so the project-local `src`
# package (imported below) can be found — presumably the notebook lives one
# level below the directory that contains `src`; verify if the layout changes.
# This must run before the `from src...` import.
import os, sys
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from src.data_preprocessing import fit_transform_preprocess

# Use seaborn's "whitegrid" theme for every figure in this notebook.
sns.set_theme(style="whitegrid")


In [None]:
# Read the raw CSV located under ../data/raw relative to the notebook.
DATA_PATH = os.path.join("..", "data", "raw", "shopping_behavior.csv")
df = pd.read_csv(DATA_PATH)

# Project helper; presumably returns the feature matrix, the fitted
# preprocessing object and the numeric / categorical column lists
# (confirm against src.data_preprocessing).
X, preprocess, num_cols, cat_cols = fit_transform_preprocess(df)

# Set this to the best_k chosen in the previous notebook.
best_k = 4

# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(X)
df["cluster"] = cluster_labels

df.head()


In [None]:
# Numeric columns of df, leaving out the cluster label itself.
# NOTE: this rebinds `num_cols`, which was previously returned by
# fit_transform_preprocess.
num_cols = [
    name
    for name in df.select_dtypes(include="number").columns.tolist()
    if name != "cluster"
]

# Per-cluster mean of every numeric column, rounded to two decimals.
cluster_means = df.groupby("cluster")[num_cols].mean()
profile_num = cluster_means.round(2)
display(profile_num)


In [None]:
# Try a few commonly used column names and keep only those present in df.
candidate_cols = ["Purchase Amount (USD)", "Previous Purchases", "Review Rating", "Age"]
use_cols = [name for name in candidate_cols if name in df.columns]

# One small bar chart per available column: mean value within each cluster.
for metric in use_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(data=df, x="cluster", y=metric, estimator=np.mean, ax=ax)
    ax.set_title(f"Mean {metric} by cluster")
    fig.tight_layout()
    plt.show()


In [None]:
def top_categories_by_cluster(col, topn=5, data=None):
    """Return the most frequent values of ``col`` within each cluster.

    Parameters
    ----------
    col : str
        Name of the column whose values are counted.
    topn : int, default 5
        Number of most frequent values to keep per cluster.
    data : pandas.DataFrame, optional
        Frame to analyse; it must contain a ``"cluster"`` column and ``col``.
        When omitted, the notebook-level ``df`` is used, which keeps existing
        calls of the form ``top_categories_by_cluster(col, topn)`` working.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns ``cluster``, ``col`` and ``count``,
        holding up to ``topn`` rows per cluster.
    """
    frame = df if data is None else data

    # value_counts on a grouped column sorts by frequency (descending) inside
    # each cluster, so taking the head of every cluster yields its top values.
    counts = frame.groupby("cluster")[col].value_counts()
    top_counts = counts.groupby(level=0).head(topn)
    return top_counts.reset_index(name="count")

# Candidate categorical columns in priority order; the first one that exists
# in df is the one that gets profiled.
cat_candidates = ["Category", "Item Purchased", "Payment Method", "Season", "Gender", "Frequency of Purchases"]
cat_example = next((name for name in cat_candidates if name in df.columns), None)

# Single value shared by the query and the plot title so they cannot drift apart.
top_n = 5

print("Using categorical column:", cat_example)
if cat_example is None:
    # None of the guessed names exist in df. Skip this profile instead of
    # failing with an opaque KeyError inside groupby, mirroring how the
    # numeric plots tolerate missing columns.
    print("No candidate categorical column found in df; skipping the categorical profile.")
else:
    top_df = top_categories_by_cluster(cat_example, topn=top_n)
    display(top_df)

    plt.figure(figsize=(8,4))
    sns.barplot(data=top_df, x="count", y=cat_example, hue="cluster")
    plt.title(f"Top-{top_n} {cat_example} by cluster")
    plt.tight_layout()
    plt.show()


In [None]:
# Folder for exported figures (../results/figures); create it, including any
# missing parent folders, if it does not exist yet.
figure_dir_parts = ("..", "results", "figures")
OUT_DIR = os.path.join(*figure_dir_parts)
os.makedirs(OUT_DIR, exist_ok=True)
# The most recent figure can be saved with plt.savefig(...).
