In [None]:
import os, sys


def _find_project_root(start, marker="src"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Walks from ``start`` towards the filesystem root and returns the first
    directory that has a sub-directory named ``marker``. Returns ``None`` when
    no such directory exists.
    """
    current = os.path.abspath(start)
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # dirname() is a fixed point at the filesystem root
            return None
        current = parent


# Resolve the repository root from wherever the kernel was started instead of
# assuming the working directory is exactly one level below it; the blind
# ".." assumption is what produces "No module named 'src'" when the notebook
# is executed from the repo root or a deeper folder. Falls back to the
# original ".." behaviour when no `src` directory is found.
PROJECT_ROOT = _find_project_root(os.getcwd()) or os.path.abspath("..")
if PROJECT_ROOT not in sys.path:  # keep re-runs of this cell idempotent
    sys.path.append(PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

from src.data_preprocessing import fit_transform_preprocess

sns.set_theme(style="whitegrid")

# Anchor the data path on the resolved root so it does not depend on the
# kernel's working directory either.
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "shopping_behavior.csv")
df = pd.read_csv(DATA_PATH)

# NOTE(review): judging by the unpacked names this returns the feature matrix,
# the fitted preprocessor and the numeric/categorical column lists - confirm
# against src.data_preprocessing.
X, preprocess, num_cols, cat_cols = fit_transform_preprocess(df)
print("X shape:", X.shape)


ModuleNotFoundError: No module named 'src'

In [None]:
# Candidate cluster counts; each one is scored by the silhouette of the
# KMeans partition it produces on X.
ks = range(2, 11)
scores = [
    silhouette_score(
        X,
        KMeans(n_clusters=n_groups, random_state=42, n_init="auto").fit_predict(X),
    )
    for n_groups in ks
]

fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(list(ks), scores, marker="o")
ax.set_xlabel("k")
ax.set_ylabel("Silhouette score")
ax.set_title("KMeans: Silhouette by k")
fig.tight_layout()
plt.show()

# argmax returns the first maximum, so a tie resolves to the smallest k.
best_k = ks[int(np.argmax(scores))]
print("Best k:", best_k, "best silhouette:", max(scores))


In [None]:
# Refit KMeans at the selected k and keep the per-sample cluster labels.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
k_labels = kmeans.fit_predict(X)

# Project onto two principal components for plotting. X is densified first
# when it exposes .toarray() (as scipy sparse matrices do).
pca = PCA(n_components=2, random_state=42)
if hasattr(X, "toarray"):
    X2 = pca.fit_transform(X.toarray())
else:
    X2 = pca.fit_transform(X)

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(X2[:, 0], X2[:, 1], c=k_labels, s=12)
ax.set_title(f"KMeans clusters (k={best_k}) on PCA-2D")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.tight_layout()
plt.show()


In [None]:
db = DBSCAN(eps=0.8, min_samples=8)
db_labels = db.fit_predict(X)

# DBSCAN marks noise with the label -1; every other label is a cluster id.
mask = db_labels != -1
n_clusters = len(set(db_labels[mask]))
n_noise = (~mask).sum()
print("DBSCAN clusters:", n_clusters, "noise points:", n_noise)

# Silhouette can only be computed when there are >= 2 clusters (noise excluded).
if n_clusters < 2:
    print("DBSCAN silhouette: not enough clusters to compute")
else:
    db_sil = silhouette_score(X[mask], db_labels[mask])
    print("DBSCAN silhouette (without noise):", db_sil)


In [None]:
# Directory for exported figures, relative to the working directory; it is
# created (with any missing parents) if absent and left alone if it exists.
_figure_dir_parts = ("..", "results", "figures")
OUT_DIR = os.path.join(*_figure_dir_parts)
os.makedirs(name=OUT_DIR, exist_ok=True)
