In [None]:
import os, sys, json
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
import joblib

from src.data_preprocessing import build_preprocess_pipeline

# Plot styling shared by every figure in this notebook.
sns.set_theme(style="whitegrid")

# The raw CSV is addressed relative to the notebook's working directory
# (one level up, then data/raw/).
DATA_PATH = os.path.join(os.pardir, "data", "raw", "shopping_behavior.csv")
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Trailing expression: shows (n_rows, n_columns) as the cell output.
df.shape


In [None]:
# Work on a copy so that adding the label column does not touch the object
# returned by read_csv.
df = df.copy()

# Normalise the raw label (cast to str, trim whitespace, lower-case) and map
# it to a binary target: "yes" -> 1, "no" -> 0. Anything else maps to NaN.
_status_norm = df["Subscription Status"].astype(str).str.strip().str.lower()
df["target"] = _status_norm.map({"yes": 1, "no": 0})

# Sanity check: class counts, with unmapped labels (NaN) shown explicitly.
print(df["target"].value_counts(dropna=False))

# Fail fast on labels outside {"yes", "no"} instead of carrying NaN targets
# into the split / model fitting, where the error would be far less clear.
_unmapped = df["target"].isna()
if _unmapped.any():
    _bad_values = sorted(df.loc[_unmapped, "Subscription Status"].astype(str).unique())
    raise ValueError(
        f"{int(_unmapped.sum())} row(s) have an unexpected 'Subscription Status' value: {_bad_values}"
    )

# With no NaN left the column can be a plain integer label.
df["target"] = df["target"].astype("int64")


In [None]:
# Feature matrix: everything except the label. "Subscription Status" is the raw
# column that "target" was derived from, so it must be removed together with
# "target" -- leaving it in the features would leak the target.
X_df = df.drop(columns=["target", "Subscription Status"])
y = df["target"]

# NOTE(review): the helper is called on the full frame before the split --
# presumably it only inspects columns/dtypes and returns an unfitted
# transformer plus the numeric / categorical column lists; confirm it does
# not fit anything on X_df.
preprocess, num_cols, cat_cols = build_preprocess_pipeline(X_df)

# 80/20 hold-out split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train:", X_train.shape, "Test:", X_test.shape)


In [None]:
# Random-forest baseline: preprocessing followed by a 300-tree forest with
# balanced class weights, seeded, using all available cores.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,
)
rf = Pipeline(steps=[("preprocess", preprocess), ("model", rf_clf)])
rf.fit(X_train, y_train)

# Hard labels for the report / confusion matrix, and the probability of the
# second class (column 1 of predict_proba) for ROC-AUC.
rf_pred = rf.predict(X_test)
rf_proba = rf.predict_proba(X_test)[:, 1]

print("RandomForest report:\n", classification_report(y_test, rf_pred))
print("RF ROC-AUC:", roc_auc_score(y_test, rf_proba))

# Confusion matrix on the hold-out set, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, rf_pred)
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_title("RF Confusion Matrix")
ax.set_xlabel("Pred")
ax.set_ylabel("True")
fig.tight_layout()
plt.show()

# ROC curve built directly from the predicted probabilities.
rf_roc = RocCurveDisplay.from_predictions(y_test, rf_proba)
rf_roc.ax_.set_title("RF ROC curve")
rf_roc.figure_.tight_layout()
plt.show()


In [None]:
# SVM baseline: the same preprocessing recipe followed by an RBF-kernel SVC.
#
# Pipeline does not copy its steps, so passing `preprocess` directly would make
# this pipeline share -- and re-fit in place -- the very transformer object that
# is already a step of `rf`. clone() gives the SVM its own unfitted copy with
# identical parameters, keeping the two fitted pipelines independent.
svm = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", SVC(
        C=2.0,
        kernel="rbf",
        probability=True,  # needed for predict_proba below
        class_weight="balanced",
        random_state=42
    ))
])

svm.fit(X_train, y_train)

# Hard labels for the report, and the probability of the second class
# (column 1 of predict_proba) for ROC-AUC.
svm_pred = svm.predict(X_test)
svm_proba = svm.predict_proba(X_test)[:, 1]

print("SVM report:\n", classification_report(y_test, svm_pred))
print("SVM ROC-AUC:", roc_auc_score(y_test, svm_proba))


In [None]:
# Output locations, relative to the notebook's working directory.
OUT_FIG = os.path.join("..", "results", "figures")
OUT_REP = os.path.join("..", "results", "reports")
OUT_MOD = os.path.join("..", "models")

for out_dir in (OUT_FIG, OUT_REP, OUT_MOD):
    os.makedirs(out_dir, exist_ok=True)

rf_auc = roc_auc_score(y_test, rf_proba)
svm_auc = roc_auc_score(y_test, svm_proba)

# Keep the pipeline with the higher hold-out ROC-AUC; a tie goes to the forest.
# NOTE(review): the winner is picked on the same test set that is reported, so
# best_auc is an optimistic estimate of the selected model's performance.
if rf_auc >= svm_auc:
    best_name, best_model, best_auc, best_proba = "RF", rf, rf_auc, rf_proba
else:
    best_name, best_model, best_auc, best_proba = "SVM", svm, svm_auc, svm_proba
print("Best:", best_name, best_auc)

# Persist the whole fitted pipeline (preprocessing + estimator).
model_path = os.path.join(OUT_MOD, "best_model.joblib")
joblib.dump(best_model, model_path)

# Plain Python floats so the values are JSON-serialisable.
metrics = {
    "rf_auc": float(rf_auc),
    "svm_auc": float(svm_auc),
    "best": best_name,
    "best_auc": float(best_auc),
}
metrics_path = os.path.join(OUT_REP, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

# ROC curve of the winner is written to disk only; the figure is closed
# afterwards so it is not rendered inline.
RocCurveDisplay.from_predictions(y_test, best_proba)
plt.title(f"Best ROC curve: {best_name}")
plt.tight_layout()
roc_path = os.path.join(OUT_FIG, "best_roc_curve.png")
plt.savefig(roc_path, dpi=150)
plt.close()

print("Saved: models/best_model.joblib, results/reports/metrics.json, results/figures/best_roc_curve.png")


In [None]:
# Permutation importance of the selected pipeline on the hold-out set. The
# whole pipeline is passed in, so it is the raw input columns of X_test that
# get shuffled (8 repeats per column, seeded, all cores).
#
# scoring="roc_auc" keeps the importance metric consistent with the ROC-AUC
# used to choose best_model; with the default scoring=None the estimator's own
# score method would be used instead.
r = permutation_importance(
    best_model,
    X_test,
    y_test,
    scoring="roc_auc",
    n_repeats=8,
    random_state=42,
    n_jobs=-1,
)

# Mean importance per input column, largest first, top 12 only.
imp = pd.Series(r.importances_mean, index=X_test.columns).sort_values(ascending=False).head(12)

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=imp.values, y=imp.index, ax=ax)
ax.set_title("Top-12 feature importance (permutation)")
# Axis labels so the saved figure can be read on its own.
ax.set_xlabel("Mean decrease in ROC-AUC when permuted")
ax.set_ylabel("Feature")
fig.tight_layout()
fig.savefig(os.path.join(OUT_FIG, "feature_importance_top12.png"), dpi=150)
plt.show()
