In [1]:
# Example: Naive Bayes for retail sales (binary high_value prediction)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)

# --- synthetic example dataset (replace with your retail data) ---
# Seed NumPy's global RNG so the draws below are reproducible. The draws
# happen in a fixed order (age, recency, category, channel, noise); changing
# that order changes the generated data.
np.random.seed(0)
n = 2000

feature_columns = {
    # randint's upper bound is exclusive, so ages fall in 18..69
    "customer_age": np.random.randint(low=18, high=70, size=n),
    "days_since_last_visit": np.random.poisson(lam=30, size=n),
    "product_category": np.random.choice(["A", "B", "C", "D"], size=n),
    "channel": np.random.choice(["web", "app", "store"], size=n),
}
df = pd.DataFrame(feature_columns)

# target: simulated order value = base + age effect + Gaussian noise
#         - recency penalty + a flat bonus for product category "A"
age_effect = 50 + df["customer_age"] * 0.5
noise = np.random.normal(loc=0, scale=20, size=n)
recency_penalty = 0.2 * df["days_since_last_visit"]
category_a_bonus = (df["product_category"] == "A") * 30
df["order_value"] = age_effect + noise - recency_penalty + category_a_bonus

# An order counts as "high value" when its simulated value is at least 100.
df["high_value"] = df["order_value"].ge(100).astype(int)

# Model inputs are the four raw feature columns; order_value itself is kept
# out of X because the label is derived directly from it.
X = df[list(feature_columns)]
y = df["high_value"]

# --- preprocessing + classifier: scale numerics, one-hot categoricals, GaussianNB ---
# Column groups routed to the two preprocessing branches below.
numeric_features = ["customer_age", "days_since_last_visit"]
cat_features = ["product_category", "channel"]

# sparse_output=False makes the encoder return a dense array, because
# GaussianNB expects dense numeric inputs.
dense_one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", dense_one_hot, cat_features),
    ]
)

# Preprocessing and classifier are chained in one estimator, so the scaler and
# encoder are fitted only on whatever data the pipeline itself is fitted on.
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("clf", GaussianNB()),
    ]
)

# train-test split: hold out 20% of the rows, stratified on the label so the
# class proportions are preserved in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# fit the full pipeline (preprocessing + GaussianNB) on the training rows only
model.fit(X_train, y_train)

# hard class predictions for the held-out rows
y_pred = model.predict(X_test)

# per-class probabilities; column index 1 is kept as the score for label 1
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# metrics on the held-out test set
# Threshold-based scores all take (y_true, y_pred), so they share one loop.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(label, metric(y_test, y_pred))

# ROC AUC is computed from the predicted probabilities, not the hard labels.
test_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC:", test_auc)

test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", test_confusion)

test_report = classification_report(y_test, y_pred)
print("\nClassification report:\n", test_report)

# cross-validation example (stratified): 5 shuffled folds with a fixed seed,
# scored by ROC AUC on the full dataset
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_scores = cross_val_score(
    model,
    X,
    y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=1,
)

# report the mean and standard deviation of the per-fold scores
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print("5-fold CV ROC AUC:", cv_mean, "+/-", cv_std)

Accuracy: 0.8125
Precision: 0.3956043956043956
Recall: 0.6428571428571429
F1: 0.4897959183673469
ROC AUC: 0.810189991694352
Confusion matrix:
 [[289  55]
 [ 20  36]]

Classification report:
               precision    recall  f1-score   support

           0       0.94      0.84      0.89       344
           1       0.40      0.64      0.49        56

    accuracy                           0.81       400
   macro avg       0.67      0.74      0.69       400
weighted avg       0.86      0.81      0.83       400

5-fold CV ROC AUC: 0.8050162856714639 +/- 0.024610412754549362
