<a href="https://colab.research.google.com/github/polamsumanth/B3_PFDS_1121/blob/main/ZARA_DATASET_CODE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# ============================
# SALES PREDICTION: RANDOM FOREST (IEEE-ready)
# Dataset: zara.csv
# Target: High Sales (Sales Volume >= median)
# Figures: confusion_matrix.png, roc_curve.png, pr_curve.png, feature_importance.png
# Tables: model_report.txt, feature_importance.csv, cv_scores.csv
# Appendix: preview_head20.csv, describe_all.csv
# ============================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    precision_recall_curve,
    auc,
    accuracy_score
)

# ----------------------------
# 0) Paths / Output directory
# ----------------------------
# NOTE(review): DATA_PATH starts with a doubled slash; a single leading slash
# is the conventional absolute-path form -- confirm this is intentional.
DATA_PATH = "//content/zara.csv"
OUTDIR = "ieee_rf_outputs_zara"
os.makedirs(OUTDIR, exist_ok=True)  # exist_ok: re-running the cell reuses the folder

# ----------------------------
# 1) Load & quick checks
# ----------------------------
# The file is parsed with ';' as the field separator.
df = pd.read_csv(DATA_PATH, sep=";")

# Appendix exports: the first 20 rows and describe() over every column.
preview_path = os.path.join(OUTDIR, "preview_head20.csv")
describe_path = os.path.join(OUTDIR, "describe_all.csv")
df.iloc[:20].to_csv(preview_path, index=False)
df.describe(include="all").to_csv(describe_path)

# ----------------------------
# 2) Target: High Sales
# ----------------------------
# Rows without a sales figure cannot be labelled: `NaN >= median` evaluates to
# False, which would silently file them under class 0 ("low sales"). Drop them
# before building the target. median() already ignores NaN, so the threshold
# itself is unaffected. The trailing copy() keeps the later column assignments
# off a slice of the original frame.
df = df.dropna(subset=["Sales Volume"]).copy()
median_sales = df["Sales Volume"].median()
# HighSales: 1 when Sales Volume is at or above the median, otherwise 0.
df["HighSales"] = (df["Sales Volume"] >= median_sales).astype(int)

# ----------------------------
# 3) Feature Engineering
# ----------------------------
# LogPrice = log(1 + price).
# NOTE(review): "price" and "LogPrice" are monotonic transforms of each other,
# so they presumably carry the same information for a tree model and may split
# importance between them -- confirm both are wanted. Kept as-is so reported
# results do not change.
df["LogPrice"] = np.log1p(df["price"])

# ----------------------------
# 4) Feature Selection
# ----------------------------
requested_features = [
    "Product Position",
    "Promotion",
    "Product Category",
    "Seasonal",
    "price",
    "LogPrice",
    "section",
]

# Best-effort: keep only the columns this dataset actually has, but say which
# requested ones were skipped instead of dropping them without a trace.
features = [f for f in requested_features if f in df.columns]
missing_features = [f for f in requested_features if f not in df.columns]
if missing_features:
    print(f"Note: skipping features not found in the dataset: {missing_features}")

# One-hot encode the categorical columns; drop_first removes one level per
# categorical column.
X = pd.get_dummies(df[features], drop_first=True)
y = df["HighSales"]

# ----------------------------
# 5) Train/Test Split
# ----------------------------
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ----------------------------
# 6) Train Random Forest
# ----------------------------
# Hyper-parameters gathered in one place, then passed through unchanged.
rf_params = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_split": 5,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# ----------------------------
# 7) Evaluation
# ----------------------------
def _save_current_figure(filename):
    """Tighten the layout, save the active figure into OUTDIR at 300 dpi, close it."""
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, filename), dpi=300)
    plt.close()


# Hard class predictions plus the predicted probability of class 1.
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

# Text report: written to disk and echoed to stdout.
report = classification_report(y_test, y_pred, digits=3)
report_path = os.path.join(OUTDIR, "model_report.txt")
with open(report_path, "w") as report_file:
    report_file.write(report)

print(report)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
plt.imshow(cm, interpolation="nearest", cmap="Blues")
plt.title("Confusion Matrix: High vs Low Sales")
plt.xticks([0,1], ["Pred: Low", "Pred: High"])
plt.yticks([0,1], ["Actual: Low", "Actual: High"])
# Write each count into its cell; x is the column index, y is the row index.
for (row, col), count in np.ndenumerate(cm):
    plt.text(col, row, count, ha="center", va="center")
plt.xlabel("Predicted")
plt.ylabel("Actual")
_save_current_figure("confusion_matrix.png")

# ROC Curve, with the diagonal drawn as a dashed reference line
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1], [0,1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve: High Sales Prediction (RF)")
plt.legend(loc="lower right")
_save_current_figure("roc_curve.png")

# Precision-Recall Curve; the area is computed with auc() over (recall, precision)
prec, rec, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(rec, prec)
plt.figure(figsize=(5,5))
plt.plot(rec, prec, label=f"PR AUC = {pr_auc:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve: High Sales Prediction (RF)")
plt.legend(loc="lower left")
_save_current_figure("pr_curve.png")

# ----------------------------
# 8) Cross-Validation
# ----------------------------
# 5-fold stratified CV with shuffling and a fixed seed. It is scored on the
# full X, y, which includes the rows held out for the test split above.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_metrics = {name: name for name in ("roc_auc", "f1", "accuracy")}
cv_scores = cross_validate(rf, X, y, scoring=cv_metrics, cv=cv, n_jobs=-1)

# One row per fold; columns are whatever cross_validate returned.
cv_table = pd.DataFrame(cv_scores)
cv_table.to_csv(os.path.join(OUTDIR, "cv_scores.csv"), index=False)

# ----------------------------
# 9) Feature Importance
# ----------------------------
# Pair each encoded column name with the forest's importance score and rank
# them from most to least important.
importances = rf.feature_importances_
feat_imp = pd.DataFrame({"feature": X.columns, "importance": importances})
feat_imp = feat_imp.sort_values("importance", ascending=False)

importance_csv = os.path.join(OUTDIR, "feature_importance.csv")
feat_imp.to_csv(importance_csv, index=False)

# Plot top 15. The rows are reversed so the largest bar ends up at the top of
# the horizontal bar chart.
top = feat_imp.iloc[:15].iloc[::-1]
fig, ax = plt.subplots(figsize=(7,7))
ax.barh(top["feature"], top["importance"])
ax.set_xlabel("Feature Importance (RF)")
ax.set_title("Top 15 Predictors of High Sales (Random Forest)")
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, "feature_importance.png"), dpi=300)
plt.close(fig)

# ----------------------------
# 10) Compact Summary
# ----------------------------
# Headline hold-out metrics: accuracy is recomputed here, the two AUC values
# are the ones computed for the ROC and precision-recall plots.
summary = dict(
    test_accuracy=accuracy_score(y_test, y_pred),
    test_roc_auc=roc_auc,
    test_pr_auc=pr_auc,
)
print("=== SUMMARY (RF) ===")
# One "name: value" line per metric, three decimals each.
print("\n".join(f"{metric}: {value:.3f}" for metric, value in summary.items()))

print(f"\nArtifacts saved in: ./{OUTDIR}")

              precision    recall  f1-score   support

           0      0.400     0.308     0.348        26
           1      0.419     0.520     0.464        25

    accuracy                          0.412        51
   macro avg      0.410     0.414     0.406        51
weighted avg      0.409     0.412     0.405        51

Accuracy: 0.4117647058823529
=== SUMMARY (RF) ===
test_accuracy: 0.412
test_roc_auc: 0.367
test_pr_auc: 0.393

Artifacts saved in: ./ieee_rf_outputs_zara
