In [10]:
import json
import pickle
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import yaml
from sklearn.linear_model import LogisticRegression

# --- Optional: real LSTM if TensorFlow is available; otherwise fallback still writes .h5 ---
# TF_AVAILABLE records whether the TensorFlow/Keras imports below succeeded.
# train_lstm_and_save() reads it to choose between real training and writing a
# placeholder file.
try:
    import tensorflow as tf  # noqa: F401
    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    TF_AVAILABLE = True
except Exception:
    # Any failure while importing TensorFlow (not only ImportError) is treated
    # as "TensorFlow not usable" instead of aborting the script.
    TF_AVAILABLE = False

# h5py is a hard requirement: without it the script exits here, at import time.
try:
    import h5py  # required to write .h5 even in fallback
except Exception as e:
    raise SystemExit("h5py is required. Install with: pip install h5py") from e


# ---------------------- CONFIG (Windows-safe raw strings) ----------------------
# Raw strings keep the backslashes in these Windows paths literal.
ARTIFACT_DIR = r"C:\Users\sagni\Downloads\Price Sense"
DATA_CSV = r"C:\Users\sagni\Downloads\Price Sense\archive\flipkart_com-ecommerce_sample.csv"

# Key order is kept because the config snapshot is dumped with sort_keys=False.
SETTINGS = dict(
    # --- synthetic price history ---
    seed_days=60,              # synthetic days of price history for demo
    price_noise_frac=0.08,     # ±8% noise
    deal_drop_prob=0.07,       # 7% chance of extra drop
    deal_drop_frac=0.12,       # 12% extra drop on deal days
    # --- dataset size ---
    top_n_products=300,        # limit rows for faster first run
    # --- LSTM hyper-parameters ---
    lstm_sequence_len=7,
    lstm_epochs=5,
    lstm_batch_size=64,
)
# -----------------------------------------------------------------------------


# ---------------------------- Utility functions ------------------------------
def ensure_dir(path: str | Path) -> Path:
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p


def load_csv(csv_path: str | Path) -> pd.DataFrame:
    p = Path(csv_path)
    if not p.exists():
        raise FileNotFoundError(
            f"CSV not found at:\n{p}\n"
            "Tip: verify the folder and file name. If the path has spaces, keep it as a raw string (r'...')."
        )
    try:
        return pd.read_csv(p, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(p, encoding="latin1")


def map_columns(df: pd.DataFrame) -> Dict[str, str | None]:
    cols = {c.lower(): c for c in df.columns}

    def find(name: str):
        for k, v in cols.items():
            if name in k:
                return v
        return None

    return {
        "title": find("product_name") or find("title") or list(df.columns)[0],
        "category": find("product_category_tree") or find("category"),
        "brand": find("brand"),
        "retail_price": find("retail_price") or find("mrp") or find("price"),
        "discounted_price": find("discounted_price") or find("discount_price"),
        "product_id": find("product_id") or find("pid"),
        "rating": find("product_rating") or find("rating"),
        "description": find("description") or find("product_description"),
        "url": find("product_url") or find("url"),
    }


def synthetic_series(base_price: float,
                     days: int,
                     noise_frac: float,
                     deal_prob: float,
                     deal_frac: float) -> np.ndarray:
    """Simulate ``days`` daily prices as a bounded random walk.

    Each day the price moves by a uniform random fraction in
    ``[-noise_frac, noise_frac]`` of the current price; with probability
    ``deal_prob`` it is then additionally cut by ``deal_frac``. The price never
    drops below 1.0 (the start price is clamped the same way).

    Draws from both ``np.random`` and ``random``, so both must be seeded for
    reproducible output.

    Returns:
        A float32 array of length ``days``.
    """
    floor = 1.0

    def walk():
        current = float(max(floor, base_price))
        for _day in range(days):
            # Noise is drawn before the deal check, one draw of each per day.
            current = max(floor, current + np.random.uniform(-noise_frac, noise_frac) * current)
            if random.random() < deal_prob:
                current = max(floor, current * (1.0 - deal_frac))
            yield current

    return np.array(list(walk()), dtype=np.float32)


def make_sequences(series: np.ndarray, seq_len: int) -> Tuple[np.ndarray, np.ndarray]:
    """Slice ``series`` into sliding windows and their next-step targets.

    Window ``i`` is ``series[i:i + seq_len]`` and its target is
    ``series[i + seq_len]``. When ``series`` has ``seq_len`` or fewer elements
    both returned arrays are empty.

    Returns:
        ``(X, y)`` as float32 arrays, one row of ``X`` per element of ``y``.
    """
    starts = range(len(series) - seq_len)
    windows = [series[start:start + seq_len] for start in starts]
    targets = [series[start + seq_len] for start in starts]
    return (
        np.array(windows, dtype=np.float32),
        np.array(targets, dtype=np.float32),
    )


def _write_placeholder_h5(h5_out: Path, note: str) -> None:
    """Write a minimal HDF5 file at ``h5_out`` holding one ``note`` dataset."""
    ensure_dir(h5_out.parent)
    with h5py.File(h5_out, "w") as f:
        # np.bytes_ is used instead of np.string_, which NumPy 2.0 removed.
        f.create_dataset("note", data=np.bytes_(note.encode("ascii")))


def train_lstm_and_save(all_series: List[np.ndarray],
                        seq_len: int,
                        epochs: int,
                        batch_size: int,
                        h5_out: Path) -> Dict:
    """
    Trains a tiny LSTM if TensorFlow is available; otherwise writes a valid .h5 placeholder.

    Each series is divided by its own maximum, cut into ``seq_len``-long windows
    with the following value as target, and all windows are pooled into one
    training set. Series with ``seq_len + 1`` or fewer points are skipped.

    Args:
        all_series: price series, one 1-D array per product.
        seq_len: number of past steps fed to the model per sample.
        epochs: training epochs.
        batch_size: training batch size.
        h5_out: output path of the ``.h5`` file; its parent directory is created.

    Returns:
        A metrics dict. It always has ``"trained"`` and ``"framework"``.
        After successful training it also has ``"epochs"``, ``"final_loss"``
        and ``"final_val_loss"``. When no series is long enough it has
        ``"reason": "no_data"``, and when TensorFlow raised it has ``"error"``.
        In every non-trained case a placeholder ``.h5`` is written instead of
        a model.
    """
    metrics: Dict = {"trained": False, "framework": "tensorflow" if TF_AVAILABLE else "fallback"}

    # Build dataset (normalize each series by its max)
    Xs, ys = [], []
    for s in all_series:
        s = np.asarray(s, dtype=np.float32)
        if len(s) <= seq_len + 1:
            continue
        scale = max(1e-6, float(s.max()))  # guard against division by zero
        s_norm = s / scale
        X, y = make_sequences(s_norm, seq_len)
        Xs.append(X[..., None])   # add feature dim (T, 1)
        ys.append(y)

    if not Xs:
        _write_placeholder_h5(h5_out, "No data to train.")
        metrics.update({"trained": False, "reason": "no_data"})
        return metrics

    X_all = np.vstack(Xs)
    y_all = np.hstack(ys)

    if TF_AVAILABLE:
        try:
            model = Sequential([
                LSTM(32, input_shape=(X_all.shape[1], 1), return_sequences=False),
                Dropout(0.2),
                Dense(16, activation="relu"),
                Dense(1),
            ])
            model.compile(optimizer="adam", loss="mse")
            hist = model.fit(
                X_all, y_all,
                epochs=int(epochs),
                batch_size=int(batch_size),
                verbose=0,
                validation_split=0.2,
                shuffle=True,
            )
            ensure_dir(h5_out.parent)
            model.save(h5_out)
            metrics.update({
                "trained": True,
                "epochs": int(epochs),
                "final_loss": float(hist.history["loss"][-1]),
                "final_val_loss": float(hist.history.get("val_loss", [0])[-1]),
            })
            return metrics
        except Exception as e:
            # Best effort: record the failure and fall through to the
            # placeholder so an .h5 file is still produced.
            metrics.update({"trained": False, "error": f"TF failed: {e}"})

    # Fallback: write minimal HDF5 note
    _write_placeholder_h5(h5_out, "TensorFlow unavailable; using placeholder model.")
    return metrics


# -------------------- Fake Review (heuristic) model --------------------
@dataclass
class ReviewFeatures:
    """Surface statistics of one review text, as computed by ``featurize``."""
    length: int          # number of characters in the text
    exclam: int          # number of "!" characters
    caps_ratio: float    # uppercase characters divided by text length
    unique_ratio: float  # distinct lowercased words divided by word count
    digits: int          # number of digit characters
    words: int           # number of word tokens (runs of letters, digits, apostrophes)


def featurize(text: str) -> ReviewFeatures:
    """Compute the surface statistics of ``text`` used by the fake-review model.

    A falsy ``text`` (``None`` or ``""``) is treated as the empty string. Word
    tokens are runs of ASCII letters, digits and apostrophes. Both ratios add
    1e-6 to the denominator, so they are 0.0 for empty input rather than
    raising.
    """
    import re

    eps = 1e-6
    body = text if text else ""
    tokens = re.findall(r"[A-Za-z0-9']+", body)
    distinct = {token.lower() for token in tokens}
    n_upper = sum(map(str.isupper, body))
    n_digits = sum(map(str.isdigit, body))

    return ReviewFeatures(
        length=len(body),
        exclam=body.count("!"),
        caps_ratio=float(n_upper / (len(body) + eps)),
        unique_ratio=float(len(distinct) / (len(tokens) + eps)),
        digits=int(n_digits),
        words=len(tokens),
    )


def weak_label(text: str) -> int:
    """Heuristic: 1 = suspicious, 0 = genuine.

    A text is flagged as suspicious when any of these holds: it is shorter
    than 25 characters, it has three or more exclamation marks, more than 35%
    of its characters are uppercase, or it has more than six words of which
    fewer than 40% are distinct.
    """
    feats = featurize(text)
    too_short = feats.length < 25
    shouting = feats.exclam >= 3 or feats.caps_ratio > 0.35
    repetitive = feats.words > 6 and feats.unique_ratio < 0.4
    return 1 if (too_short or shouting or repetitive) else 0


def train_fake_review_model(texts: List[str], out_pkl: Path) -> Dict:
    """Fit a fake-review classifier on heuristic labels and pickle it.

    Each text is turned into six numeric features by ``featurize`` and
    labelled by ``weak_label``. A logistic regression is fitted when both
    labels occur. When only one label occurs (including the empty-corpus case,
    which uses a single dummy "genuine" sample), logistic regression cannot be
    fitted, so a constant classifier predicting that label is stored instead.
    Either model exposes ``predict`` and ``predict_proba``.

    Args:
        texts: review texts to train on; may be empty.
        out_pkl: output path of the pickled model; its parent directory is
            created.

    Returns:
        A metrics dict with ``"trained"``, ``"n_reviews"``, ``"pos_rate"``
        (share of samples labelled suspicious), ``"features"`` (feature order
        of the model input) and ``"model"`` (``"logistic_regression"`` or
        ``"constant"``).
    """
    from sklearn.dummy import DummyClassifier
    from sklearn.linear_model import LogisticRegression

    feature_names = ["length", "exclam", "caps_ratio", "unique_ratio", "digits", "words"]

    rows, labels = [], []
    for t in texts:
        feats = featurize(t)
        rows.append([getattr(feats, name) for name in feature_names])
        labels.append(weak_label(t))
    if not rows:
        # Nothing to learn from: keep one neutral "genuine" sample so a model
        # file is still produced.
        rows = [[10, 0, 0.0, 1.0, 0, 2]]
        labels = [0]

    X = np.array(rows, dtype=float)
    y = np.array(labels, dtype=int)

    if np.unique(y).size >= 2:
        clf = LogisticRegression(max_iter=200)
        model_name = "logistic_regression"
    else:
        # LogisticRegression.fit raises ValueError on single-class targets.
        clf = DummyClassifier(strategy="most_frequent")
        model_name = "constant"
    clf.fit(X, y)

    ensure_dir(out_pkl.parent)
    with open(out_pkl, "wb") as f:
        pickle.dump(clf, f)

    return {
        "trained": True,
        "n_reviews": int(len(texts)),
        "pos_rate": float(y.mean()) if len(y) else 0.0,
        "features": feature_names,
        "model": model_name,
    }


# ------------------------------------ Main ------------------------------------
def _base_price(row: pd.Series,
                disc_col: str | None,
                retail_col: str | None,
                default: float = 100.0) -> float:
    """Return the row's discounted price, else its retail price, else ``default``.

    Missing cells and cells that cannot be parsed as a number are skipped
    instead of aborting the run.
    """
    for col in (disc_col, retail_col):
        if not col:
            continue
        raw = row.get(col)
        if pd.isna(raw):
            continue
        value = pd.to_numeric(raw, errors="coerce")  # NaN when unparseable
        if not pd.isna(value):
            return float(value)
    return default


def _pseudo_reviews(row: pd.Series,
                    desc_col: str | None,
                    title_col: str | None) -> List[str]:
    """Cut up to three 140-character pseudo reviews from the row's text.

    The description is used when present, otherwise the title, otherwise a
    generic sentence. Chunks of 10 or fewer characters (after stripping) are
    dropped.
    """
    fallback = "Good product with decent quality and value for money."
    text = ""
    for col in (desc_col, title_col):
        if not col:
            continue
        raw = row.get(col)
        # Check for NaN before str(): str(NaN) is the truthy string "nan",
        # which would make the fallback sentence unreachable.
        if not pd.isna(raw) and str(raw):
            text = str(raw)
            break
    text = text or fallback

    chunks = (text[start:start + 140].strip() for start in (0, 140, 280))
    return [chunk for chunk in chunks if len(chunk) > 10]


def main():
    """Run the pipeline: config snapshot, synthetic data, both models, metrics.

    Writes ``config.yaml``, ``price_lstm.h5``, ``fake_review_lr.pkl`` and
    ``metrics.json`` into ``ARTIFACT_DIR``, reading products from ``DATA_CSV``.
    Both RNGs are seeded with 42 so the synthetic series are reproducible.
    """
    np.random.seed(42)
    random.seed(42)

    # Paths (raw strings) -> Path objects
    csv_path = Path(DATA_CSV)
    out_dir = ensure_dir(ARTIFACT_DIR)

    # Pre-flight echo to prove paths are literal (avoids unicodeescape issues)
    print(f"[INFO] CSV path:      {csv_path}")
    print(f"[INFO] Artifact dir:  {out_dir}")

    # 1) Save config snapshot (yaml)
    yaml_out = out_dir / "config.yaml"
    cfg = {"data_csv": str(csv_path), "artifact_dir": str(out_dir), **SETTINGS}
    with open(yaml_out, "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True)

    # 2) Load CSV & map columns
    df = load_csv(csv_path)
    cols = map_columns(df)
    df = df.head(int(SETTINGS["top_n_products"])).copy()

    # 3) Build synthetic price series + pseudo review corpus
    price_series_list: List[np.ndarray] = []
    review_corpus: List[str] = []

    title_col = cols["title"]
    desc_col = cols["description"]
    retail_col = cols["retail_price"]
    disc_col = cols["discounted_price"]

    for _, row in df.iterrows():
        series = synthetic_series(
            base_price=_base_price(row, disc_col, retail_col),
            days=int(SETTINGS["seed_days"]),
            noise_frac=float(SETTINGS["price_noise_frac"]),
            deal_prob=float(SETTINGS["deal_drop_prob"]),
            deal_frac=float(SETTINGS["deal_drop_frac"]),
        )
        price_series_list.append(series)

        # pseudo reviews from description/title slices
        review_corpus.extend(_pseudo_reviews(row, desc_col, title_col))

    # 4) Train LSTM (or fallback) -> .h5
    h5_out = out_dir / "price_lstm.h5"
    lstm_metrics = train_lstm_and_save(
        price_series_list,
        seq_len=int(SETTINGS["lstm_sequence_len"]),
        epochs=int(SETTINGS["lstm_epochs"]),
        batch_size=int(SETTINGS["lstm_batch_size"]),
        h5_out=h5_out,
    )

    # 5) Train fake-review model -> .pkl
    pkl_out = out_dir / "fake_review_lr.pkl"
    fr_metrics = train_fake_review_model(review_corpus, pkl_out)

    # 6) metrics.json
    json_out = out_dir / "metrics.json"
    metrics = {
        "price_model": lstm_metrics,
        "fake_review_model": fr_metrics,
        "counts": {"n_products": int(len(price_series_list)), "n_reviews": int(len(review_corpus))},
    }
    with open(json_out, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)

    print("\n=== PriceSense ML Artifacts Ready ===")
    print(f"H5   -> {h5_out}")
    print(f"JSON -> {json_out}")
    print(f"YAML -> {yaml_out}")
    print(f"PKL  -> {pkl_out}")
    if not TF_AVAILABLE:
        print("Note: TensorFlow not detected; wrote a safe placeholder .h5. Install TF for full LSTM training.")
    elif not lstm_metrics.get("trained"):
        # TensorFlow is installed but no model was trained; surface the cause
        # on the console instead of leaving it only in metrics.json.
        cause = lstm_metrics.get("error") or lstm_metrics.get("reason") or "unknown"
        print(f"Note: LSTM was not trained ({cause}); wrote a placeholder .h5.")


# Run the pipeline only when this module is the entry point, not when imported.
if __name__ == "__main__":
    main()


[INFO] CSV path:      C:\Users\sagni\Downloads\Price Sense\archive\flipkart_com-ecommerce_sample.csv
[INFO] Artifact dir:  C:\Users\sagni\Downloads\Price Sense


  super().__init__(**kwargs)



=== PriceSense ML Artifacts Ready ===
H5   -> C:\Users\sagni\Downloads\Price Sense\price_lstm.h5
JSON -> C:\Users\sagni\Downloads\Price Sense\metrics.json
YAML -> C:\Users\sagni\Downloads\Price Sense\config.yaml
PKL  -> C:\Users\sagni\Downloads\Price Sense\fake_review_lr.pkl
