In [1]:
import os
import json
import random
from pathlib import Path
from typing import List, Tuple, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Optional: h5py to load processed_data.h5
try:
    import h5py
    H5PY_AVAILABLE = True
except Exception:
    H5PY_AVAILABLE = False

# ---------- USER PATHS (edit if different) ----------
OUTPUT_DIR = Path(r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection")
ARTIFACTS_PKL = OUTPUT_DIR / "artifacts.pkl"
PROCESSED_H5 = OUTPUT_DIR / "processed_data.h5"   # may exist
PROCESSED_NPZ = OUTPUT_DIR / "processed_data.npz" # fallback
ORIGINAL_CSV = Path(r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection\archive (1)\Groceries_dataset.csv")
# ----------------------------------------------------

# Evaluation config
TOP_N_ITEMS_FOR_HEATMAP = 30     # heatmap will use top-N most frequent items
EVAL_SAMPLE_LIMIT = 3000         # limit number of transactions to evaluate (for speed); set None to use all
K_VALUES = [1, 3, 5, 10]         # compute hit-rate@k for these k values
RANDOM_SEED = 42

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# ------------------ Utilities: load artifacts & processed data ------------------

def load_artifacts(pkl_path: Path) -> Dict[str, Any]:
    if not pkl_path.exists():
        raise FileNotFoundError(f"Artifacts file not found: {pkl_path}")
    artifacts = joblib.load(str(pkl_path))
    return artifacts

def load_processed_array(h5_path: Path = None, npz_path: Path = None) -> Tuple[np.ndarray, List[str], List[str]]:
    """
    Returns (data_np, index_list, columns_list)
    data_np shape: (n_transactions, n_items) dtype uint8 (0/1)
    index_list: list of transaction ids (strings)
    columns_list: list of item names (strings)
    """
    if h5_path and h5_path.exists() and H5PY_AVAILABLE:
        with h5py.File(str(h5_path), 'r') as hf:
            data = np.array(hf['data']).astype(np.uint8)
            try:
                index = [x.decode('utf-8') if isinstance(x, (bytes, np.bytes_)) else str(x) for x in hf['index'][()]]
            except Exception:
                index = [str(x) for x in hf['index'][()]]
            try:
                columns = [x.decode('utf-8') if isinstance(x, (bytes, np.bytes_)) else str(x) for x in hf['columns'][()]]
            except Exception:
                columns = [str(x) for x in hf['columns'][()]]
        return data, index, columns

    if npz_path and npz_path.exists():
        npz = np.load(str(npz_path), allow_pickle=True)
        data = npz['data'].astype(np.uint8)
        index = [str(x) for x in npz['index']]
        columns = [str(x) for x in npz['columns']]
        return data, index, columns

    raise FileNotFoundError("No processed_data.h5 or processed_data.npz found.")


def rebuild_basket_from_csv(csv_path: Path, tx_col_guess: str = None, item_col_guess: str = None):
    """
    Fallback: rebuild basket_df from original CSV. Uses same heuristics as earlier scripts.
    Returns (data_np, index_list, columns_list).
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")
    df = pd.read_csv(str(csv_path), encoding='latin-1', low_memory=False)
    # Heuristic: find columns
    object_cols = [c for c in df.columns if df[c].dtype == object]
    tx_col = tx_col_guess or next((c for c in object_cols if c.lower().find('date') != -1 or c.lower().find('invoice') != -1 or c.lower().find('order') != -1), None)
    item_col = item_col_guess or next((c for c in object_cols if 'item' in c.lower() or 'description' in c.lower() or 'product' in c.lower()), None)
    if tx_col is None or item_col is None:
        # fallback to columns positions
        tx_col = tx_col or df.columns[0]
        item_col = item_col or df.columns[-1]
    # If Date & Member_number present, group by (Member_number, Date)
    if tx_col.lower() == 'date' and 'Member_number' in df.columns:
        grouped = df.groupby(['Member_number', 'Date'])[item_col].apply(lambda s: list(pd.Series(s).astype(str).str.strip().unique()))
    else:
        grouped = df.groupby(tx_col)[item_col].apply(lambda s: list(pd.Series(s).astype(str).str.strip().unique()))
    transactions = grouped.tolist()
    tx_index = grouped.index.tolist()
    # build one-hot
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer(sparse_output=False)
    basket = mlb.fit_transform(transactions)
    data = basket.astype(np.uint8)
    columns = list(mlb.classes_)
    index = [str(x) for x in tx_index]
    return data, index, columns


# ------------------ Recommender factory (same as in artifact script) ------------------

def make_recommender_from_rules_list(rules_list):
    """
    rules_list: list of dicts with keys 'antecedents','consequents','support','confidence','lift',...
    Returns recommend(given_items, top_n=5, metric='lift')
    """
    rules_internal = []
    for r in rules_list:
        rules_internal.append({
            'antecedents': set(r.get('antecedents', [])),
            'consequents': list(r.get('consequents', [])),
            'support': r.get('support', 0.0),
            'confidence': r.get('confidence', 0.0),
            'lift': r.get('lift', 0.0)
        })

    def recommend(given_items, top_n=5, metric='lift'):
        if isinstance(given_items, str):
            given = set([given_items])
        else:
            given = set(given_items)
        scores = {}
        for r in rules_internal:
            if r['antecedents'].issubset(given):
                for c in r['consequents']:
                    scores[c] = max(scores.get(c, 0.0), r.get(metric, 0.0))
        for g in list(given):
            scores.pop(g, None)
        sorted_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, sc in sorted_items[:top_n]]

    return recommend


# ------------------ Evaluation: leave-one-out hit-rate@k ------------------

def prepare_transactions_list(data_np: np.ndarray, index: List[str], columns: List[str]) -> List[List[str]]:
    """
    Convert dense matrix to list of items per transaction.
    data_np: shape (n_tx, n_items)
    """
    tx_list = []
    for i in range(data_np.shape[0]):
        present_idx = np.where(data_np[i] > 0)[0]
        items = [columns[j] for j in present_idx]
        tx_list.append(items)
    return tx_list


def evaluate_hit_rate(rules_serial: List[Dict[str, Any]],
                      transactions_list: List[List[str]],
                      ks: List[int] = [1, 3, 5, 10],
                      sample_limit: int = None) -> Dict[int, float]:
    """
    Leave-one-out: for each transaction with >=2 items, hide one item and ask recommender to predict it.
    Computes hit-rate@k = fraction of cases where hidden item is in top-k recommendations.
    """
    # Build recommender
    recommender = make_recommender_from_rules_list(rules_serial)

    # prepare candidate transactions: those with length >= 2
    valid_tx_idx = [i for i, t in enumerate(transactions_list) if len(t) >= 2]
    if sample_limit is not None and len(valid_tx_idx) > sample_limit:
        valid_tx_idx = list(np.random.choice(valid_tx_idx, size=sample_limit, replace=False))

    hits = {k: 0 for k in ks}
    total = 0

    for idx in valid_tx_idx:
        t = transactions_list[idx]
        # random holdout one item
        hold = random.choice(t)
        given = [x for x in t if x != hold]
        if len(given) == 0:
            continue
        total += 1
        # get recommendations
        for k in ks:
            recs = recommender(given, top_n=k, metric='lift')
            if hold in recs:
                hits[k] += 1

    if total == 0:
        return {k: 0.0 for k in ks}

    hit_rates = {k: hits[k] / total for k in ks}
    return hit_rates


# ------------------ Heatmap: co-occurrence matrix ------------------

def compute_cooccurrence_matrix(data_np: np.ndarray, columns: List[str], top_n: int = 30, normalize: bool = False):
    """
    data_np: (n_tx, n_items) 0/1
    Returns (df_cooc, top_columns)
    df_cooc: pandas DataFrame top_n x top_n with co-occurrence counts (or normalized support)
    """
    # item frequencies
    item_counts = data_np.sum(axis=0)
    top_idx = np.argsort(-item_counts)[:top_n]
    top_cols = [columns[i] for i in top_idx]
    sub = data_np[:, top_idx].astype(np.uint8)

    # co-occurrence counts: item_i with item_j across transactions
    # compute via matrix multiplication
    cooc = np.dot(sub.T, sub)  # shape (top_n, top_n)
    df_cooc = pd.DataFrame(cooc, index=top_cols, columns=top_cols)

    if normalize:
        n_tx = data_np.shape[0]
        df_cooc = df_cooc / n_tx  # convert to support (0..1)
    return df_cooc, top_cols


# ------------------ Plotting ------------------

def plot_accuracy(hit_rates: Dict[int, float], out_path: Path):
    ks = sorted(hit_rates.keys())
    vals = [hit_rates[k] for k in ks]
    plt.figure(figsize=(6, 4))
    plt.plot(ks, vals, marker='o')
    plt.xticks(ks)
    plt.xlabel("k (top-k recommendations)")
    plt.ylabel("Hit-rate (Recall@k)")
    plt.title("Recommender Hit-rate vs k (leave-one-out)")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.savefig(str(out_path), dpi=150)
    plt.close()
    print(f"Saved accuracy plot: {out_path}")


def plot_heatmap(df_cooc: pd.DataFrame, out_path: Path, annot: bool = False, cmap: str = "YlGnBu"):
    plt.figure(figsize=(12, 10))
    # use logarithmic scaling when values vary a lot (optional)
    sns.heatmap(df_cooc, cmap=cmap, annot=annot, fmt=".2f" if df_cooc.values.dtype.kind == 'f' else "d")
    plt.title("Item Co-occurrence Heatmap (top items)")
    plt.tight_layout()
    plt.savefig(str(out_path), dpi=150)
    plt.close()
    print(f"Saved heatmap: {out_path}")


# ------------------ Main runner ------------------

def main():
    # 1) load artifacts (rules), and processed matrix (or rebuild)
    try:
        artifacts = load_artifacts(ARTIFACTS_PKL)
    except Exception as e:
        print(f"Warning: could not load artifacts.pkl: {e}")
        artifacts = {}

    # load rules_serial
    rules_serial = artifacts.get('rules_serial', None)

    # load processed data
    data_np = None
    columns = None
    index = None
    loaded_from = None

    try:
        data_np, index, columns = load_processed_array(PROCESSED_H5 if PROCESSED_H5.exists() else None,
                                                       PROCESSED_NPZ if PROCESSED_NPZ.exists() else None)
        loaded_from = "processed_data"
    except Exception as e:
        print(f"Could not load processed_data file: {e}. Will attempt to rebuild from CSV.")
        try:
            data_np, index, columns = rebuild_basket_from_csv(ORIGINAL_CSV)
            loaded_from = "rebuilt_from_csv"
        except Exception as e2:
            raise RuntimeError(f"Failed to load or rebuild processed data: {e2}")

    print(f"Loaded processed data ({loaded_from}). Transactions: {data_np.shape[0]}, Items: {data_np.shape[1]}")

    # If rules not present, try loading rules_json file
    if rules_serial is None:
        rules_json_path = OUTPUT_DIR / "association_rules.json"
        if rules_json_path.exists():
            with open(str(rules_json_path), 'r', encoding='utf-8') as fh:
                rules_serial = json.load(fh)
            print("Loaded rules from association_rules.json")
        else:
            print("No rules found in artifacts or association_rules.json — evaluation will produce zero metrics.")
            rules_serial = []

    # Build transactions list
    transactions_list = prepare_transactions_list(data_np, index, columns)

    # 2) Evaluate hit-rate@k
    hit_rates = evaluate_hit_rate(rules_serial, transactions_list, ks=K_VALUES, sample_limit=EVAL_SAMPLE_LIMIT)
    print("Hit-rates:", hit_rates)

    # Plot accuracy graph
    acc_out = OUTPUT_DIR / "accuracy_vs_k.png"
    plot_accuracy(hit_rates, acc_out)

    # 3) Compute co-occurrence heatmap (top N)
    df_cooc, top_cols = compute_cooccurrence_matrix(data_np, columns, top_n=TOP_N_ITEMS_FOR_HEATMAP, normalize=False)
    heatmap_out = OUTPUT_DIR / "cooccurrence_heatmap_topN.png"
    plot_heatmap(df_cooc, heatmap_out, annot=False)

    # Save a small CSV summary
    summary = {
        "hit_rates": hit_rates,
        "n_transactions": int(data_np.shape[0]),
        "n_items": int(data_np.shape[1]),
        "top_items_for_heatmap": top_cols
    }
    with open(str(OUTPUT_DIR / "evaluation_summary.json"), 'w', encoding='utf-8') as fh:
        json.dump(summary, fh, indent=2)
    print("Saved evaluation_summary.json")

if __name__ == "__main__":
    main()


Loaded processed data (processed_data). Transactions: 14963, Items: 167
Hit-rates: {1: 0.0, 3: 0.0, 5: 0.0, 10: 0.0}
Saved accuracy plot: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\accuracy_vs_k.png
Saved heatmap: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\cooccurrence_heatmap_topN.png
Saved evaluation_summary.json
