In [3]:
"""
generate_artifacts_with_serial_rules.py

Memory-aware market-basket artifact generator.
Saves serializable rules (rules_serial) and provides a top-level factory to recreate recommend().

Paths are hard-coded per user's request; adjust if needed.
"""

import os
import json
import datetime
import warnings
from pathlib import Path
from typing import Tuple, List, Dict, Any

import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, association_rules
from sklearn.preprocessing import MultiLabelBinarizer
import joblib
import yaml

# ---------- USER PATHS ----------
INPUT_CSV = r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection\archive (1)\Groceries_dataset.csv"
OUTPUT_DIR = r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection"
# -------------------------------

# Mining config (tweak if needed)
INITIAL_MIN_SUPPORT = 0.01
MIN_CONFIDENCE = 0.20
MIN_SUPPORT_STEP = 0.01
MAX_RETRIES = 4
MAX_ITEMSET_LEN = None  # set to 3 to limit itemset sizes if memory/time is an issue

# ------------------ Utility & Processing ------------------

def ensure_output_dir(path: str) -> Path:
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p

def load_dataset(csv_path: str) -> pd.DataFrame:
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Input CSV not found: {csv_path}")
    try:
        df = pd.read_csv(csv_path)
    except Exception:
        df = pd.read_csv(csv_path, encoding='latin-1')
    return df

def guess_columns(df: pd.DataFrame) -> Tuple[str, str]:
    object_cols = [c for c, dt in df.dtypes.items() if pd.api.types.is_object_dtype(dt) or pd.api.types.is_string_dtype(dt)]
    tx_candidates, item_candidates = [], []
    tx_names = ["invoice", "invoiceno", "transaction", "orderid", "order_id", "date", "receipt", "id"]
    item_names = ["item", "itemdescription", "product", "description", "item_name"]
    for col in object_cols:
        cl = col.lower()
        if any(name in cl for name in tx_names):
            tx_candidates.append(col)
        if any(name in cl for name in item_names):
            item_candidates.append(col)
    tx_col = tx_candidates[0] if tx_candidates else (object_cols[0] if object_cols else None)
    item_col = item_candidates[0] if item_candidates else (object_cols[1] if len(object_cols) >= 2 else (object_cols[0] if object_cols else None))
    return tx_col, item_col

def preprocess_transactions(df: pd.DataFrame, tx_col: str, item_col: str):
    df = df.copy()
    df[item_col] = df[item_col].astype(str).str.strip()
    df = df[df[item_col].notna() & (df[item_col] != '')]

    if tx_col is None:
        df['_TX_AUTO_'] = range(len(df))
        tx_col = '_TX_AUTO_'

    # If Date was chosen and Member_number present, group by member+date (session-level)
    if tx_col.lower() == 'date' and 'Member_number' in df.columns:
        grouped = df.groupby(['Member_number', 'Date'])[item_col].apply(lambda s: list(pd.Series(s).astype(str).str.strip().unique()))
    else:
        grouped = df.groupby(tx_col)[item_col].apply(lambda s: list(pd.Series(s).astype(str).str.strip().unique()))

    transactions = grouped.tolist()
    tx_index = grouped.index.tolist()

    # Try sparse MultiLabelBinarizer if supported
    try:
        mlb = MultiLabelBinarizer(sparse_output=True)
        basket_sparse = mlb.fit_transform(transactions)
        basket_df = pd.DataFrame.sparse.from_spmatrix(basket_sparse, index=tx_index, columns=mlb.classes_)
    except TypeError:
        mlb = MultiLabelBinarizer(sparse_output=False)
        basket = mlb.fit_transform(transactions)
        basket_df = pd.DataFrame(basket, index=tx_index, columns=mlb.classes_)

    # Make sure dtype is numeric 0/1 (use int8 to save memory)
    try:
        first_dtype = basket_df.dtypes[0]
        if pd.api.types.is_sparse(first_dtype):
            basket_df = basket_df.astype('int8')
        else:
            basket_df = basket_df.astype('int8')
    except Exception:
        basket_df = basket_df.astype('int8')

    return transactions, basket_df, mlb

def prune_items_by_support(basket_df: pd.DataFrame, min_support: float):
    n_tx = basket_df.shape[0]
    item_counts = basket_df.sum(axis=0)
    item_supports = item_counts / float(n_tx)
    keep_items = item_supports[item_supports >= min_support].index.tolist()
    pruned_df = basket_df.loc[:, keep_items].copy()
    return pruned_df, item_supports

def run_mining_with_fallback(basket_df: pd.DataFrame, min_support=0.01, min_confidence=0.2, max_len=None):
    cur_support = min_support
    retries = 0
    while True:
        print(f"[mining] Attempt with min_support={cur_support:.4f}")
        try:
            pruned_df, item_supports = prune_items_by_support(basket_df, cur_support)
            print(f"[prune] Transactions: {pruned_df.shape[0]}, Items after prune: {pruned_df.shape[1]}")
            if pruned_df.shape[1] == 0:
                raise ValueError("No items left after pruning. Raise dataset quality or lower min_support.")

            bool_df = pruned_df.astype(bool)
            frequent_itemsets = fpgrowth(bool_df, min_support=cur_support, use_colnames=True, max_len=max_len)

            if frequent_itemsets.empty:
                rules = pd.DataFrame()
            else:
                frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
                rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
                if not rules.empty:
                    rules = rules.sort_values(['lift', 'confidence', 'support'], ascending=[False, False, False])

            return frequent_itemsets, rules, cur_support

        except MemoryError as me:
            retries += 1
            warnings.warn(f"MemoryError during mining attempt {retries}: {me}. Increasing min_support and retrying.")
            cur_support += MIN_SUPPORT_STEP
            if retries > MAX_RETRIES:
                raise MemoryError(f"Mining failed after {retries} retries.") from me

# ------------------ Saving processed data (robust) ------------------

def save_processed_data_array(basket_df: pd.DataFrame, out_base: Path):
    """
    Save processed basket as HDF5 via h5py if present; otherwise as compressed NPZ.
    Stores:
      - data: 2D uint8 array (transactions x items)
      - index: array of strings
      - columns: array of strings
    """
    try:
        data_np = basket_df.to_numpy(dtype=np.uint8)
    except Exception:
        data_np = np.array(basket_df.values, dtype=np.uint8)

    index_arr = np.array([str(x) for x in basket_df.index], dtype=object)
    cols_arr = np.array([str(x) for x in basket_df.columns], dtype=object)

    # Try h5py first
    try:
        import h5py
        h5_path = out_base.with_suffix('.h5')
        with h5py.File(str(h5_path), 'w') as hf:
            hf.create_dataset('data', data=data_np, compression='gzip', compression_opts=4)
            dt = h5py.string_dtype(encoding='utf-8')
            hf.create_dataset('index', data=index_arr.astype('S'), dtype=dt)
            hf.create_dataset('columns', data=cols_arr.astype('S'), dtype=dt)
            hf.attrs['saved_on'] = datetime.datetime.now().isoformat()
        print(f"Saved HDF5 via h5py: {h5_path}")
        return h5_path
    except Exception:
        npz_path = out_base.with_suffix('.npz')
        np.savez_compressed(str(npz_path), data=data_np, index=index_arr, columns=cols_arr)
        print(f"h5py not available or writing failed — saved compressed NPZ: {npz_path}")
        return npz_path

def save_pickle(obj, out_path: str):
    joblib.dump(obj, out_path)
    print(f"Saved PKL: {out_path}")

def save_yaml(data: dict, out_path: str):
    with open(out_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(data, f, sort_keys=False)
    print(f"Saved YAML: {out_path}")

def save_json(data, out_path: str):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, default=str)
    print(f"Saved JSON: {out_path}")

# ------------------ Rules serialization & recommender factory ------------------

def rules_to_serializable(rules_df: pd.DataFrame) -> List[Dict[str, Any]]:
    if rules_df is None or rules_df.empty:
        return []
    serial = []
    for _, row in rules_df.iterrows():
        serial.append({
            "antecedents": sorted(list(row['antecedents'])) if hasattr(row['antecedents'], '__iter__') else [row['antecedents']],
            "consequents": sorted(list(row['consequents'])) if hasattr(row['consequents'], '__iter__') else [row['consequents']],
            "support": float(row.get('support', np.nan)),
            "confidence": float(row.get('confidence', np.nan)),
            "lift": float(row.get('lift', np.nan)),
            "leverage": float(row.get('leverage', np.nan)) if 'leverage' in row else None,
            "conviction": float(row.get('conviction', np.nan)) if 'conviction' in row else None
        })
    return serial

def make_recommender_from_rules_list(rules_list: List[Dict[str, Any]]):
    """
    Top-level factory to create a recommend() function from serializable rules.
    Returns (recommend_function, internal_rules)
    """
    # Convert antecedents to sets for fast subset checks
    rules_internal = []
    for r in rules_list:
        rules_internal.append({
            'antecedents': set(r.get('antecedents', [])),
            'consequents': list(r.get('consequents', [])),
            'support': r.get('support', 0.0),
            'confidence': r.get('confidence', 0.0),
            'lift': r.get('lift', 0.0)
        })

    def recommend(given_items, top_n=5, metric='lift'):
        if isinstance(given_items, str):
            given = set([given_items])
        else:
            given = set(given_items)
        scores = {}
        for r in rules_internal:
            if r['antecedents'].issubset(given):
                for c in r['consequents']:
                    scores[c] = max(scores.get(c, 0.0), r.get(metric, 0.0))
        for g in list(given):
            scores.pop(g, None)
        sorted_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, sc in sorted_items[:top_n]]

    return recommend, rules_internal

# ------------------ Main flow ------------------

def main():
    out_dir = ensure_output_dir(OUTPUT_DIR)
    print("Loading dataset...")
    df = load_dataset(INPUT_CSV)
    print(f"Dataset loaded. Shape: {df.shape}. Columns: {list(df.columns)}")

    tx_col, item_col = guess_columns(df)
    print(f"Guessed transaction column: {tx_col}, item column: {item_col}")

    if tx_col is None or item_col is None:
        raise ValueError("Could not determine transaction and/or item columns. Please inspect dataset and adjust script.")

    print("Preprocessing transactions and building basket matrix...")
    transactions, basket_df, mlb = preprocess_transactions(df, tx_col, item_col)
    print(f"Built basket matrix. Transactions: {len(transactions)}, Unique items: {basket_df.shape[1]}")

    # Save processed data (h5 via h5py or npz fallback)
    processed_base = out_dir / "processed_data"
    processed_path = save_processed_data_array(basket_df, processed_base)

    # Run mining with fallback
    frequent_itemsets, rules_df, used_support = run_mining_with_fallback(
        basket_df,
        min_support=INITIAL_MIN_SUPPORT,
        min_confidence=MIN_CONFIDENCE,
        max_len=MAX_ITEMSET_LEN
    )
    print(f"Mining finished (used_min_support={used_support:.4f}). Found {len(frequent_itemsets)} frequent_itemsets and {len(rules_df)} rules.")

    # Convert rules to serializable form (store this in artifacts)
    rules_serial = rules_to_serializable(rules_df)

    # Save artifacts (do NOT include nested functions)
    artifacts = {
        "mlb": mlb,
        "basket_df_index": list(map(str, basket_df.index)),
        "basket_columns": list(map(str, basket_df.columns)),
        "frequent_itemsets": frequent_itemsets,  # DataFrame picklable
        "rules_df": rules_df,                   # DataFrame picklable
        "rules_serial": rules_serial,           # JSON-serializable rules
        "raw_dataframe_head": df.head(200)
    }
    pkl_path = out_dir / "artifacts.pkl"
    save_pickle(artifacts, str(pkl_path))

    # Save metadata YAML
    metadata = {
        "source_csv": INPUT_CSV,
        "generated_on": datetime.datetime.now().isoformat(),
        "n_rows_raw": int(df.shape[0]),
        "n_transactions": int(basket_df.shape[0]),
        "n_unique_items": int(basket_df.shape[1]),
        "tx_column": tx_col,
        "item_column": item_col,
        "min_support_requested": INITIAL_MIN_SUPPORT,
        "min_support_used": float(used_support),
        "min_confidence": MIN_CONFIDENCE,
        "files": {
            "processed": str(processed_path),
            "pkl": str(pkl_path),
            "rules_json": str(out_dir / "association_rules.json"),
            "metadata_yaml": str(out_dir / "metadata.yaml")
        }
    }
    yaml_path = out_dir / "metadata.yaml"
    save_yaml(metadata, str(yaml_path))

    # Save association rules JSON (human-friendly)
    json_path = out_dir / "association_rules.json"
    save_json(rules_serial, str(json_path))

    print("All artifacts generated successfully.")
    print(f"Files written to: {out_dir}")

    if len(rules_serial) > 0:
        print("\nTop 10 rules (by lift):")
        for r in rules_serial[:10]:
            print(f"{r['antecedents']} -> {r['consequents']} (support={r['support']:.4f}, conf={r['confidence']:.4f}, lift={r['lift']:.4f})")
    else:
        print("No rules generated with the chosen thresholds.")

if __name__ == "__main__":
    main()

# ------------------ Example: how to load and use the recommender ------------------
# After running the above script, in a separate Python session you can do:
#
# import joblib
# from generate_artifacts_with_serial_rules import make_recommender_from_rules_list
#
# artifacts = joblib.load(r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection\artifacts.pkl")
# rules_serial = artifacts.get('rules_serial', [])
# recommend, internal_rules = make_recommender_from_rules_list(rules_serial)
#
# # Example usage:
# print(recommend(["whole milk"], top_n=5, metric='lift'))
# print(recommend(["sausage", "brown bread"], top_n=5))
#
# You can also log queries by writing to a CSV from the session that calls recommend().


Loading dataset...
Dataset loaded. Shape: (38765, 3). Columns: ['Member_number', 'Date', 'itemDescription']
Guessed transaction column: Date, item column: itemDescription
Preprocessing transactions and building basket matrix...


  first_dtype = basket_df.dtypes[0]
  if pd.api.types.is_sparse(first_dtype):


Built basket matrix. Transactions: 14963, Unique items: 167
Saved HDF5 via h5py: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\processed_data.h5
[mining] Attempt with min_support=0.0100
[prune] Transactions: 14963, Items after prune: 64
Mining finished (used_min_support=0.0100). Found 69 frequent_itemsets and 0 rules.
Saved PKL: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\artifacts.pkl
Saved YAML: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\metadata.yaml
Saved JSON: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\association_rules.json
All artifacts generated successfully.
Files written to: C:\Users\NXTWAVE\Downloads\Groceries Price Detection
No rules generated with the chosen thresholds.
