In [2]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   ----------------------- ---------------- 0.8/1.4 MB 1.9 MB/s eta 0:00:01
   ------------------------------- -------- 1.0/1.4 MB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 1.6 MB/s  0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.4


In [6]:
!pip install tables


Collecting tables
  Downloading tables-3.10.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting numexpr>=2.6.2 (from tables)
  Downloading numexpr-2.12.1-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Collecting py-cpuinfo (from tables)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting blosc2>=2.3.0 (from tables)
  Downloading blosc2-3.8.0-cp311-cp311-win_amd64.whl.metadata (7.1 kB)
Collecting ndindex (from blosc2>=2.3.0->tables)
  Downloading ndindex-1.10.0-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Collecting msgpack (from blosc2>=2.3.0->tables)
  Downloading msgpack-1.1.1-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Downloading tables-3.10.2-cp311-cp311-win_amd64.whl (6.4 MB)
   ---------------------------------------- 0.0/6.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/6.4 MB ? eta -:--:--
   --- ------------------------------------ 0.5/6.4 MB 1.5 MB/s eta 0:00:04
   ------ --------------------------------- 1.0/6.4 MB 1.9 MB/s et

In [7]:
import os
import json
import datetime
from pathlib import Path

import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import MultiLabelBinarizer
import joblib
import yaml

# ---------- USER PATHS (as requested) ----------
INPUT_CSV = r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection\archive (1)\Groceries_dataset.csv"
OUTPUT_DIR = r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection"
# ------------------------------------------------

def ensure_output_dir(path):
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p

def guess_columns(df: pd.DataFrame):
    """
    Try to guess which columns are transaction id and item description columns.
    Returns (transaction_col, item_col). If uncertain, falls back to first two object columns.
    """
    object_cols = [c for c, dt in df.dtypes.items() if pd.api.types.is_object_dtype(dt) or pd.api.types.is_string_dtype(dt)]
    lower = [c.lower() for c in object_cols]

    # common names
    tx_candidates = []
    item_candidates = []

    tx_names = ["invoice", "invoiceNo", "invoiceno", "transaction", "orderid", "order_id", "basket", "receipt", "id"]
    item_names = ["item", "itemDescription", "product", "productDescription", "item_description", "itemdesc", "item_name", "description"]

    for col in object_cols:
        cl = col.lower()
        if any(name.lower() in cl for name in tx_names):
            tx_candidates.append(col)
        if any(name.lower() in cl for name in item_names):
            item_candidates.append(col)

    # If none found, fallback to first two object columns
    tx_col = tx_candidates[0] if tx_candidates else (object_cols[0] if len(object_cols) >= 1 else None)
    item_col = item_candidates[0] if item_candidates else (object_cols[1] if len(object_cols) >= 2 else (object_cols[0] if object_cols else None))

    return tx_col, item_col

def load_dataset(csv_path: str):
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Input CSV not found: {csv_path}")
    # try reading with default, if fails try latin-1
    try:
        df = pd.read_csv(csv_path)
    except Exception:
        df = pd.read_csv(csv_path, encoding='latin-1')
    return df

def preprocess_transactions(df: pd.DataFrame, tx_col: str, item_col: str):
    """
    Build transactions list and a one-hot encoded basket DataFrame.
    Returns:
      - transactions: list of sets/items per transaction
      - basket_df: one-hot encoded DataFrame, rows=transactions, cols=items (0/1)
      - tx_index: list of transaction ids (index aligned with basket_df)
    """
    # drop rows with missing item
    df = df.copy()
    df[item_col] = df[item_col].astype(str).str.strip()
    df = df[df[item_col].notna() & (df[item_col] != '')]
    # If tx_col is missing, treat each row as a transaction
    if tx_col is None:
        df['_TX_AUTO_'] = range(len(df))
        tx_col = '_TX_AUTO_'
    # Group by transaction id -> list of unique items
    grouped = df.groupby(tx_col)[item_col].apply(lambda s: list(pd.Series(s).astype(str).str.strip().unique()))
    transactions = grouped.tolist()
    tx_index = grouped.index.tolist()

    # MultiLabelBinarizer to one-hot
    mlb = MultiLabelBinarizer(sparse_output=False)
    basket = mlb.fit_transform(transactions)
    basket_df = pd.DataFrame(basket, index=tx_index, columns=mlb.classes_)
    # Convert to 0/1 int
    basket_df = basket_df.astype(int)
    return transactions, basket_df, mlb

def run_apriori_and_rules(basket_df: pd.DataFrame, min_support=0.01, min_confidence=0.2):
    """
    Run Apriori and derive association rules.
    Returns frequent_itemsets (DataFrame) and rules (DataFrame).
    """
    # apriori expects booleans or 0/1; ensure boolean
    bool_df = basket_df.astype(bool)
    frequent_itemsets = apriori(bool_df, min_support=min_support, use_colnames=True)
    # add length of itemset
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    # generate rules
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    # sort rules
    if not rules.empty:
        rules = rules.sort_values(['lift', 'confidence', 'support'], ascending=[False, False, False])
    return frequent_itemsets, rules

def save_hdf5(df: pd.DataFrame, out_path: str, key='data'):
    # pandas HDF5
    df.to_hdf(out_path, key=key, mode='w', format='table')
    print(f"Saved HDF5: {out_path}")

def save_pickle(obj, out_path: str):
    joblib.dump(obj, out_path)
    print(f"Saved PKL: {out_path}")

def save_yaml(data: dict, out_path: str):
    with open(out_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(data, f, sort_keys=False)
    print(f"Saved YAML: {out_path}")

def save_json(data, out_path: str):
    # data should be JSON serializable
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, default=str)
    print(f"Saved JSON: {out_path}")

def rules_to_serializable(rules_df: pd.DataFrame):
    """Convert rules DataFrame into a JSON-serializable list of dicts (top fields)."""
    if rules_df.empty:
        return []
    serial = []
    for _, row in rules_df.iterrows():
        serial.append({
            "antecedents": sorted(list(row['antecedents'])) if hasattr(row['antecedents'], '__iter__') else [row['antecedents']],
            "consequents": sorted(list(row['consequents'])) if hasattr(row['consequents'], '__iter__') else [row['consequents']],
            "support": float(row.get('support', np.nan)),
            "confidence": float(row.get('confidence', np.nan)),
            "lift": float(row.get('lift', np.nan)),
            "leverage": float(row.get('leverage', np.nan)) if 'leverage' in row else None,
            "conviction": float(row.get('conviction', np.nan)) if 'conviction' in row else None
        })
    return serial

def build_recommender(rules_df: pd.DataFrame):
    """
    Build a simple item-based recommender function using rules DataFrame.
    The returned object is a dict with a function 'recommend' (callable) and stored rules.
    recommend(items, top_n=5, metric='lift') -> list of recommended items
    """
    # Simplified rules aggregation: for each antecedent set, map to consequents with scores
    rules_list = rules_to_serializable(rules_df)

    def recommend(given_items, top_n=5, metric='lift'):
        if isinstance(given_items, str):
            given = set([given_items])
        else:
            given = set(given_items)
        scores = {}
        for r in rules_list:
            ant = set(r['antecedents'])
            if ant.issubset(given):
                for c in r['consequents']:
                    # score by chosen metric (lift or confidence)
                    scores[c] = max(scores.get(c, 0), r.get(metric, 0))
        # remove items already in given
        for g in list(given):
            scores.pop(g, None)
        # return top_n sorted by score
        sorted_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, sc in sorted_items[:top_n]]

    recommender_obj = {
        "rules": rules_list,
        "recommend": recommend  # note: function not JSON serializable but will be pickled
    }
    return recommender_obj

def main():
    out_dir = ensure_output_dir(OUTPUT_DIR)
    print("Loading dataset...")
    df = load_dataset(INPUT_CSV)
    print(f"Dataset loaded. Shape: {df.shape}. Columns: {list(df.columns)}")

    tx_col, item_col = guess_columns(df)
    print(f"Guessed transaction column: {tx_col}, item column: {item_col}")

    if tx_col is None or item_col is None:
        raise ValueError("Could not determine transaction and/or item columns. Please inspect dataset and adjust script.")

    print("Preprocessing transactions and building basket matrix...")
    transactions, basket_df, mlb = preprocess_transactions(df, tx_col, item_col)
    print(f"Built basket matrix. Transactions: {len(transactions)}, Unique items: {basket_df.shape[1]}")

    # Save HDF5 of the processed basket (it can be large; consider compression if needed)
    h5_path = out_dir / "processed_data.h5"
    save_hdf5(basket_df, str(h5_path), key='basket')

    # Run Apriori
    print("Running Apriori frequent itemset mining and association rules...")
    frequent_itemsets, rules = run_apriori_and_rules(basket_df, min_support=0.01, min_confidence=0.2)
    print(f"Found {len(frequent_itemsets)} frequent itemsets and {len(rules)} rules.")

    # Build recommender object
    recommender = build_recommender(rules)

    # Save PKL: include useful objects
    artifacts = {
        "mlb": mlb,
        "basket_df_index": list(basket_df.index),
        "basket_columns": list(basket_df.columns),
        "frequent_itemsets": frequent_itemsets,
        "rules": rules,
        "recommender": recommender,
        "raw_dataframe_head": df.head(100)  # small snapshot
    }
    pkl_path = out_dir / "artifacts.pkl"
    save_pickle(artifacts, str(pkl_path))

    # Save YAML metadata/config
    metadata = {
        "source_csv": INPUT_CSV,
        "generated_on": datetime.datetime.now().isoformat(),
        "n_rows_raw": int(df.shape[0]),
        "n_transactions": int(basket_df.shape[0]),
        "n_unique_items": int(basket_df.shape[1]),
        "tx_column": tx_col,
        "item_column": item_col,
        "apriori_min_support": 0.01,
        "apriori_min_confidence": 0.2,
        "files": {
            "hdf5": str(h5_path),
            "pkl": str(pkl_path),
            "rules_json": str(out_dir / "association_rules.json"),
            "metadata_yaml": str(out_dir / "metadata.yaml")
        }
    }
    yaml_path = out_dir / "metadata.yaml"
    save_yaml(metadata, str(yaml_path))

    # Save association rules to JSON
    rules_json = rules_to_serializable(rules)
    json_path = out_dir / "association_rules.json"
    save_json(rules_json, str(json_path))

    print("All artifacts generated successfully.")
    print(f"Files written to: {out_dir}")

    # print a tiny sample of top rules
    if len(rules_json) > 0:
        print("\nTop 10 rules (by lift):")
        for r in rules_json[:10]:
            print(f"{r['antecedents']} -> {r['consequents']} (support={r['support']:.4f}, conf={r['confidence']:.4f}, lift={r['lift']:.4f})")
    else:
        print("No rules generated with the chosen support/confidence thresholds. Consider lowering min_support/min_confidence in script.")

if __name__ == "__main__":
    main()


Loading dataset...
Dataset loaded. Shape: (38765, 3). Columns: ['Member_number', 'Date', 'itemDescription']
Guessed transaction column: Date, item column: itemDescription
Preprocessing transactions and building basket matrix...
Built basket matrix. Transactions: 728, Unique items: 167
Saved HDF5: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\processed_data.h5
Running Apriori frequent itemset mining and association rules...


MemoryError: Unable to allocate 126. GiB for an array with shape (37169919, 5, 728) and data type bool