# In [None]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from sklearn.preprocessing import MultiLabelBinarizer
import datetime
from pathlib import Path
import joblib

# -------------------------------
# Preprocess transactions
# -------------------------------
def preprocess_transactions(df: pd.DataFrame, tx_col: str, item_col: str):
    """Group item rows into per-transaction baskets and one-hot encode them.

    The input frame is left untouched: the string cast of the item column is
    done on a separate Series instead of being written back into ``df``.

    Args:
        df: Frame with one row per (transaction, item) pair.
        tx_col: Column label identifying the transaction. A list of column
            labels is also accepted and is used as a composite key.
        item_col: Column label holding the item; values are cast to ``str``.

    Returns:
        A tuple ``(transactions, basket_df, mlb)`` where ``transactions`` is a
        list of item lists (one per group, in ``groupby`` key order),
        ``basket_df`` is the 0/1 indicator frame with one column per distinct
        item and a default integer index, and ``mlb`` is the fitted
        ``MultiLabelBinarizer``.
    """
    # Cast on a new Series so the caller's DataFrame is not modified.
    items = df[item_col].astype(str)

    # Grouping the Series by the key column(s) yields the same groups as
    # ``df.groupby(tx_col)[item_col]`` without needing a mutated copy of df.
    key_labels = list(tx_col) if pd.api.types.is_list_like(tx_col) else [tx_col]
    group_keys = [df[label] for label in key_labels]
    transactions = items.groupby(group_keys).apply(list).tolist()

    mlb = MultiLabelBinarizer()
    basket = mlb.fit_transform(transactions)
    basket_df = pd.DataFrame(basket, columns=mlb.classes_)
    return transactions, basket_df, mlb

# -------------------------------
# Run FP-Growth & generate rules
# -------------------------------
def run_fp_growth_and_rules(basket_df: pd.DataFrame, min_support=0.005, min_confidence=0.1):
    """Mine frequent itemsets with FP-Growth and derive association rules.

    Args:
        basket_df: Indicator frame (one row per transaction, one column per
            item); it is cast to ``bool`` before mining.
        min_support: Minimum support passed to ``fpgrowth``.
        min_confidence: Minimum confidence passed to ``association_rules``.

    Returns:
        A tuple ``(frequent_itemsets, rules)``. ``frequent_itemsets`` carries
        an extra ``'length'`` column with the size of each itemset.
    """
    itemsets = fpgrowth(
        basket_df.astype(bool),
        min_support=min_support,
        use_colnames=True,
    )
    # Record how many items each frequent itemset contains.
    itemset_sizes = itemsets['itemsets'].apply(len)
    itemsets['length'] = itemset_sizes

    derived_rules = association_rules(
        itemsets,
        metric="confidence",
        min_threshold=min_confidence,
    )
    return itemsets, derived_rules

# -------------------------------
# Convert rules to serializable format
# -------------------------------
def rules_to_serializable(rules_df: pd.DataFrame):
    """Turn a rules frame into a list of plain dictionaries.

    Args:
        rules_df: Frame with ``antecedents``, ``consequents``, ``support``,
            ``confidence`` and ``lift`` columns.

    Returns:
        One dict per row, in row order. The antecedent and consequent
        collections are converted to lists; the three metrics are copied
        as they appear in the row.
    """
    def _as_record(rule):
        # Keep only the fields the recommender needs from a single rule row.
        return {
            'antecedents': list(rule['antecedents']),
            'consequents': list(rule['consequents']),
            'support': rule['support'],
            'confidence': rule['confidence'],
            'lift': rule['lift'],
        }

    return [_as_record(rule) for _, rule in rules_df.iterrows()]

# -------------------------------
# Build recommender (Option A)
# -------------------------------
def build_recommender(rules_df: pd.DataFrame, basket_df: pd.DataFrame):
    """Bundle the mined rules and item popularity into one plain dictionary.

    Args:
        rules_df: Association rules frame; converted with
            ``rules_to_serializable``.
        basket_df: Indicator frame (one column per item); its column sums are
            used as item frequencies.

    Returns:
        A dict with three keys:
        ``"rules"``     - list of rule dicts,
        ``"item_freq"`` - item counts sorted from most to least frequent,
        ``"columns"``   - item labels in the same order as ``"item_freq"``,
        so position ``i`` of both lists refers to the same item.
    """
    rules_list = rules_to_serializable(rules_df)

    # A stable sort keeps the order of equally frequent items deterministic.
    item_freq = basket_df.sum().sort_values(ascending=False, kind="mergesort")

    return {
        "rules": rules_list,
        "item_freq": item_freq.tolist(),
        # Labels must come from the sorted series. Using basket_df.columns
        # here would pair each count with the wrong item, because the counts
        # above have been reordered by frequency.
        "columns": item_freq.index.tolist(),
    }

# -------------------------------
# Recommendation function
# -------------------------------
def recommend(given_items, recommender_data, top_n=5):
    """Suggest up to ``top_n`` items to go with ``given_items``.

    Args:
        given_items: Iterable of items already in the basket.
        recommender_data: Dict with ``"rules"`` (list of rule dicts),
            ``"item_freq"`` and ``"columns"`` (values and labels of the
            item-frequency series).
        top_n: Maximum number of items to return.

    Returns:
        A list of item labels ordered by descending score. An item's score is
        the highest ``lift`` among rules whose antecedents are all contained
        in ``given_items``; items added as filler get a fixed score of 0.01.
    """
    rules = recommender_data["rules"]
    popularity = pd.Series(recommender_data["item_freq"], index=recommender_data["columns"])
    owned = set(given_items)

    # Best lift seen for every consequent of an applicable rule.
    best_lift = {}
    for rule in rules:
        if not set(rule['antecedents']).issubset(owned):
            continue
        for target in rule['consequents']:
            best_lift[target] = max(best_lift.get(target, 0), rule.get('lift', 0))

    # Never recommend something that is already in the basket.
    best_lift = {item: score for item, score in best_lift.items() if item not in owned}

    # Pad with items taken in the order of the frequency index until
    # there are top_n candidates.
    for candidate in popularity.index:
        if len(best_lift) >= top_n:
            break
        if candidate in best_lift or candidate in owned:
            continue
        best_lift[candidate] = 0.01

    ranked = sorted(best_lift, key=best_lift.get, reverse=True)
    return ranked[:top_n]

# -------------------------------
# Save pickle safely
# -------------------------------
def save_pickle(obj, out_path: str):
    """Write ``obj`` to ``out_path`` with ``joblib.dump`` and report the path.

    Args:
        obj: Object to persist.
        out_path: Destination file path.
    """
    joblib.dump(obj, out_path)
    confirmation = f"Saved PKL: {out_path}"
    print(confirmation)

# -------------------------------
# Main script
# -------------------------------
def main(
    input_csv=r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection\archive (1)\Groceries_dataset.csv",
    out_dir=r"C:\Users\NXTWAVE\Downloads\Groceries Price Detection",
    tx_col='Date',
    item_col='itemDescription',
):
    """Run the full pipeline: load, encode, mine rules, recommend, persist.

    All arguments are optional; the defaults reproduce the paths and column
    names that were previously hard-coded, so ``main()`` behaves as before.

    Args:
        input_csv: Path of the CSV file to load.
        out_dir: Directory for the outputs; created if it does not exist.
        tx_col: Column used to group rows into transactions.
        item_col: Column holding the item label.

    Side effects:
        Writes ``processed_data.h5`` (the encoded basket) and
        ``artifacts.pkl`` (encoder, itemsets, rules, recommender data and the
        first 200 input rows) into ``out_dir`` and prints progress messages.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    print("Loading dataset...")
    df = pd.read_csv(input_csv)
    print(f"Dataset loaded. Shape: {df.shape}")

    # NOTE(review): with the default 'Date' key, every purchase made on the
    # same day lands in one basket. If the CSV also has a per-customer id
    # column, grouping on both is probably what is wanted - confirm against
    # the dataset. Very large baskets combined with the low default
    # min_support can make rule mining extremely slow.
    print(f"Transaction column: {tx_col}, Item column: {item_col}")

    transactions, basket_df, mlb = preprocess_transactions(df, tx_col, item_col)
    print(f"Transactions: {len(transactions)}, Unique items: {basket_df.shape[1]}")

    # Save processed basket
    processed_path = out_dir / "processed_data.h5"
    basket_df.to_hdf(processed_path, key='basket', mode='w', format='table')
    print(f"Saved HDF5: {processed_path}")

    print("Mining frequent itemsets and rules...")
    frequent_itemsets, rules = run_fp_growth_and_rules(basket_df)
    print(f"Found {len(frequent_itemsets)} frequent itemsets, {len(rules)} rules")

    recommender = build_recommender(rules, basket_df)

    # Test recommendation
    sample_items = ['tropical fruit', 'whole milk']
    recs = recommend(sample_items, recommender, top_n=5)
    print(f"Given items: {sample_items}")
    print(f"Recommended items: {recs}")

    # Save artifacts
    artifacts = {
        "mlb": mlb,
        "basket_df_index": list(basket_df.index),
        "frequent_itemsets": frequent_itemsets,
        "rules": rules,
        "recommender_data": recommender,
        "raw_dataframe_head": df.head(200),
    }
    pkl_path = out_dir / "artifacts.pkl"
    save_pickle(artifacts, str(pkl_path))

    print("Done.")

# -------------------------------
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


# Captured notebook output:
# Loading dataset...
# Dataset loaded. Shape: (38765, 3)
# Transaction column: Date, Item column: itemDescription
# Transactions: 728, Unique items: 167
# Saved HDF5: C:\Users\NXTWAVE\Downloads\Groceries Price Detection\processed_data.h5
# Mining frequent itemsets and rules...
