In [None]:


import pandas as pd
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import os
from datetime import datetime
import asyncio
import json

import gc
import tensorflow as tf
from tensorflow.keras import layers, models

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from temporal_features import TemporalFeatures
from holiday_features import HolidayFeatures
from wallmart_rcpt_parser import WallmartRecptParser
from winn_dixie_recpt_parser import WinnDixieRecptParser 
from hidden_layer_param_builder import HiddenLayerParamSetBuilder
# WindowsSelectorEventLoopPolicy is only defined by asyncio on Windows; guard the
# call so this setup cell does not raise AttributeError on Linux/macOS kernels.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Notebook-wide pandas display settings: no row/column/width truncation and
# six decimal places for floats.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", lambda x: f"{x:.6f}")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)

# Show the working directory (the data paths used below are relative) and
# whether TensorFlow can see a GPU.
print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

In [None]:


def normalizeAndDropCols(df, cols, sentinel=999):
    """
    Z-score the given columns and replace them with `<col>_norm` columns.

    Parameters
    ----------
    df : DataFrame
        Source frame. It is NOT modified; a new frame is returned.
    cols : iterable of str
        Columns to normalize. Each one is dropped from the result and
        replaced by `<col>_norm` (appended after the remaining columns).
    sentinel : scalar | None
        Value treated as "missing" before computing mean/std (default 999,
        the placeholder used for items with no purchase history).
        Pass None to disable sentinel handling.

    Returns
    -------
    DataFrame
        Copy of `df` without `cols`, plus one `_norm` column per input
        column. Missing values (NaN or sentinel) become 0.0, i.e. the mean.
    """
    cols = list(cols)
    normalized = {}

    for col in cols:
        values = df[col]
        if sentinel is not None:
            # Keep the placeholder out of the mean/std computation.
            values = values.replace(sentinel, np.nan)

        mean = values.mean()
        std = values.std()
        # std is NaN with fewer than two valid values and 0 for a constant
        # column; fall back to 1.0 in both cases so the division is harmless.
        if pd.isna(std) or std == 0:
            std = 1.0

        # Missing entries are mapped to 0.0 (neutral after centering).
        normalized[col + "_norm"] = ((values - mean) / std).fillna(0.0)

    # drop() and assign() both return new frames, so the caller's df is untouched.
    return df.drop(columns=cols).assign(**normalized)


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)



def canonicalize_items(df, patterns, canonical_name):
    """
    Rewrite df['item'] in place: every row whose item text contains any of
    `patterns` (case-insensitive; NaN items never match) is set to
    `canonical_name`. Returns None.

    Note: patterns are handed to `Series.str.contains` with its default
    settings, so they are interpreted as regular expressions.
    """
    matched = None
    for pattern in patterns:
        hits = df["item"].str.contains(pattern, case=False, na=False)
        matched = hits if matched is None else (matched | hits)

    # Nothing to do when no patterns were supplied.
    if matched is not None:
        df.loc[matched, "item"] = canonical_name


In [None]:
# --- WEATHER PREP ---
# Load only the timestamp and the five raw weather measures.
weatherCols = ["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv(
    "datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv",
    usecols=weatherCols,
)

df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime").sort_index()

# Trailing 5-row rolling mean per measure; min_periods=1 keeps the first rows
# from becoming NaN. Keys are the raw columns, values the derived names.
rolling_targets = {
    "temp": "temp_5day_avg",
    "feelslike": "feelsLike_5day_avg",
    "dew": "dew_5day_avg",
    "humidity": "humidity_5day_avg",
    "precip": "precip_5day_avg",
}
for raw_name, avg_name in rolling_targets.items():
    df_weather[avg_name] = df_weather[raw_name].rolling(5, min_periods=1).mean()

# Only the smoothed columns are kept.
df_weather = df_weather.drop(columns=list(rolling_targets))

# Re-index by calendar date (time-of-day stripped) so it can be merged on "date".
df_weather["date"] = pd.to_datetime(df_weather.index.date)
df_weather = df_weather.set_index("date")

#grouped.to_csv("grouped.csv", index=False)
#grouped.info()


In [None]:
import os
import pandas as pd

def ImportWallMart(folder_path: str) -> pd.DataFrame:
    """
    Import all Walmart receipt CSV files from a folder.

    Every file whose name ends in ".csv" (case-insensitive) is read with
    `pd.read_csv` and tagged with a 'source' column holding its filename.
    Files are processed in sorted filename order so the row order of the
    result is deterministic (`os.listdir` returns entries in arbitrary order).

    Returns an empty DataFrame when the folder contains no CSV files.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".csv")
    )

    frames = []
    for name in csv_names:
        frame = pd.read_csv(os.path.join(folder_path, name))
        frame["source"] = name
        frames.append(frame)

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


In [None]:
# Parse every Winn-Dixie receipt text file into one row per purchased item.
recptParser = WinnDixieRecptParser()

rows = []
for p in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    # Only source/date/time/item are kept. The fields that used to be listed
    # here as commented-out keys (manager, cashier, qty, reg, youPay,
    # reported/validation info) are deliberately not carried along.
    rows.extend(
        {
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            "item": r["item"],
        }
        for r in result["items"]
    )

winndixie_df = pd.DataFrame(rows)
winndixie_df["date"] = pd.to_datetime(winndixie_df["date"])
winndixie_df["time"] = winndixie_df["time"].astype(str)

# NOTE(review): presumably drops receipts that were scanned into more than one
# file — confirm against WinnDixieRecptParser.
winndixie_df = WinnDixieRecptParser.remove_duplicate_receipt_files(winndixie_df)

# Chronological order; "time" is only needed for the sort and is dropped afterwards.
winndixie_df = (
    winndixie_df
    .sort_values(by=["date", "time"])
    .reset_index(drop=True)
    .drop(columns=["time"])
)

In [None]:

wallmart_raw = WallmartRecptParser.ImportWallMart("./walmart")

# Keep only the columns shared with the Winn-Dixie frame, under the same names,
# and parse the order date.
wallmart_df = (
    wallmart_raw[["Order Date", "Product Description", "source"]]
    .rename(columns={"Order Date": "date", "Product Description": "item"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)
winndixie_df["date"] = pd.to_datetime(winndixie_df["date"])

# Stack both stores into one purchase log.
combined_df = pd.concat(
    [winndixie_df, wallmart_df[["date", "item", "source"]]],
    ignore_index=True,
)

# Strip a leading hyphen / en dash / em dash (plus surrounding whitespace)
# from item names, then trim whatever whitespace remains.
item_text = combined_df["item"].str.replace(r"^\s*[-–—]\s*", "", regex=True)
combined_df["item"] = item_text.str.strip()



In [None]:
# Canonical item name -> substrings that identify it (matched case-insensitively
# by canonicalize_items). Groups are applied in the order listed.
CANONICAL_ITEM_PATTERNS = {
    "milk": ["know-and-love-milk", "kandl-milk", "prairie-farm-milk", "kleinpeter-milk", "kl-milk", "Milk, Fat Free,", "Fat-Free Milk"],
    "bread": ["bunny-bread", "se-grocers-bread", "seg-sandwich-bread", "seg-white-bread"],
    "cheese": ["dandw-cheese", "kraft-cheese", "se-grocers-cheese", "know-and-love-cheese"],
    "mayo": ["blue-plate-mayo", "blue-plate-mynnase"],
    "chicken": ["chicken-cutlet", "chicken-leg", "chicken-thigh", "chicken-thighs"],
    "yogurt": ["chobani-yogrt-flip", "chobani-yogurt"],
    "coke": ["coca-cola", "coca-cola-cola", "cocacola-soda"],
    "hugbi-pies": ["hugbi-pies", "-hugbi-pies"],
    "minute-maid-drink": ["minute-maid-drink", "minute-maid-drinks", "minute-maid-lmnade"],
}

for canonical_name, name_patterns in CANONICAL_ITEM_PATTERNS.items():
    canonicalize_items(combined_df, name_patterns, canonical_name)



In [None]:
### CREATE ITEM IDs
# IDs are the positions of the item names in alphabetical order.
unique_items = sorted(combined_df["item"].unique())
item_to_id = dict(zip(unique_items, range(len(unique_items))))
id_to_item = dict(enumerate(unique_items))

combined_df["itemId"] = combined_df["item"].map(item_to_id)
combined_df = combined_df.reset_index(drop=True)
combined_df.info()
combined_df.head(100)

In [None]:
# ============================================================
# Build full receipt × item table WITHOUT using qty
# ============================================================

# 1. Mark actual purchases in the raw receipt rows.
#    Guarded so that re-running this cell after the grid has been built does
#    not overwrite the 0/1 flags with 1 everywhere.
if "didBuy" not in combined_df.columns:
    combined_df["didBuy"] = 1
purchases = combined_df[combined_df["didBuy"] == 1]

# 2. Build complete grid: every trip date paired with every item
all_items = purchases["itemId"].unique()
all_dates = purchases["date"].unique()

full = (
    pd.MultiIndex.from_product(
        [all_dates, all_items],
        names=["date", "itemId"]
    ).to_frame(index=False)
)

# 3. Merge raw purchases onto the full grid
#    NOTE(review): an item appearing more than once on the same date yields
#    more than one grid row for that (date, itemId) pair — confirm this is intended.
df_full = full.merge(
    purchases[["date", "itemId", "item", "source", "didBuy"]],
    on=["date", "itemId"],
    how="left"
)

# 4. Fill missing purchases with didBuy=0
df_full["didBuy"] = df_full["didBuy"].fillna(0).astype(int)

# 5. The left merge leaves `item` empty on every non-purchase row; restore the
#    name from itemId so later itemId -> item lookups never resolve to NaN.
item_names = (
    purchases[["itemId", "item"]]
    .dropna(subset=["item"])
    .drop_duplicates("itemId")
    .set_index("itemId")["item"]
)
df_full["item"] = df_full["item"].fillna(df_full["itemId"].map(item_names))

# 6. NOW REPLACE combined_df with df_full
combined_df = df_full.copy()


In [None]:
# One row per distinct trip date, in chronological order.
grouped = (
    combined_df[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)

# Trip-spacing features.
grouped["daysSinceLastTrip"] = TemporalFeatures.DaysSinceLastTrip(grouped)
grouped["avgDaysBetweenTrips"] = TemporalFeatures.AvgDaysBetweenTrips(grouped)

# Holiday / school-calendar features, each computed per date.
calendar_features = {
    "daysUntilNextHoliday": HolidayFeatures.daysUntilNextHoliday,
    "daysSinceLastHoliday": HolidayFeatures.daysSinceLastHoliday,
    "holidayProximityIndex": HolidayFeatures.holidayProximityIndex,
    "daysUntilSchoolStart": HolidayFeatures.daysUntilSchoolStart,
    "daysUntilSchoolEnd": HolidayFeatures.daysUntilSchoolEnd,
    "schoolSeasonIndex": HolidayFeatures.schoolSeasonIndex,
}
for feature_name, feature_fn in calendar_features.items():
    grouped[feature_name] = grouped["date"].apply(feature_fn)

# Date-part columns come from the project helper.
grouped = TemporalFeatures.CreateDateFeatures(grouped)

# Attach the rolling weather averages (df_weather is indexed by "date").
grouped = grouped.merge(df_weather, on="date", how="left")

# Broadcast the per-trip features onto every (date, item) row.
combined_df = combined_df.merge(grouped, on="date", how="left")
combined_df.info()
combined_df.head(10)

In [None]:
# ================================================
# FREQUENCY WINDOWS (7, 15, 30, 90, 365)
# True rolling-window implementation
# ================================================
def fill_freq(group, windows=None):
    """
    Fill the `freq_<w>` columns of one item's rows with rolling purchase counts.

    For each row (processed in ascending date order), `freq_<w>` becomes the
    number of rows seen so far with didBuy == 1 whose date lies within the
    last `w` days of the row's date (both ends inclusive; the row itself
    counts when it is a purchase).

    Parameters
    ----------
    group : DataFrame
        Rows for a single item. Needs "date", "didBuy" and an existing
        `freq_<w>` column for every window.
    windows : iterable of int | None
        Window lengths in days. Defaults to the notebook-level
        `freq_windows` list, so `groupby(...).apply(fill_freq)` keeps working.

    Returns
    -------
    DataFrame
        A date-sorted copy of `group` with a fresh 0..n-1 index.
    """
    if windows is None:
        windows = freq_windows
    windows = list(windows)

    group = group.copy().sort_values("date").reset_index(drop=True)
    if not windows:
        return group

    longest = max(windows)

    # Positional column lookups are resolved once, outside the row loop.
    col_date = group.columns.get_loc("date")
    col_buy = group.columns.get_loc("didBuy")
    col_freq = {w: group.columns.get_loc(f"freq_{w}") for w in windows}

    history = []  # purchase dates still inside the longest window

    for i in range(len(group)):
        cur_date = group.iat[i, col_date]

        # record purchase
        if group.iat[i, col_buy] == 1:
            history.append(cur_date)

        # Prune once per row using the longest window; shorter windows are
        # counted from the pruned list.
        oldest_kept = cur_date - pd.Timedelta(days=longest)
        history = [d for d in history if d >= oldest_kept]

        for w in windows:
            cutoff = cur_date - pd.Timedelta(days=w)
            group.iat[i, col_freq[w]] = sum(1 for d in history if d >= cutoff)

    return group
#######################################################
# Window lengths (days) used by fill_freq; max_w is the longest of them.
freq_windows = [7, 15, 30, 90, 365]
max_w = max(freq_windows)

# Create the freq_<w> columns as NaN placeholders, then fill them per item.
combined_df = combined_df.assign(**{f"freq_{w}": np.nan for w in freq_windows})

per_item = combined_df.groupby("itemId", group_keys=False)
combined_df = per_item.apply(fill_freq)


In [None]:
# ============================================================
# INCREASING DAILY daysSinceLastPurchase (resets on purchase)
# ============================================================
def fill_item(group):
    """
    Forward-fill `daysSinceLastPurchase` for one item's rows.

    Rows that already hold a value are left alone. Every other row from the
    second one onward becomes the previous row's value plus the row's own
    `daysSinceLastTrip`. The first row is never filled, so rows before the
    first known value stay NaN. Works on a copy; rows are taken in the order
    given.
    """
    filled = group.copy()
    purchase_pos = filled.columns.get_loc("daysSinceLastPurchase")
    trip_pos = filled.columns.get_loc("daysSinceLastTrip")

    for row in range(1, len(filled)):
        if not pd.isna(filled.iat[row, purchase_pos]):
            continue
        carried = filled.iat[row - 1, purchase_pos]
        filled.iat[row, purchase_pos] = carried + filled.iat[row, trip_pos]

    return filled
##########################################################################################

# fill_item walks rows in order, so each item's rows must be chronological.
combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

# 0 on purchase days, NaN everywhere else (to be filled per item below).
combined_df["daysSinceLastPurchase"] = np.where(combined_df["didBuy"] == 1, 0.0, np.nan)

per_item = combined_df.groupby("itemId", group_keys=False)
combined_df = per_item.apply(fill_item)

# Whatever is still NaN has no earlier purchase to count from: mark it with 999.
combined_df["daysSinceLastPurchase"] = combined_df["daysSinceLastPurchase"].fillna(999)

In [None]:
# ============================================================
# ITEM-LEVEL HABIT FEATURES (TF-IDF ANALOG)
# ============================================================
def build_habit_features(df, tau_days=120):
    """
    Summarise each item's purchase habit over the whole timeline.

    Returns one row per itemId with:
      - habitFrequency: purchase rows / number of distinct dates in df
      - habitSpan:      days between first and last purchase / timeline length
      - habitDecay:     exp(-days since last purchase / tau_days), measured
                        from the latest date in df
    Items without any didBuy == 1 row get 0.0 for all three.
    The input frame is not modified.
    """
    frame = df.copy()
    frame["date"] = pd.to_datetime(frame["date"])

    first_day = frame["date"].min()
    last_day = frame["date"].max()
    trip_count = frame["date"].nunique()
    span_days = (last_day - first_day).days or 1  # avoid dividing by 0 on a one-day timeline

    records = []
    for item_key, item_rows in frame.groupby("itemId"):
        purchase_dates = item_rows.loc[item_rows["didBuy"] == 1, "date"]

        record = {
            "itemId": item_key,
            "habitFrequency": 0.0,
            "habitSpan": 0.0,
            "habitDecay": 0.0,
        }
        if len(purchase_dates) > 0:
            newest = purchase_dates.max()
            idle_days = (last_day - newest).days
            record["habitFrequency"] = len(purchase_dates) / trip_count
            record["habitSpan"] = (newest - purchase_dates.min()).days / span_days
            record["habitDecay"] = np.exp(-idle_days / tau_days)

        records.append(record)

    return pd.DataFrame(records)
###############################################################################


def compute_due_score(df,itemId=None,use_sigmoid=True,normalize=False, weights=None):
    """
    Compute due_score from RAW (non-normalized) features.

    Required columns:
      - itemId
      - daysSinceLastPurchase
      - freq_30
      - freq_90

    Parameters
    ----------
    df : DataFrame
        Not modified; a copy is returned.

    itemId : int | None
        If provided, compute only for this itemId.
        If None, compute for all items.

    use_sigmoid : bool
        Apply sigmoid → (0,1)

    normalize : bool
        Z-normalize instead (ignored if use_sigmoid=True)

    weights : dict | None
        Optional override for feature weights. May be partial: any feature
        missing from the dict keeps its default weight.

    Returns
    -------
    DataFrame
        Copy of the (optionally filtered) input with two extra columns:
        `due_score_raw` (weighted sum) and `due_score` (final value).
    """
    default_weights = {
        "daysSinceLastPurchase": 1.5,
        "freq_30": 1.0,
        "freq_90": 0.5
    }
    # Merge so that a partial override does not raise KeyError below.
    if weights is None:
        weights = default_weights
    else:
        weights = {**default_weights, **weights}

    # --------------------------------------------------------
    # Optional itemId filter
    # --------------------------------------------------------
    if itemId is not None:
        df = df[df["itemId"] == itemId].copy()
    else:
        df = df.copy()

    # --------------------------------------------------------
    # RAW linear score (pre-normalization)
    # --------------------------------------------------------
    df["due_score_raw"] = (
        weights["daysSinceLastPurchase"] * df["daysSinceLastPurchase"]
      + weights["freq_30"]              * df["freq_30"]
      + weights["freq_90"]              * df["freq_90"]
    )

    # --------------------------------------------------------
    # Final due_score
    # --------------------------------------------------------
    if use_sigmoid:
        df["due_score"] = 1 / (1 + np.exp(-df["due_score_raw"]))

    elif normalize:
        mean = df["due_score_raw"].mean()
        std = df["due_score_raw"].std()
        # std is NaN for a single row and 0 for constant scores; use 1.0 in
        # both cases so the result is 0 rather than NaN/inf.
        if pd.isna(std) or std == 0:
            std = 1.0
        df["due_score"] = (df["due_score_raw"] - mean) / std

    else:
        df["due_score"] = df["due_score_raw"]

    return df
###############################################################################


# ============================================================
# MERGE HABIT FEATURES
# ============================================================
# One habit row per item, broadcast onto every (date, item) row.
habit_df = build_habit_features(combined_df)
combined_df = combined_df.merge(habit_df, on="itemId", how="left")

# Any itemId without a habit row gets neutral zeros.
habit_feature_names = ["habitFrequency", "habitSpan", "habitDecay"]
combined_df[habit_feature_names] = combined_df[habit_feature_names].fillna(0.0)



In [None]:
## trim fat
# find rows with freq_365 of 1 or less

In [None]:
# ============================================================
# NORMALIZE TO ENCODED_DF
# ============================================================

# Column groups are selected by naming convention.
all_columns = list(combined_df.columns)
freq_cols = [name for name in all_columns if name.startswith("freq_")]
weather_cols = [name for name in all_columns if name.endswith("_5day_avg")]
holiday_cols = [name for name in all_columns if "holiday" in name.lower()]
school_cols = [name for name in all_columns if "school" in name.lower()]

daysSince_purchase_cols = ["daysSinceLastPurchase"]
daysSince_trip_cols = ["daysSinceLastTrip"]

habit_cols = ["habitFrequency", "habitSpan", "habitDecay"]

# Normalize group by group on a copy so combined_df keeps the raw values.
encoded_df = combined_df.copy()
for column_group in (
    freq_cols,
    weather_cols,
    holiday_cols,
    school_cols,
    daysSince_purchase_cols,
    daysSince_trip_cols,
    habit_cols,
):
    encoded_df = normalizeAndDropCols(encoded_df, column_group)

encoded_df.info()
encoded_df.head(100)


In [None]:
# ---------- CYCLICAL FEATURES ----------
# Encode each periodic calendar field as a (sin, cos) pair over its period.
for field, period in (("dow", 7.0), ("month", 12.0), ("doy", 365.0)):
    angle = 2 * np.pi * encoded_df[field] / period
    encoded_df[f"{field}_sin"] = np.sin(angle)
    encoded_df[f"{field}_cos"] = np.cos(angle)

# The raw periodic fields are no longer needed once encoded.
encoded_df = encoded_df.drop(columns=["dow", "month", "doy"], errors="ignore")

# ---------- NON-CYCLIC TIME FEATURES ----------
nonCycCols = ["year", "day", "quarter"]
encoded_df = normalizeAndDropCols(encoded_df, nonCycCols)

# ---------- DROP NON-MODEL COLS ----------
cols_to_drop = ["source", "item", "date"]
encoded_df = encoded_df.drop(columns=cols_to_drop, errors="ignore")

encoded_df.info()

In [None]:
# Training target: sigmoid of an equally weighted sum of the normalized
# days-since-last-purchase and 30-day frequency features.
# (A 0.5 * freq_90_norm term was tried here and is currently disabled.)
due_logit = (
    encoded_df["daysSinceLastPurchase_norm"]
    + encoded_df["freq_30_norm"]
)
encoded_df["due_score"] = 1 / (1 + np.exp(-due_logit))

encoded_df.info()
encoded_df.head()

# TRAIN / BUILD MODEL

In [None]:
# Display the installed TensorFlow version as this cell's output.
tf.__version__

In [None]:
def export_df(dataframes, dir):
    """
    Write every DataFrame in `dataframes` (name -> frame) to
    `<dir>/<name>.csv`, including the index column.
    """
    for name in dataframes:
        target = os.path.join(dir, "{}.csv".format(name))
        dataframes[name].to_csv(target, index=True)
##################################################################################

def save_experiment( model, history,  dataframes,  build_params, train_params, numeric_cols,item_id_to_idx,base_dir="experiments"):
    """
    Persist one experiment (model, history, inputs and parameters) to disk.

    The experiment directory is `<base_dir>/<name>`, where the name is built
    from `embedding_dim`, `hiddenLayers` and `epochs` when present
    (e.g. "emb32__hl64-64__ep40"), or "exp_unlabeled" otherwise. An existing
    directory with the same name is reused and its files are overwritten.

    Parameters
    ----------
    model : object with `save(path)` and `save_weights(path)`
    history : object exposing a JSON-serialisable `.history` dict
    dataframes : dict[str, DataFrame]
        Exported as CSV via `export_df`.
    build_params, train_params : dict
        Written as JSON; also used to build the directory name.
    numeric_cols : list
        Names of the numeric model features, written as JSON.
    item_id_to_idx : dict
        itemId -> embedding index; keys and values are coerced to int.
    base_dir : str
        Parent directory for all experiments.
    """
    def write_json(file_name, payload):
        # `with` guarantees the handle is closed even if json.dump raises.
        with open(os.path.join(exp_dir, file_name), "w") as handle:
            json.dump(payload, handle, indent=2)

    name_parts = []

    if "embedding_dim" in build_params:
        name_parts.append(f"emb{build_params['embedding_dim']}")

    if "hiddenLayers" in build_params:
        hl = "-".join(str(x) for x in build_params["hiddenLayers"])
        name_parts.append(f"hl{hl}")

    if "epochs" in train_params:
        name_parts.append(f"ep{train_params['epochs']}")

    exp_name = "__".join(name_parts) if name_parts else "exp_unlabeled"
    exp_dir = os.path.join(base_dir, exp_name)
    os.makedirs(exp_dir, exist_ok=True)

    export_df(dataframes, exp_dir)

    # ------------------------------------------------------------
    # Save model
    # ------------------------------------------------------------
    # NOTE(review): Keras 3 expects a ".keras"/".h5" suffix for model.save and a
    # ".weights.h5" suffix for save_weights — confirm against the installed
    # TensorFlow/Keras version before upgrading.
    model.save(os.path.join(exp_dir, "model"))
    model.save_weights(os.path.join(exp_dir, "weights.h5"))

    # ------------------------------------------------------------
    # Save history, feature names, item index mapping and parameters
    # ------------------------------------------------------------
    write_json("history.json", history.history)
    write_json("numeric_features.json", numeric_cols)
    write_json(
        "item_id_to_idx.json",
        # JSON keys must be strings; int() also strips numpy integer types.
        {str(int(k)): int(v) for k, v in item_id_to_idx.items()},
    )
    write_json("build_params.json", build_params)
    write_json("train_params.json", train_params)

    print("Saved experiment →", exp_dir)
##########################################################################################

def build_and_compile_model(num_numeric_features, num_items, params):
    """
    Build and compile the two-input Keras model.

    Inputs are a float vector of `num_numeric_features` values and a scalar
    int32 item index. The index goes through an Embedding of size
    `params["embedding_dim"]`, is concatenated with the numeric vector and
    fed through one ReLU Dense layer per entry in `params["hiddenLayers"]`,
    ending in a single-unit output layer.

    Optional keys in `params` (with defaults): output_activation ("sigmoid"),
    optimizer ("adam"), learning_rate (0.001, applied only when the optimizer
    is "adam"), loss ("mse"), metrics (["mae"]).
    """
    numeric_input = layers.Input(shape=(num_numeric_features,))
    item_input = layers.Input(shape=(), dtype="int32")

    item_vector = layers.Embedding(
        input_dim=num_items,
        output_dim=params["embedding_dim"],
    )(item_input)

    hidden = layers.Concatenate()([numeric_input, layers.Flatten()(item_vector)])
    for width in params["hiddenLayers"]:
        hidden = layers.Dense(width, activation="relu")(hidden)

    output_activation = params.get("output_activation", "sigmoid")
    prediction = layers.Dense(1, activation=output_activation)(hidden)

    model = models.Model([numeric_input, item_input], prediction)

    # Any optimizer other than "adam" is handed to compile() unchanged.
    optimizer = params.get("optimizer", "adam")
    learning_rate = params.get("learning_rate", 0.001)
    if optimizer == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss=params.get("loss", "mse"),
        metrics=params.get("metrics", ["mae"]),
    )
    return model
##########################################################################################

def train_model(model, encoded_df, params):
    """
    Fit `model` on the encoded feature table and return the Keras History.

    Features are every `*_norm` column plus the `itemIdx` column; the target
    is `due_score`. 20% of the rows are split off with a fixed random_state
    and are not used for fitting (nor evaluated here).

    `params` must contain "epochs"; "batch_size" (default 32) and
    "validation_split" (default 0.1) are honoured when present.
    """
    numeric_cols = [
        c for c in encoded_df.columns
        if c.endswith("_norm") and c != "due_score"
    ]

    Xn = encoded_df[numeric_cols].to_numpy(np.float32)
    Xi = encoded_df["itemIdx"].to_numpy(np.int32)
    targetVar = encoded_df["due_score"].to_numpy(np.float32)

    # The *_te parts are the held-out 20%; only the training parts are used below.
    Xn_tr, Xn_te, Xi_tr, Xi_te, targetVar_tr, y_te = train_test_split(
        Xn, Xi, targetVar, test_size=0.2, random_state=42
    )

    history = model.fit(
        [Xn_tr, Xi_tr],
        targetVar_tr,
        # Previously hard-coded; the defaults reproduce the old values.
        validation_split=params.get("validation_split", 0.1),
        epochs=params["epochs"],
        batch_size=params.get("batch_size", 32),
        verbose=1
    )

    return history
##########################################################################################

def run_predictions( model, encoded_df, combined_df, feature_stats, predict_date=None):
    """
    Build one prediction row per item and score it with `model`.

    Each row starts from the item's last row in `encoded_df` (assumed to be
    its most recent state — encoded_df is expected to be ordered by date
    within each item), then the date-sensitive features are recomputed for
    `predict_date` and re-normalized with `feature_stats`.

    Parameters
    ----------
    model : object with `predict([Xn, Xi], verbose=0)`
    encoded_df : DataFrame
        Normalized features; needs `itemId`, `itemIdx` and the `*_norm` columns.
    combined_df : DataFrame
        Raw timeline; needs `itemId`, `item`, `date` (and `didBuy`, when
        present, to locate actual purchases).
    feature_stats : dict
        raw feature name -> {"mean": ..., "std": ...} on the RAW scale.
    predict_date : date-like | None
        Defaults to today (midnight).

    Returns
    -------
    DataFrame with itemId, item, due_intensity, sorted by descending score.
    """

    if predict_date is None:
        predict_date = pd.Timestamp.today().normalize()
    else:
        predict_date = pd.to_datetime(predict_date).normalize()

    # --------------------------------------------------------
    # Discover numeric features (single source: encoded_df)
    # --------------------------------------------------------
    numeric_cols = [
        c for c in encoded_df.columns
        if c.endswith("_norm") and c != "due_score"
    ]

    # --------------------------------------------------------
    # Lookups from combined_df (single source of truth)
    # --------------------------------------------------------
    # combined_df holds a row for every (date, item) pair, so the last
    # *purchase* date must be taken from didBuy == 1 rows only; otherwise every
    # item would report the most recent trip date.
    purchases = combined_df
    if "didBuy" in combined_df.columns:
        purchases = combined_df[combined_df["didBuy"] == 1]
    last_date_by_item = purchases.groupby("itemId")["date"].max()

    # Skip rows with a missing item name so the lookup never resolves to NaN.
    item_lookup = (
        combined_df[["itemId", "item"]]
        .dropna(subset=["item"])
        .drop_duplicates("itemId")
        .set_index("itemId")["item"]
        .to_dict()
    )

    rows = []

    for itemId, hist in encoded_df.groupby("itemId"):
        last = hist.iloc[-1]

        row = {
            "itemId": itemId,
            "item": item_lookup.get(itemId, "UNKNOWN"),
            "itemIdx": int(last["itemIdx"])
        }

        # ----------------------------------------------------
        # Copy model-stable numeric features (already normalized)
        # ----------------------------------------------------
        for col in numeric_cols:
            row[col] = last[col]

        # ----------------------------------------------------
        # Recompute DATE-SENSITIVE features
        # (same HolidayFeatures helpers used when building the training table)
        # ----------------------------------------------------
        raw_updates = {
            "daysUntilNextHoliday": HolidayFeatures.daysUntilNextHoliday(predict_date),
            "daysSinceLastHoliday": HolidayFeatures.daysSinceLastHoliday(predict_date),
            "holidayProximityIndex": HolidayFeatures.holidayProximityIndex(predict_date),
            "daysUntilSchoolStart": HolidayFeatures.daysUntilSchoolStart(predict_date),
            "daysUntilSchoolEnd": HolidayFeatures.daysUntilSchoolEnd(predict_date),
            "schoolSeasonIndex": HolidayFeatures.schoolSeasonIndex(predict_date),
            "year": predict_date.year,
            "day": predict_date.day,
            "quarter": predict_date.quarter
        }

        # Items without a recorded purchase keep their last encoded value.
        last_purchase = last_date_by_item.get(itemId)
        if last_purchase is not None and not pd.isna(last_purchase):
            last_purchase = pd.to_datetime(last_purchase).normalize()
            raw_updates["daysSinceLastPurchase"] = (predict_date - last_purchase).days

        # ----------------------------------------------------
        # Normalize recomputed features
        # ----------------------------------------------------
        for raw, val in raw_updates.items():
            norm_col = raw + "_norm"
            if norm_col in numeric_cols and raw in feature_stats:
                stats = feature_stats[raw]
                scale = stats["std"] if stats["std"] else 1.0  # never divide by 0
                row[norm_col] = (val - stats["mean"]) / scale

        rows.append(row)

    # Nothing to score: return an empty frame with the expected columns.
    if not rows:
        return pd.DataFrame(columns=["itemId", "item", "due_intensity"])

    pred_df = pd.DataFrame(rows)

    Xn = pred_df[numeric_cols].to_numpy(np.float32)
    Xi = pred_df["itemIdx"].to_numpy(np.int32)

    scores = model.predict([Xn, Xi], verbose=0).ravel()

    pred_df["due_intensity"] = scores

    return (
        pred_df[["itemId", "item", "due_intensity"]]
        .sort_values("due_intensity", ascending=False)
        .reset_index(drop=True)
    )
###############################################################################

def BuildParamSets( baseline_params, property_name, start, step, stop):
    """
    Create independent parameter dictionaries by sweeping one property.

    Produces one deep copy of `baseline_params` per value in
    start, start+step, ... while the value is <= stop, with
    `property_name` set to that value. `baseline_params` itself is untouched.

    Returns an empty list when `start > stop`.

    Raises
    ------
    ValueError
        If `step` is not positive while `start <= stop` (the sweep would
        never terminate).
    """
    import copy

    if step <= 0 and start <= stop:
        raise ValueError(
            f"step must be > 0 to sweep from {start} to {stop}, got {step}"
        )

    results = []

    value = start
    while value <= stop:
        params_copy = copy.deepcopy(baseline_params)
        params_copy[property_name] = value
        results.append(params_copy)
        value += step

    return results
###############################################################################

def runExp(combined_df, encoded_df, buildParams, trainParams, baseDir):
    """
    Run one experiment end to end: index items, build and train the model,
    score every item and save everything under `baseDir`.

    Parameters
    ----------
    combined_df : DataFrame
        Raw (un-normalized) feature table; source of the raw-scale statistics
        and of item names / purchase dates for prediction.
    encoded_df : DataFrame
        Normalized feature table. NOTE: an `itemIdx` column is added to it
        in place.
    buildParams : dict
        Architecture settings (embedding_dim, hiddenLayers, ...).
    trainParams : dict
        Training settings (epochs, batch_size, ...). Compile settings found
        here (optimizer, learning_rate, loss, metrics) are also forwarded to
        the model builder; keys in `buildParams` win on conflict.
    baseDir : str
        Parent directory for the saved experiment.
    """
    # Dense 0..n-1 index per itemId for the embedding layer.
    item_ids = sorted(encoded_df["itemId"].unique())
    item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
    encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
    num_items = len(item_ids)
    #
    numeric_cols = [
        c for c in encoded_df.columns
        if c.endswith("_norm") and c != "due_score"
    ]
    num_numeric_features = len(numeric_cols)
    #
    # Mean/std used to re-normalize features recomputed at prediction time.
    # They must describe the RAW columns: the *_norm columns are already
    # centred and scaled, so their statistics would leave raw values (e.g. a
    # calendar year) unscaled. The 999 placeholder is excluded, mirroring how
    # the training columns were normalized.
    feature_stats = {}
    RECOMPUTED = [
        "daysSinceLastPurchase",
        "daysUntilNextHoliday",
        "daysSinceLastHoliday",
        "holidayProximityIndex",
        "daysUntilSchoolStart",
        "daysUntilSchoolEnd",
        "schoolSeasonIndex",
        "year", "day", "quarter"
    ]

    for raw in RECOMPUTED:
        if raw + "_norm" not in encoded_df.columns or raw not in combined_df.columns:
            continue
        raw_values = combined_df[raw].replace(999, np.nan)
        std = raw_values.std()
        feature_stats[raw] = {
            "mean": raw_values.mean(),
            "std": 1.0 if (pd.isna(std) or std == 0) else std
        }

    #
    # Forward compile settings that live in trainParams; without this the
    # builder silently falls back to its own defaults.
    compile_params = {**trainParams, **buildParams}
    model = build_and_compile_model(num_numeric_features, num_items, compile_params)
    #
    history = train_model(model, encoded_df, trainParams)
    #
    predictions = run_predictions(model, encoded_df, combined_df, feature_stats)
    #
    dataframes = {
        "predictions": predictions,
        "encoded_features": encoded_df,
        "combined_df": combined_df
    }
    save_experiment(model, history, dataframes, buildParams, trainParams, numeric_cols, item_id_to_idx, base_dir= baseDir)
################################################################################################################################

import multiprocessing as mp
def run_param_sets_multiprocess(buildParamsSets, trainParams, max_parallel, feature_stats, combined_df, encoded_df, baseDir):
    """Run ``runExp`` once per build-parameter set, each in its own process.

    Children are launched in batches: as soon as ``max_parallel`` processes
    have been started, all of them are joined before the next one is started.

    Parameters
    ----------
    buildParamsSets : iterable
        One entry per experiment; each is passed to ``runExp`` as ``buildParams``.
    trainParams
        Passed unchanged to every ``runExp`` call.
    max_parallel : int
        Number of started children that triggers a join of the whole batch.
    feature_stats
        Unused. Kept only so existing call sites keep working; ``runExp``
        computes its own feature stats and does not accept this argument.
    combined_df, encoded_df, baseDir
        Forwarded to ``runExp``.

    Returns
    -------
    None

    NOTE(review): this notebook installs a Windows event-loop policy, so
    children are presumably created with the "spawn" start method, which needs
    ``runExp`` to be importable in the child. Functions defined in a notebook
    cell generally are not -- confirm, or move ``runExp`` into a module.
    """
    running = []  # (process, buildParams) pairs of the current batch

    def _wait_for(batch):
        # Join every child and surface failures; a crashed child would
        # otherwise go unnoticed because join() never raises for it.
        for proc, params in batch:
            proc.join()
            if proc.exitcode != 0:
                print(f"Experiment failed (exit code {proc.exitcode}): {params}")

    for buildParams in buildParamsSets:
        p = mp.Process(
            target=runExp,
            # Must match runExp(combined_df, encoded_df, buildParams,
            # trainParams, baseDir) exactly -- feature_stats is not a parameter.
            args=(combined_df, encoded_df, buildParams, trainParams, baseDir),
        )
        p.start()
        running.append((p, buildParams))

        # limit concurrency
        if len(running) >= max_parallel:
            _wait_for(running)
            running = []

    # wait for remaining
    _wait_for(running)
################################################################################################################################

In [None]:
# feature_stats = {}
# RECOMPUTED = [
#     "daysSinceLastPurchase",
#     "daysUntilNextHoliday",
#     "daysSinceLastHoliday",
#     "holidayProximityIndex",
#     "daysUntilSchoolStart",
#     "daysUntilSchoolEnd",
#     "schoolSeasonIndex",
#     "year", "day", "quarter"
# ]

# for raw in RECOMPUTED:
#     col = raw + "_norm"
#     if col in encoded_df.columns:
#         std = encoded_df[col].std()
#         feature_stats[raw] = {
#             "mean": encoded_df[col].mean(),
#             "std": std if std != 0 else 1.0
#         }


In [None]:
# Training settings shared by every experiment launched from this cell.
trainParams = dict(
    loss="mse",
    optimizer="adam",
    learning_rate=0.0001,
    metrics=["mae"],
    epochs=40,
    batch_size=32,
    validation_split=0.1,
)

# Baseline build parameters handed to the hidden-layer sweep builder.
buildParamsHiddenLayerBase = dict(
    embedding_dim=32,
    hiddenLayers=[1],
    output_activation="sigmoid",
)

# Each tuple holds the three positional arguments that follow the baseline in
# BuildHiddenLayerDepthSets (their meaning is defined by that builder).
# The sweeps run one after the other, in this order.
layerDepthSweepArgs = [
    (64, 1, 20),
    (20, 2, 20),
]

for sweepArgs in layerDepthSweepArgs:
    paramSets = HiddenLayerParamSetBuilder.BuildHiddenLayerDepthSets(buildParamsHiddenLayerBase, *sweepArgs)
    for eachBuildParams in paramSets:
        print(f"Loop: {eachBuildParams['hiddenLayers']}")
        runExp(combined_df, encoded_df, eachBuildParams, trainParams, "exp/keras/layer_depth")

In [None]:
# buildParams_embeddingsTest = {
#     "embedding_dim": 1,
#     "hiddenLayers": [512],
#     "output_activation": "sigmoid"
# }

# # buildParams_embeddingsTest_relu = {
# #     "embedding_dim": 1,
# #     "hiddenLayers": [1024],
# #     "output_activation": "relu"
# # }


# trainParams = {
#     "loss": "mse",
#     "optimizer": "adam",
#     "learning_rate": 0.0001,
#     "metrics": ["mae"],
#     "epochs": 40,
#     "batch_size": 32,
#     "validation_split": 0.1
# }

# # build sets
# paramSets = BuildParamSets(buildParams_embeddingsTest, "embedding_dim", 33, 2, 64)
# # run
# run_param_sets_multiprocess(paramSets, trainParams, 4, feature_stats, combined_df,encoded_df, "exp_mp")
# #paramSets_embeddingeTest_relu = BuildParamSets(buildParams_embeddingsTest_relu, "embedding_dim", 1, 2, 32)

    

In [None]:

# tf.keras.backend.clear_session()


# # ------------------------------------------------------------
# # ENSURE itemIdx
# # ------------------------------------------------------------
# item_ids = sorted(encoded_df["itemId"].unique())
# item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
# encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
# NUM_ITEMS = len(item_ids)

# # ------------------------------------------------------------
# # FEATURES / TARGET
# # ------------------------------------------------------------
# numeric_cols = [
#     c for c in encoded_df.columns
#     if c.endswith("_norm") and c != "due_score"
# ]

# Xn = encoded_df[numeric_cols].to_numpy(np.float32)
# Xi = encoded_df["itemIdx"].to_numpy(np.int32)
# y  = encoded_df["due_score"].to_numpy(np.float32)

# # ------------------------------------------------------------
# # SPLIT
# # ------------------------------------------------------------
# Xn_tr, Xn_te, Xi_tr, Xi_te, y_tr, y_te = train_test_split(
#     Xn, Xi, y, test_size=0.2, random_state=42
# )

# # ------------------------------------------------------------
# # MODEL
# # ------------------------------------------------------------
# num_in = layers.Input(shape=(Xn_tr.shape[1],))
# itm_in = layers.Input(shape=(), dtype="int32")

# emb = layers.Embedding(NUM_ITEMS, 64)(itm_in)
# emb = layers.Flatten()(emb)

# x = layers.Concatenate()([num_in, emb])
# x = layers.Dense(4096, activation="relu")(x)
# #x = layers.Dense(2048, activation="relu")(x)
# out = layers.Dense(1, activation="sigmoid")(x)

# model = models.Model([num_in, itm_in], out)
# model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss="mse", metrics=["mae"])

# history = model.fit(
#     [Xn_tr, Xi_tr],
#     y_tr,
#     validation_split=0.1,
#     epochs=10,
#     batch_size=32,
#     verbose=1
# )

# # ------------------------------------------------------------
# # FEATURE STATS (ONLY recomputed features)
# # (NOTE: stats are for *_norm columns since inference writes *_norm)
# # ------------------------------------------------------------
# feature_stats = {}
# RECOMPUTED = [
#     "daysSinceLastPurchase",
#     "daysUntilNextHoliday",
#     "daysSinceLastHoliday",
#     "holidayProximityIndex",
#     "daysUntilSchoolStart",
#     "daysUntilSchoolEnd",
#     "schoolSeasonIndex",
#     "year", "day", "quarter",
#     "daysUntilBirthday_steve", "daysSinceBirthday_steve",
#     "daysUntilBirthday_maggie", "daysSinceBirthday_maggie",
#     "daysUntilBirthday_mil", "daysSinceBirthday_mil",
#     "daysUntilBirthday_angie", "daysSinceBirthday_angie",
# ]

# for raw in RECOMPUTED:
#     col = raw + "_norm"
#     if col in encoded_df.columns:
#         std = encoded_df[col].std()
#         feature_stats[raw] = {
#             "mean": encoded_df[col].mean(),
#             "std": std if std != 0 else 1.0
#         }

# # ------------------------------------------------------------
# # BIRTHDAYS
# # ------------------------------------------------------------
# BIRTHDAYS = { "steve":  "03-05-1980", "maggie": "03-03-2016","mil": "01-27-1962", "angie":  "08-11-1981"}
# birthdays = {k: pd.to_datetime(v) for k, v in BIRTHDAYS.items()}

# # ------------------------------------------------------------
# # PREDICT (UPDATED CALL)
# # ------------------------------------------------------------
# predictions = run_predictions(model=model, encoded_df=encoded_df, combined_df=combined_df, feature_stats=feature_stats, birthdays=birthdays, predict_date=None)

# # ------------------------------------------------------------
# # SAVE
# # ------------------------------------------------------------
# save_experiment( model=model, history=history, predictions=predictions, params={}, numeric_cols=numeric_cols, item_id_to_idx=item_id_to_idx)

# predictions.head(50)
