In [None]:

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import gc
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import os
from pandas.tseries.holiday import USFederalHolidayCalendar

from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from datetime import datetime

from winn_dixie_recpt_parser import WinnDixieRecptParser 

# Notebook display configuration: never truncate rows, columns, cell text
# or line width when rendering DataFrames.
for _display_option in (
    "display.max_rows",
    "display.width",
    "display.max_colwidth",
    "display.max_columns",
):
    pd.set_option(_display_option, None)

# Render every float with six decimal places.
pd.set_option("display.float_format", "{:.6f}".format)

# Record the working directory and visible GPUs for this kernel session.
print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

In [None]:
def show_grouped(grouped, rows=10):
    """
    Print a readable dump of the first `rows` rows of `grouped`.

    For each row the date, time and item values are printed, followed by
    every column whose name starts with "daysSinceLastPurchase_".
    """
    purchase_gap_cols = [
        name for name in grouped.columns
        if name.startswith("daysSinceLastPurchase_")
    ]

    n_shown = min(rows, len(grouped))
    for pos in range(n_shown):
        record = grouped.iloc[pos]

        print("Row:", pos)
        print("Date:", record["date"])
        print("Time:", record["time"])
        print("Items:", record["item"])
        print("------ daysSinceLastPurchase ------")

        for name in purchase_gap_cols:
            print(f"{name}: {record[name]}")

        print("-----------------------------------")


def show_encoded(encoded_df, rows=10):
    """
    Print a readable dump of the first `rows` rows of a one-hot encoded frame.

    Columns are split into three groups by name:
      * "daysSinceLastPurchase_*"  -> purchase-gap features
      * "*_5day_avg"               -> rolling weather features
      * everything else except "date"/"time" -> treated as item indicator
        columns; an item is listed as purchased when its value equals 1.
    """
    all_cols = list(encoded_df.columns)
    days_cols = [c for c in all_cols if c.startswith("daysSinceLastPurchase_")]
    weather_cols = [c for c in all_cols if c.endswith("_5day_avg")]

    excluded = set(days_cols) | set(weather_cols) | {"date", "time"}
    item_cols = [c for c in all_cols if c not in excluded]

    for pos in range(min(rows, len(encoded_df))):
        record = encoded_df.iloc[pos]

        print("Row:", pos)
        print("Date:", record["date"])
        print("Time:", record["time"])

        # Reverse the one-hot encoding: keep the columns flagged with 1.
        purchased_items = [name for name in item_cols if record[name] == 1]
        print("Items:", purchased_items)

        print("------ daysSinceLastPurchase ------")
        for name in days_cols:
            print(f"{name}: {record[name]}")

        print("------ weather (rolling windows) ------")
        for name in weather_cols:
            print(f"{name}: {record[name]}")

        print("-----------------------------------")


def remove_duplicate_receipt_files(df):
    """
    Remove whole source files that contain a receipt identical to another
    file's receipt with the same date+time.

    Two sources are duplicates when, within one (date, time) group, the
    sorted multiset of their row signatures is equal. Of each duplicate set
    the alphabetically first source is kept; one "DUP: ..." line is printed
    per duplicate set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "source", "date", "time", "item", "qty",
        "youPay", "reg", "reportedItemsSold", "cashier" and "manager".
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only rows from kept sources, with a fresh
        0..n-1 index.
    """
    signature_fields = [
        "date", "time", "item", "qty", "youPay",
        "reg", "reportedItemsSold", "cashier", "manager",
    ]

    # Work on a copy so the helper column never leaks into the caller's frame.
    work = df.copy()
    signature = work[signature_fields[0]].astype(str)
    for field in signature_fields[1:]:
        signature = signature + "|" + work[field].astype(str)
    work["__signature"] = signature

    keep_sources = set()

    for (dt_date, dt_time), group in work.groupby(["date", "time"]):

        # Receipt signature (order-independent) -> sources that produced it.
        sources_by_signature = {}
        for source, source_rows in group.groupby("source"):
            receipt_sig = tuple(sorted(source_rows["__signature"].tolist()))
            sources_by_signature.setdefault(receipt_sig, []).append(source)

        for sources in sources_by_signature.values():
            if len(sources) == 1:
                keep_sources.add(sources[0])
                continue

            ordered = sorted(sources)
            kept, removed = ordered[0], ordered[1:]

            # Minimal output
            print(f"DUP: {dt_date} {dt_time} → keep {kept} ← drop {', '.join(removed)}")

            keep_sources.add(kept)

    return (
        work[work["source"].isin(keep_sources)]
        .drop(columns=["__signature"])
        .reset_index(drop=True)
    )



def rolling_freq(df, window_days):
    """
    For every row, count how many rows of the same item fall strictly inside
    the open interval (row date - window_days, row date).

    Returns a plain list of ints, one per row of `df`, in row order.
    Note: each row scans the whole frame, so cost grows quadratically.
    """
    window = pd.Timedelta(days=window_days)
    item_values = df["item"]
    date_values = df["date"]

    counts = []
    for item, when in zip(item_values, date_values):
        in_window = (
            (item_values == item)
            & (date_values > when - window)
            & (date_values < when)
        )
        counts.append(int(in_window.sum()))
    return counts

In [None]:
def daysUntilNextHoliday(d):
    """
    Whole days from `d` to the nearest US federal holiday on or after `d`
    (0 when `d` is itself a holiday); NaN if the calendar has none ahead.
    """
    when = pd.to_datetime(d)
    day_offsets = (USFederalHolidayCalendar().holidays() - when).days
    upcoming = day_offsets[day_offsets >= 0]
    if len(upcoming) == 0:
        return np.nan
    return upcoming.min()
####################################################################

def daysSinceLastHoliday(d):
    """
    Whole days from the nearest US federal holiday on or before `d` up to `d`
    (0 when `d` is itself a holiday); NaN if the calendar has none behind.
    """
    when = pd.to_datetime(d)
    day_offsets = (when - USFederalHolidayCalendar().holidays()).days
    elapsed = day_offsets[day_offsets >= 0]
    if len(elapsed) == 0:
        return np.nan
    return elapsed.min()
####################################################################

def holidayProximityIndex(d, scale=30):
    """
    Signed closeness of `d` to the nearest US federal holiday, in [-1, +1].

    The nearer side (upcoming vs. previous holiday) is chosen; ties go to the
    upcoming holiday. The magnitude is (scale - distance) / scale, clipped at
    0 once the holiday is `scale` or more days away.

    Positive = before a holiday, negative = after a holiday, 0 = far from
    any holiday (or no holiday known on either side).

    Parameters
    ----------
    d : anything accepted by daysUntilNextHoliday / daysSinceLastHoliday
    scale : number of days over which the index decays to 0 (default 30).
    """
    before = daysUntilNextHoliday(d)
    after = daysSinceLastHoliday(d)

    before_missing = pd.isna(before)
    after_missing = pd.isna(after)

    if before_missing and after_missing:
        return 0

    # NaN compares False against everything, so a missing side must be
    # handled explicitly: if only one side is known, that side is "nearest".
    use_upcoming = after_missing or (not before_missing and before <= after)

    if use_upcoming:
        return +max(0, (scale - before) / scale)
    return -max(0, (scale - after) / scale)
####################################################################

def _birthdayInYear(bday, year):
    """
    Return the anniversary of `bday` (a Timestamp) in `year`.

    A Feb 29 birthday has no anniversary in non-leap years; it is observed
    on Feb 28 there instead of raising ValueError.
    """
    if bday.month == 2 and bday.day == 29:
        try:
            return pd.Timestamp(year, 2, 29)
        except ValueError:
            return pd.Timestamp(year, 2, 28)
    return pd.Timestamp(year, bday.month, bday.day)


def daysUntilBirthday(d, bday):
    """
    Whole days from `d` to the next anniversary of `bday` (0 on the day).

    Both arguments are parsed with pd.to_datetime; only the month and day of
    `bday` are used.
    """
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    this_year = _birthdayInYear(bday, d.year)
    if d <= this_year:
        return (this_year - d).days
    # Already passed this year: count to next year's anniversary.
    return (_birthdayInYear(bday, d.year + 1) - d).days
####################################################################

def daysSinceBirthday(d, bday):
    """
    Whole days from the most recent anniversary of `bday` to `d` (0 on the day).

    Both arguments are parsed with pd.to_datetime; only the month and day of
    `bday` are used.
    """
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    this_year = _birthdayInYear(bday, d.year)
    if d >= this_year:
        return (d - this_year).days
    # Not reached yet this year: measure from last year's anniversary.
    return (d - _birthdayInYear(bday, d.year - 1)).days
####################################################################

def tempDeviation(actualTemp, avgTemp):
    """Return the signed difference actualTemp - avgTemp (positive = warmer than average)."""
    deviation = actualTemp - avgTemp
    return deviation
####################################################################

def humidityDeviation(actualHumidity, avgHumidity):
    """Return the signed difference actualHumidity - avgHumidity."""
    deviation = actualHumidity - avgHumidity
    return deviation
####################################################################

def precipDeviation(actual, avg):
    """Return the signed difference actual - avg."""
    deviation = actual - avg
    return deviation
####################################################################

def daysUntilSchoolStart(d):
    """
    Whole days from `d` to the next August 15 (0 on August 15 itself).
    August 15 is the fixed school-start date used by this notebook.
    """
    when = pd.to_datetime(d)
    upcoming_start = pd.Timestamp(when.year, 8, 15)
    if when > upcoming_start:
        # This year's start has passed; roll over to next year's.
        upcoming_start = pd.Timestamp(when.year + 1, 8, 15)
    return (upcoming_start - when).days
####################################################################

def daysUntilSchoolEnd(d):
    """
    Whole days from `d` to the next May 31 (0 on May 31 itself).
    May 31 is the fixed school-end date used by this notebook.
    """
    when = pd.to_datetime(d)
    upcoming_end = pd.Timestamp(when.year, 5, 31)
    if when > upcoming_end:
        # This year's end has passed; roll over to next year's.
        upcoming_end = pd.Timestamp(when.year + 1, 5, 31)
    return (upcoming_end - when).days
####################################################################

def schoolSeasonIndex(d):
    """
    Position of `d` relative to the school season (Aug 15 -> May 31 of the
    following year).

    * Inside the season: a linear ramp from 0.0 (Aug 15) to 1.0 (May 31).
    * During the summer break (Jun 1 .. Aug 14): a negative value,
      -(days until Aug 15) / 365, approaching 0 as the season nears.

    The season spans a year boundary, so dates on or before May 31 belong to
    the season that started on Aug 15 of the PREVIOUS calendar year.
    """
    d = pd.to_datetime(d)
    end_this_year = pd.Timestamp(d.year, 5, 31)
    start_this_year = pd.Timestamp(d.year, 8, 15)

    if d <= end_this_year:
        # Jan 1 .. May 31: second half of the season that began last August.
        season_start = pd.Timestamp(d.year - 1, 8, 15)
        season_end = end_this_year
    elif d >= start_this_year:
        # Aug 15 .. Dec 31: first half of the season ending next May.
        season_start = start_this_year
        season_end = pd.Timestamp(d.year + 1, 5, 31)
    else:
        # Summer break: count down to the upcoming season start.
        return -((start_this_year - d).days) / 365.0

    return (d - season_start).days / (season_end - season_start).days

####################################################################


def normalizeAndDropCols(df, cols):
    """
    Z-score normalize `cols` and replace each with a "<col>_norm" column.

    For every column:
      * the sentinel value 999 is treated as missing so it does not distort
        the mean/std;
      * mean and (sample) std are computed ignoring missing values;
      * a zero or undefined std (constant column, or fewer than two known
        values) falls back to 1.0 so the result stays finite;
      * missing values become 0.0 after normalization (the neutral value).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified.
    cols : list of str
        Columns to normalize and drop.

    Returns
    -------
    pandas.DataFrame
        A new frame without `cols`, with "<col>_norm" columns appended in
        the order of `cols`.
    """
    out = df.copy()

    for col in cols:
        values = out[col].replace(999, np.nan)

        mean = values.mean()
        std = values.std()
        # NaN is truthy, so `std or 1.0` alone would let a NaN std through.
        if pd.isna(std) or std == 0:
            std = 1.0

        out[col + "_norm"] = ((values - mean) / std).fillna(0.0)

    return out.drop(columns=cols)


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)

In [None]:
# --- WEATHER PREP ---
# Load the weather export, index it chronologically, and replace each raw
# measurement with its trailing 5-row rolling mean (current row included;
# shorter windows are allowed at the start of the series).
weatherCols = ["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv(
    "datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv",
    usecols=weatherCols,
)
df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime").sort_index()

# rolling-average column -> raw column it is derived from
_rolling_sources = {
    "temp_5day_avg": "temp",
    "feelsLike_5day_avg": "feelslike",
    "dew_5day_avg": "dew",
    "humidity_5day_avg": "humidity",
    "precip_5day_avg": "precip",
}
for _avg_col, _raw_col in _rolling_sources.items():
    df_weather[_avg_col] = df_weather[_raw_col].rolling(5, min_periods=1).mean()

df_weather = df_weather.drop(columns=list(_rolling_sources.values()))

# Re-index by calendar date (time of day stripped) so it can be merged on "date".
df_weather["date"] = df_weather.index.date
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather = df_weather.set_index("date")

#grouped.to_csv("grouped.csv", index=False)
#grouped.info()


In [None]:
# Parse every receipt text file into one row per purchased line item.
rows = []

recptParser = WinnDixieRecptParser()

for p in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))

    # Receipt-level fields repeated on each item row (key order fixes the
    # resulting column order).
    _receipt_head = {
        "source": p.name,
        "date": result["date"],
        "time": result["time"],
        "manager": result["manager"],
        "cashier": result["cashier"],
    }
    _receipt_tail = {
        "reportedItemsSold": result["reported"],
        #"rowsMatchReported": result["validation"]["rowsMatchReported"],
        "qtyMatchReported": result["validation"]["qtyMatchReported"],
    }

    for r in result["items"]:
        _item_fields = {
            "item": r["item"],
            "qty": r["qty"],
            "reg": r["reg"],
            "youPay": r["youPay"],
        }
        rows.append({**_receipt_head, **_item_fields, **_receipt_tail})

df_winndixie = pd.DataFrame(rows)

# Typed date, string time; then drop duplicate receipt files and order by trip.
df_winndixie["date"] = pd.to_datetime(df_winndixie["date"])
df_winndixie["time"] = df_winndixie["time"].astype(str)
df_winndixie = (
    remove_duplicate_receipt_files(df_winndixie)
    .sort_values(by=["date", "time"])
    .reset_index(drop=True)
)

In [None]:
### CREATE ITEM IDs
# Assign each distinct item name a stable integer id (alphabetical order)
# and keep both lookup directions.
unique_items = sorted(df_winndixie["item"].unique())
item_to_id = dict(zip(unique_items, range(len(unique_items))))
id_to_item = dict(enumerate(unique_items))

df_winndixie["itemId"] = df_winndixie["item"].map(item_to_id)
df_winndixie.reset_index(drop=True, inplace=True)

df_winndixie.info()
df_winndixie.head(100)

In [None]:

# Build the full receipt grid: one row for every (trip date, item) pair,
# whether or not the item was bought on that date.
all_items = df_winndixie["itemId"].unique()
all_dates = df_winndixie["date"].unique()

_date_item_pairs = pd.MultiIndex.from_product(
    [all_dates, all_items], names=["date", "itemId"]
)
full = _date_item_pairs.to_frame(index=False)

# Left-join the real purchases onto the grid; pairs with no purchase get NaN
# in every receipt column.
df_winndixie = pd.merge(full, df_winndixie, how="left", on=["date", "itemId"])

# didBuy = 1 where the grid row matched a purchase (qty present), else 0.
df_winndixie["didBuy"] = (~df_winndixie["qty"].isna()).astype(int)


In [None]:
# 1. Build grouped table (one row per trip date)
grouped = (
    df_winndixie[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)

# 2. daysSinceLastTrip (0 for the first trip, which has no predecessor)
grouped["daysSinceLastTrip"] = grouped["date"].diff().dt.days.fillna(0)

# 3. Holiday / School features, each computed from the trip date alone
_date_feature_fns = {
    "daysUntilNextHoliday": daysUntilNextHoliday,
    "daysSinceLastHoliday": daysSinceLastHoliday,
    "holidayProximityIndex": holidayProximityIndex,
    "daysUntilSchoolStart": daysUntilSchoolStart,
    "daysUntilSchoolEnd": daysUntilSchoolEnd,
    "schoolSeasonIndex": schoolSeasonIndex,
}
for _feature_name, _feature_fn in _date_feature_fns.items():
    grouped[_feature_name] = grouped["date"].apply(_feature_fn)

# Calendar parts: output column -> attribute of the .dt accessor
dt = grouped["date"]
_calendar_parts = {
    "year": "year",
    "month": "month",
    "day": "day",
    "dow": "dayofweek",
    "doy": "dayofyear",
    "quarter": "quarter",
}
for _feature_name, _dt_attr in _calendar_parts.items():
    grouped[_feature_name] = getattr(dt.dt, _dt_attr)

BIRTHDAYS = {
    "steve":  "03-05-1980",  # fill with your real dates
    "maggie": "03-03-2016",
    "mil":    "01-27-1962",
    "angie":  "08-11-1981",
}

# Days until / since each household member's birthday
for _person, _bday in BIRTHDAYS.items():
    grouped[f"daysUntilBirthday_{_person}"] = grouped["date"].apply(daysUntilBirthday, args=(_bday,))
    grouped[f"daysSinceBirthday_{_person}"] = grouped["date"].apply(daysSinceBirthday, args=(_bday,))

# NOTE(review): "shopper" is filled with schoolSeasonIndex values, which looks
# like a placeholder; it is dropped again before training — confirm intent.
grouped["shopper"] = grouped["date"].apply(schoolSeasonIndex)

# merge in weather
grouped = grouped.merge(df_weather, on="date", how="left")

# attach the trip-level features to every (date, item) row
df_winndixie = df_winndixie.merge(grouped, on="date", how="left")
df_winndixie.info()
df_winndixie.head(10)

df_winndixie.to_csv("df_merged_group_level_features.csv", index=False)

### DAYS SINCE LAST PURCHASED PER ITEM
# df_winndixie holds a row for EVERY (date, item) pair, bought or not, so a
# plain per-item diff() of "date" would only measure the gap between trips.
# Instead, look up for each row the most recent date STRICTLY BEFORE the row's
# date on which the item was actually bought (didBuy == 1).
_purchase_history = (
    df_winndixie.loc[df_winndixie["didBuy"] == 1, ["itemId", "date"]]
    .drop_duplicates()
    .rename(columns={"date": "lastPurchaseDate"})
    .sort_values("lastPurchaseDate")
)

# merge_asof requires both sides sorted by their time key.
df_winndixie = pd.merge_asof(
    df_winndixie.sort_values("date", kind="stable"),
    _purchase_history,
    left_on="date",
    right_on="lastPurchaseDate",
    by="itemId",
    direction="backward",
    allow_exact_matches=False,  # a same-day purchase is not a "previous" one
)

df_winndixie["daysSinceLastPurchase"] = (
    df_winndixie["date"] - df_winndixie["lastPurchaseDate"]
).dt.days

df_winndixie = (
    df_winndixie.drop(columns="lastPurchaseDate")
    .sort_values(["itemId", "date"])
    .reset_index(drop=True)
)

# No earlier purchase of the item = no previous date -> sentinel 999
df_winndixie["daysSinceLastPurchase"] = df_winndixie["daysSinceLastPurchase"].fillna(999)

df_winndixie.to_csv("df_merged_daysSinceLastPurchased.csv", index=False)

In [None]:
# %%
### Frequency per item

#df_merged["freq_7"]  = rolling_freq(df_merged, 7)
#df_merged["freq_15"]  = rolling_freq(df_merged, 15)
#df_merged["freq_30"] = rolling_freq(df_merged, 30)
#df_merged["freq_90"] = rolling_freq(df_merged, 90)
#df_merged["freq_180"] = rolling_freq(df_merged, 180)
#df_merged["freq_365"] = rolling_freq(df_merged, 365)


#df_merged.to_csv("df_merged_with_freq.csv", index=False)
#df_merged.info()
#df_merged.head(500)

In [None]:
# Get all unique item names
#unique_items = grouped["item"].explode().unique()

# Map each item → index
#item_to_index = {item: idx for idx, item in enumerate(unique_items)}

#num_items = len(unique_items)
#vectors = []

# Build one-hot vector for each trip
#for item_list in grouped["item"]:
#    vector = np.zeros(num_items, dtype=np.int32)
#    for name in item_list:
#        vector[item_to_index[name]] = 1
#    vectors.append(vector)

# Convert to DataFrame (THIS was missing)
#encoded_items_df = pd.DataFrame(vectors, columns=unique_items)

In [None]:
### NORMALIZE TO ENCODED_DF
# Select the feature groups by column-name pattern, then z-score each group
# on a copy of the merged frame (raw columns are replaced by "<col>_norm").
_merged_cols = list(df_winndixie.columns)

#freq_cols = [c for c in df_winndixie.columns if c.startswith("freq_")]
weather_cols = [c for c in _merged_cols if c.endswith("_5day_avg")]
holiday_cols = [c for c in _merged_cols if "holiday" in c.lower()]
school_cols = [c for c in _merged_cols if "school" in c.lower()]
birthday_cols = [
    c for c in _merged_cols
    if c.startswith(("daysUntilBirthday_", "daysSinceBirthday_"))
]

daysSince_purchase_cols = ["daysSinceLastPurchase"]
daysSince_trip_cols     = ["daysSinceLastTrip"]

encoded_df = df_winndixie.copy()

# Group order determines the order of the appended *_norm columns.
for _col_group in (
    weather_cols,
    holiday_cols,
    school_cols,
    birthday_cols,
    daysSince_purchase_cols,
    daysSince_trip_cols,
):
    encoded_df = normalizeAndDropCols(encoded_df, _col_group)

encoded_df.info()
encoded_df.head(100)


In [None]:
# ============================================================
# 4. SINE COSINE on CYCLICAL FEATURES
# ============================================================
# Each cyclical calendar column is mapped onto the unit circle using its
# period (raw value / period, no offset), then the raw column is dropped.
_cycle_periods = {"dow": 7.0, "month": 12.0, "doy": 365.0}

for _cyc_col, _period in _cycle_periods.items():
    encoded_df[f"{_cyc_col}_sin"] = np.sin(2 * np.pi * encoded_df[_cyc_col] / _period)
    encoded_df[f"{_cyc_col}_cos"] = np.cos(2 * np.pi * encoded_df[_cyc_col] / _period)

encoded_df = encoded_df.drop(columns=list(_cycle_periods))

## NON-CYCLIC TIME FEATURES
nonCycCols = ["year", "day", "quarter"]
encoded_df = normalizeAndDropCols(encoded_df, nonCycCols)

# Receipt bookkeeping / identifier columns that are not model inputs.
cols_to_drop = [
    "source", "manager", "time", "cashier", "qty", "item", "reg", "youPay",
    "reportedItemsSold", "qtyMatchReported", "shopper", "date",
]
encoded_df = encoded_df.drop(columns=cols_to_drop, errors="ignore")

encoded_df.to_csv("encoded.csv", index=False)
encoded_df.info()
encoded_df.head(10)


# TRAIN / BUILD MODEL

In [None]:
# ============================================================
# FULLY SELF-CONTAINED TRAINING + SAVING METHODS (FINAL)
# ============================================================

import os, json
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split


# ============================================================
# TRAINING PROGRESS CALLBACK
# ============================================================
class EpochStatus(tf.keras.callbacks.Callback):
    """
    Keras callback that prints a one-line summary (loss, val_loss, accuracy)
    at the end of every epoch.

    A metric that Keras did not report for the epoch (for example val_loss
    when fit() runs without validation data) is printed as "n/a" instead of
    raising a TypeError while formatting None.
    """

    @staticmethod
    def _fmt(value):
        """Format a metric with 4 decimals, or 'n/a' when it is missing."""
        return "n/a" if value is None else f"{value:.4f}"

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(
            f"Epoch {epoch+1}: "
            f"loss={self._fmt(logs.get('loss'))}  "
            f"val_loss={self._fmt(logs.get('val_loss'))}  "
            f"acc={self._fmt(logs.get('accuracy'))}"
        )

# ============================================================
# SAVE EXPERIMENT (MODEL, METRICS, HYPERPARAMS)
# ============================================================
def save_experiment(model, history, HP):
    """
    Persist one training run under experiments/<timestamp>_<name>/.

    Files written:
      * hyperparams.json      - HP dumped as JSON
      * training_metrics.csv  - history.history as a table
      * model_arch.json       - model.to_json()
      * model.weights.h5      - model.save_weights(...)

    HP must provide "epochs", "batch_size" and "learning_rate"; they form
    the directory name. Returns the directory path.
    """
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_label = f"exp_e{HP['epochs']}_b{HP['batch_size']}_lr{HP['learning_rate']}"
    exp_dir = os.path.join("experiments", f"{run_stamp}_{run_label}")
    os.makedirs(exp_dir, exist_ok=True)

    def artifact(filename):
        # Path of a file inside this experiment's directory.
        return os.path.join(exp_dir, filename)

    # Hyperparameters
    with open(artifact("hyperparams.json"), "w") as hp_file:
        json.dump(HP, hp_file, indent=4)

    # Per-epoch metrics
    metrics_table = pd.DataFrame(history.history)
    metrics_table.to_csv(artifact("training_metrics.csv"), index=False)

    # Architecture
    with open(artifact("model_arch.json"), "w") as arch_file:
        arch_file.write(model.to_json())

    # Weights
    model.save_weights(artifact("model.weights.h5"))

    print(f"\nSaved experiment to: {exp_dir}")

    return exp_dir
##################################################################################

def get_last_trip_weather_norm(df_raw, encoded_df):
    """
    Look up the normalized weather features of the most recent trip.

    The latest value of df_raw["date"] identifies the trip; the index label
    of the first df_raw row with that date is then used to read the five
    "*_5day_avg_norm" columns from encoded_df.

    Returns (weather, last_trip_date) where weather is a dict
    { weather_norm_col: value }.
    """
    last_trip_date = df_raw["date"].max()
    first_label = df_raw.index[df_raw["date"] == last_trip_date][0]

    weather_norm_cols = (
        "temp_5day_avg_norm",
        "feelsLike_5day_avg_norm",
        "dew_5day_avg_norm",
        "humidity_5day_avg_norm",
        "precip_5day_avg_norm",
    )
    weather = {name: encoded_df.loc[first_label, name] for name in weather_norm_cols}

    return weather, last_trip_date
##################################################################################


def build_today_trip_features(df_raw, encoded_df):
    """
    Build all trip-level features for *today*, in model-input form.

    * Holiday, school, calendar, birthday and daysSinceLastTrip values are
      computed for today's date and normalized against the matching df_raw
      column via normalize_single_value.
    * dow/month/doy are encoded as sine/cosine with the same formula the
      training frame uses (raw value / period, no offset), so inference
      features line up with what the model was trained on.
    * Weather is copied from the last recorded trip (already normalized).

    Returns a dict { feature_column_name: value }.
    """
    today = pd.Timestamp.today().normalize()

    weather_norm, last_trip_date = get_last_trip_weather_norm(df_raw, encoded_df)

    month_raw = today.month
    dow_raw = today.dayofweek   # 0–6
    doy_raw = today.dayofyear   # 1–366

    # --- raw trip-level values for today, keyed by their df_raw column ---
    raw_values = {
        "daysSinceLastTrip": (today - last_trip_date).days,
        "daysUntilNextHoliday": daysUntilNextHoliday(today),
        "daysSinceLastHoliday": daysSinceLastHoliday(today),
        "holidayProximityIndex": holidayProximityIndex(today),
        "daysUntilSchoolStart": daysUntilSchoolStart(today),
        "daysUntilSchoolEnd": daysUntilSchoolEnd(today),
        "schoolSeasonIndex": schoolSeasonIndex(today),
        "year": today.year,
        "day": today.day,
        "quarter": (month_raw - 1) // 3 + 1,
    }
    for person in ("steve", "maggie", "mil", "angie"):
        raw_values[f"daysUntilBirthday_{person}"] = daysUntilBirthday(today, BIRTHDAYS[person])
        raw_values[f"daysSinceBirthday_{person}"] = daysSinceBirthday(today, BIRTHDAYS[person])

    # --- normalize using training stats ---
    feats = {
        f"{col_name}_norm": normalize_single_value(df_raw, col_name, raw_value)
        for col_name, raw_value in raw_values.items()
    }

    # Cyclical encodings (no normalization). These must use exactly the
    # training-time formula: sin/cos(2*pi*value/period) on the raw value.
    feats["dow_sin"]   = np.sin(2 * np.pi * dow_raw / 7.0)
    feats["dow_cos"]   = np.cos(2 * np.pi * dow_raw / 7.0)
    feats["month_sin"] = np.sin(2 * np.pi * month_raw / 12.0)
    feats["month_cos"] = np.cos(2 * np.pi * month_raw / 12.0)
    feats["doy_sin"]   = np.sin(2 * np.pi * doy_raw / 365.0)
    feats["doy_cos"]   = np.cos(2 * np.pi * doy_raw / 365.0)

    # copy over last-trip normalized weather
    feats.update(weather_norm)

    return feats
##################################################################################


def build_item_features_today(item_id, df_raw):
    """
    Compute daysSinceLastPurchase_norm for this item, for *today*.

    The last purchase is the latest date on which the item was actually
    bought. When df_raw carries a "didBuy" column (one row per date/item
    pair, bought or not), only rows with didBuy == 1 count as purchases;
    otherwise every row of the item is assumed to be a purchase.

    If no purchase is found the raw value is NaN, which
    normalize_single_value maps to the neutral 0.0.

    Returns a dict with the single key "daysSinceLastPurchase_norm".
    """
    today = pd.Timestamp.today().normalize()

    mask = df_raw["itemId"] == item_id
    if "didBuy" in df_raw.columns:
        # Skip grid rows that only record "item was not bought on this date".
        mask = mask & (df_raw["didBuy"] == 1)

    last_purchase_date = df_raw.loc[mask, "date"].max()

    if pd.isna(last_purchase_date):
        days_since_last_purchase_raw = np.nan
    else:
        days_since_last_purchase_raw = (today - last_purchase_date).days

    feats = {}
    feats["daysSinceLastPurchase_norm"] = normalize_single_value(
        df_raw,
        "daysSinceLastPurchase",
        days_since_last_purchase_raw
    )
    return feats
##################################################################################


def build_feature_row_for_item_today(item_id, df_raw, encoded_df, trip_feats, feature_cols):
    """
    Assemble one model-input row (1D float32 array) for `item_id`.

    The row combines the item id, the shared trip-level features and the
    item-level features, ordered exactly as `feature_cols` (any "didBuy"
    entry is skipped). `encoded_df` is accepted but not used here.
    """
    # Later sources win on key clashes: identity < trip-level < item-level.
    values = {
        "itemId": item_id,
        **trip_feats,
        **build_item_features_today(item_id, df_raw),
    }

    ordered = [values[col] for col in feature_cols if col != "didBuy"]
    return np.array(ordered, dtype=np.float32)
##################################################################################


def predict_all_items_today(model, df_raw, encoded_df):
    """
    Run EVERY itemId through the model for *today*.

    Parameters
    ----------
    model : object with a Keras-style predict(X) method.
    df_raw : frame with at least "itemId", "item" and the raw feature columns.
    encoded_df : the encoded training frame; its column order defines the
        model's input order.

    Returns
    -------
    pandas.DataFrame with columns "itemId", "itemName", "prob",
    sorted by probability, highest first.
    """
    # itemId -> itemName lookup. Rows without an item name (date/item pairs
    # where nothing was bought) are skipped so they cannot overwrite a real
    # name with NaN.
    item_lookup = (
        df_raw[["itemId", "item"]]
        .dropna(subset=["item"])
        .drop_duplicates()
        .set_index("itemId")["item"]
        .to_dict()
    )

    # Exact training feature order: numeric, non-target columns only (the
    # same selection rule the training cell applies to encoded_df).
    feature_cols = [
        c for c in encoded_df.columns
        if c != "didBuy" and np.issubdtype(encoded_df[c].dtype, np.number)
    ]

    # shared trip features for today
    trip_feats = build_today_trip_features(df_raw, encoded_df)

    # all unique itemIds
    items = sorted(df_raw["itemId"].unique())

    if not items:
        # np.stack cannot handle an empty list; return an empty result.
        return pd.DataFrame({"itemId": [], "itemName": [], "prob": []})

    X = np.stack(
        [
            build_feature_row_for_item_today(
                item_id, df_raw, encoded_df, trip_feats, feature_cols
            )
            for item_id in items
        ],
        axis=0,
    )

    # predict
    probs = model.predict(X).reshape(-1)

    # assemble output
    result = pd.DataFrame({
        "itemId": items,
        "itemName": [item_lookup.get(i) for i in items],
        "prob": probs
    })

    return result.sort_values("prob", ascending=False)
##################################################################################

def predict_next_trip(model, encoded_df, input_feature_cols, frequent_items):
    """
    Predict item probabilities from the last row of `encoded_df`.

    The input vector takes, for each name in `input_feature_cols`, the value
    from the last row of `encoded_df`, or 0.0 when that column is absent.
    The first row of model.predict(...) is paired with `frequent_items`.

    Returns a DataFrame with columns "item" and "probability", sorted by
    probability, highest first.
    """
    # Most recent row = last known feature values.
    last = encoded_df.iloc[-1]
    known_cols = encoded_df.columns

    # Unknown columns fall back to 0.0 as a safety default.
    feature_values = [
        last[col] if col in known_cols else 0.0
        for col in input_feature_cols
    ]

    # Shape (1, n_features): a batch holding a single sample.
    X_input = np.array(feature_values, dtype=np.float32).reshape(1, -1)

    y_pred = model.predict(X_input)[0]   # shape: (num_items,)

    result = pd.DataFrame({
        "item": frequent_items,
        "probability": y_pred
    })

    return result.sort_values(by="probability", ascending=False)
##################################################################################

def normalize_single_value(df_raw, col_name, raw_value):
    """
    Z-score a single scalar with the statistics of df_raw[col_name].

    The sentinel 999 in the column is treated as missing before the mean and
    (sample) std are computed. The result is always finite:

      * a missing `raw_value` maps to 0.0 (the neutral value);
      * a column with no known values (mean undefined) maps to 0.0;
      * a zero or undefined std (constant column, or fewer than two known
        values) falls back to 1.0.
    """
    if pd.isna(raw_value):
        return 0.0

    col = df_raw[col_name].replace(999, np.nan)

    mean = col.mean()
    if pd.isna(mean):
        return 0.0

    std = col.std()
    # NaN is truthy, so `std or 1.0` alone would let a NaN std through.
    if pd.isna(std) or std == 0:
        std = 1.0

    return (raw_value - mean) / std
##################################################################################

def save_predictions(pred_df, experiment_name):
    """
    Write `pred_df` to experiments/<experiment_name>/predictions.csv
    (without the index), creating the directory if needed.
    """
    out_dir = f"experiments/{experiment_name}"
    os.makedirs(out_dir, exist_ok=True)

    target = out_dir + "/predictions.csv"
    pred_df.to_csv(target, index=False)


In [None]:
# Hidden-layer widths to sweep; one model is trained per entry.
hidden_sizes = [10, 20, 30, 40, 50, 75, 100, 150, 200, 250, 350, 512]

# --------------------------------
# Fixed hyperparameters
# --------------------------------
EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.0001

# Training data: every numeric column except the target
feature_cols = [
    c for c in encoded_df.columns
    if c != "didBuy" and np.issubdtype(encoded_df[c].dtype, np.number)
]

X = encoded_df[feature_cols].to_numpy(dtype=np.float32)
y = encoded_df["didBuy"].to_numpy(dtype=np.float32)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Class weights for imbalance.
# cw = (#negatives / #positives), capped at 50, is the up-weight for the
# positive class (didBuy == 1). The negative class keeps weight 1.0.
# Both classes are listed so every label has an explicit weight.
pos = y_train.sum()
neg = len(y_train) - pos
cw = float(min(50.0, neg / (pos + 1e-6)))
class_weights = {0: 1.0, 1: cw}

# ===================================================================
# LOOP OVER HIDDEN SIZES
# ===================================================================
for h in hidden_sizes:
    # One full experiment per hidden-layer width: build, train, save, predict.

    print("\n==============================")
    print(f" RUNNING EXPERIMENT: hidden={h}")
    print("==============================")

    # Hyperparameters recorded alongside this run
    HP = dict(
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        hidden=h,
    )

    # Single hidden ReLU layer feeding one sigmoid output (buy probability)
    model = models.Sequential()
    model.add(layers.Input(shape=(X_train.shape[1],)))
    model.add(layers.Dense(h, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    # Train silently (verbose=0); EpochStatus prints one line per epoch.
    # 10% of the training split is held out for validation.
    history = model.fit(
        X_train,
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        class_weight=class_weights,
        verbose=0,
        callbacks=[EpochStatus()],
    )

    # Save model, metrics and hyperparameters.
    # Note: save_experiment writes to its own timestamped directory, while
    # the predictions below go to experiments/<exp_name>/.
    exp_name = f"exp_hidden{h}"
    save_experiment(model=model, history=history, HP=HP)

    # Score ALL items for TODAY and store the ranking
    pred_df = predict_all_items_today(model, df_winndixie, encoded_df)
    save_predictions(pred_df, exp_name)

    print(f"Completed: {exp_name}")
    print(pred_df.head())
