In [1]:

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import gc
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import os
from pandas.tseries.holiday import USFederalHolidayCalendar

from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from datetime import datetime

from winn_dixie_recpt_parser import WinnDixieRecptParser 

pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:.6f}")

print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

C:\Users\steve\OneDrive - NOLA Business IT\source\repos\grocery-ml
GPUs Available: []


In [2]:
def show_grouped(grouped, rows=10):
    # collect only the daysSinceLastPurchase_* columns
    feature_cols = [c for c in grouped.columns if c.startswith("daysSinceLastPurchase_")]

    for i in range(min(rows, len(grouped))):
        print("Row:", i)
        print("Date:", grouped.iloc[i]["date"])
        print("Time:", grouped.iloc[i]["time"])
        print("Items:", grouped.iloc[i]["item"])
        print("------ daysSinceLastPurchase ------")

        for col in feature_cols:
            print(f"{col}: {grouped.iloc[i][col]}")

        print("-----------------------------------")
#################################################################

def show_encoded(encoded_df, rows=10):
    # Identify columns
    days_cols = [c for c in encoded_df.columns if c.startswith("daysSinceLastPurchase_")]
    weather_cols = [c for c in encoded_df.columns if c.endswith("_5day_avg")]
    item_cols = [
        c for c in encoded_df.columns 
        if c not in days_cols 
        and c not in weather_cols
        and c not in ["date", "time"]
    ]

    for i in range(min(rows, len(encoded_df))):
        print("Row:", i)
        print("Date:", encoded_df.iloc[i]["date"])
        print("Time:", encoded_df.iloc[i]["time"])

        # Show the items purchased (reverse one-hot)
        purchased_items = []
        row_vals = encoded_df.iloc[i]

        for item in item_cols:
            if row_vals[item] == 1:
                purchased_items.append(item)

        print("Items:", purchased_items)

        print("------ daysSinceLastPurchase ------")
        for col in days_cols:
            print(f"{col}: {encoded_df.iloc[i][col]}")

        print("------ weather (rolling windows) ------")
        for col in weather_cols:
            print(f"{col}: {encoded_df.iloc[i][col]}")

        print("-----------------------------------")

#################################################################
def remove_duplicate_receipt_files(df):
    """
    Remove whole source files that contain an identical receipt
    to another file with the same date+time.
    Minimal console output. Resets index at end.
    """

    df["__signature"] = (
        df["date"].astype(str) + "|" +
        df["time"].astype(str) + "|" +
        df["item"].astype(str) + "|" 
        #df["qty"].astype(str) + "|" +
        #df["youPay"].astype(str) + "|" +
        #df["reg"].astype(str) + "|" +
        #df["reportedItemsSold"].astype(str) + "|" +
        #df["cashier"].astype(str) + "|" +
        #df["manager"].astype(str)
    )

    keep_sources = set()

    for (dt_date, dt_time), group in df.groupby(["date", "time"]):

        # Build signature per source
        source_signatures = {}
        for source, rows in group.groupby("source"):
            sig = tuple(sorted(rows["__signature"].tolist()))
            source_signatures[source] = sig

        # signature → list of sources
        signature_groups = {}
        for src, sig in source_signatures.items():
            signature_groups.setdefault(sig, []).append(src)

        # Handle duplicates
        for sig, sources in signature_groups.items():
            if len(sources) == 1:
                keep_sources.add(sources[0])
                continue

            sorted_sources = sorted(sources)
            kept = sorted_sources[0]
            removed = sorted_sources[1:]

            # Minimal output
            print(f"DUP: {dt_date} {dt_time} → keep {kept} ← drop {', '.join(removed)}")

            keep_sources.add(kept)

    # Filter and clean
    result = df[df["source"].isin(keep_sources)].copy()
    result.drop(columns=["__signature"], inplace=True)

    # ✔ Reset index here
    result.reset_index(drop=True, inplace=True)

    return result
#################################################################

def rolling_freq(df, window_days):
    out = []
    for idx, row in df.iterrows():
        item = row["item"]
        cutoff = row["date"] - pd.Timedelta(days=window_days)
        count = df[(df["item"] == item) &
                   (df["date"] > cutoff) &
                   (df["date"] < row["date"])].shape[0]
        out.append(count)
    return out

In [3]:
def daysUntilNextHoliday(d):
    d = pd.to_datetime(d)
    holidays = USFederalHolidayCalendar().holidays()
    diffs = (holidays - d).days
    diffs = diffs[diffs >= 0]
    return diffs.min() if len(diffs) > 0 else np.nan
####################################################################

def daysSinceLastHoliday(d):
    d = pd.to_datetime(d)
    holidays = USFederalHolidayCalendar().holidays()
    diffs = (d - holidays).days
    diffs = diffs[diffs >= 0]
    return diffs.min() if len(diffs) > 0 else np.nan
####################################################################

def holidayProximityIndex(d, scale=30):
    """
    Returns a smooth value between -1 and +1 depending on
    distance to holidays. Neural networks LOVE this.
    Negative = after holiday
    Positive = before holiday
    """
    before = daysUntilNextHoliday(d)
    after = daysSinceLastHoliday(d)

    if pd.isna(before) and pd.isna(after):
        return 0

    # choose the nearest side (before or after)
    if before <= after:
        return +max(0, (scale - before) / scale)
    else:
        return -max(0, (scale - after) / scale)
####################################################################

def daysUntilBirthday(d, bday):
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    this_year = pd.Timestamp(d.year, bday.month, bday.day)
    if d <= this_year:
        return (this_year - d).days
    else:
        next_year = pd.Timestamp(d.year + 1, bday.month, bday.day)
        return (next_year - d).days
####################################################################

def daysSinceBirthday(d, bday):
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    this_year = pd.Timestamp(d.year, bday.month, bday.day)
    if d >= this_year:
        return (d - this_year).days
    else:
        last_year = pd.Timestamp(d.year - 1, bday.month, bday.day)
        return (d - last_year).days
####################################################################

def tempDeviation(actualTemp, avgTemp):
    """Signed deviation (continuous). Neural-network gold."""
    return actualTemp - avgTemp
####################################################################

def humidityDeviation(actualHumidity, avgHumidity):
    return actualHumidity - avgHumidity
####################################################################

def precipDeviation(actual, avg):
    return actual - avg
####################################################################

def daysUntilSchoolStart(d):
    d = pd.to_datetime(d)
    start = pd.Timestamp(d.year, 8, 15)
    if d <= start:
        return (start - d).days
    else:
        next_start = pd.Timestamp(d.year + 1, 8, 15)
        return (next_start - d).days
####################################################################

def daysUntilSchoolEnd(d):
    d = pd.to_datetime(d)
    end = pd.Timestamp(d.year, 5, 31)
    if d <= end:
        return (end - d).days
    else:
        next_end = pd.Timestamp(d.year + 1, 5, 31)
        return (next_end - d).days
####################################################################

def schoolSeasonIndex(d):
    """
    Smooth 0→1 curve inside school season.
    <0 before season, >1 after.
    Good for neural nets.
    """
    d = pd.to_datetime(d)
    start = pd.Timestamp(d.year, 8, 15)
    end   = pd.Timestamp(d.year, 5, 31)

    # If date is after Dec, school season continues in Jan–May.
    if d < start:
        return -((start - d).days) / 365.0
    elif start <= d <= end:
        return (d - start).days / (end - start).days
    else:
        return (d - end).days / 365.0

####################################################################


def normalizeAndDropCols(df, cols):
    for col in cols:
        # Replace the sentinel 999 with NaN so it doesn't distort mean/std
        df[col] = df[col].replace(999, np.nan)

        # Compute mean/std ignoring NaN
        mean = df[col].mean()
        std  = df[col].std() or 1.0

        # Normalize
        df[col + "_norm"] = (df[col] - mean) / std

        # After normalization: missing values become 0 (neutral)
        df[col + "_norm"] = df[col + "_norm"].fillna(0.0)

    return df.drop(columns=cols)


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)



def canonicalize_items(df, patterns, canonical_name):
    """
    For each pattern in `patterns`, find rows where `item` contains the pattern
    and replace df['item'] with `canonical_name`.
    """
    for p in patterns:
        mask = df["item"].str.contains(p, case=False, na=False)
        df.loc[mask, "item"] = canonical_name


In [4]:
# --- WEATHER PREP ---
weatherCols=["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv("datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv", usecols=weatherCols)

df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime").sort_index()

df_weather["temp_5day_avg"] = df_weather["temp"].rolling(5, min_periods=1).mean()
df_weather["feelsLike_5day_avg"] = df_weather["feelslike"].rolling(5, min_periods=1).mean()
df_weather["dew_5day_avg"] = df_weather["dew"].rolling(5, min_periods=1).mean()
df_weather["humidity_5day_avg"] = df_weather["humidity"].rolling(5, min_periods=1).mean()
df_weather["precip_5day_avg"] = df_weather["precip"].rolling(5, min_periods=1).mean()

df_weather = df_weather.drop(columns=["temp", "humidity", "feelslike", "dew", "precip"])

# convert index to date for merging
df_weather["date"] = df_weather.index.date
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather = df_weather.set_index("date")

#grouped.to_csv("grouped.csv", index=False)
#grouped.info()


In [5]:
rows = []

recptParser  = WinnDixieRecptParser();

for p in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    for r in result["items"]:
        rows.append({
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            #"manager": result["manager"],
            #"cashier": result["cashier"],
            "item": r["item"]
            #"qty": r["qty"],
            #"reg": r["reg"],
            #"youPay": r["youPay"],
            #"reportedItemsSold": result["reported"],
            #"rowsMatchReported": result["validation"]["rowsMatchReported"],
            #"qtyMatchReported": result["validation"]["qtyMatchReported"],
        })

df_winndixie = pd.DataFrame(rows)

df_winndixie["date"] = pd.to_datetime(df_winndixie["date"])
df_winndixie["time"] = df_winndixie["time"].astype(str)
df_winndixie = remove_duplicate_receipt_files(df_winndixie)
df_winndixie = df_winndixie.sort_values(by=["date", "time"]).reset_index(drop=True)

DUP: 2025-02-19 00:00:00 7:20 PM → keep IMG_9734.txt ← drop IMG_9735.txt
DUP: 2025-04-08 00:00:00 11:50 AM → keep IMG_9723.txt ← drop IMG_9724.txt
DUP: 2025-05-15 00:00:00 8:19 PM → keep IMG_9713.txt ← drop IMG_9714.txt
DUP: 2025-08-02 00:00:00 10:29 PM → keep IMG_9693.txt ← drop IMG_9694.txt
DUP: 2025-10-07 00:00:00 6:06 PM → keep IMG_0017.txt ← drop IMG_9669.txt
DUP: 2025-10-14 00:00:00 4:06 PM → keep IMG_0015.txt ← drop IMG_9667.txt
DUP: 2025-10-14 00:00:00 6:08 PM → keep IMG_0014.txt ← drop IMG_9666.txt
DUP: 2025-10-17 00:00:00 9:18 PM → keep IMG_0013.txt ← drop IMG_9664.txt
DUP: 2025-10-17 00:00:00 9:23 PM → keep IMG_0012.txt ← drop IMG_9663.txt


In [6]:
milk_patterns = ["know-and-love-milk", "kandl-milk", "prairie-farm-milk","kleinpeter-milk", "kl-milk"]
canonicalize_items(df_winndixie, milk_patterns, "milk")


bread_patterns = ["bunny-bread","se-grocers-bread","seg-sandwich-bread", "seg-white-bread"]
canonicalize_items(df_winndixie, bread_patterns, "bread")

cheese_patterns = ["dandw-cheese", "kraft-cheese", "se-grocers-cheese", "know-and-love-cheese"]
canonicalize_items(df_winndixie, cheese_patterns, "cheese")

mayo_patterns = ["blue-plate-mayo", "blue-plate-mynnase"]
canonicalize_items(df_winndixie, mayo_patterns, "mayo")

chicken_patterns = ["chicken-cutlet", "chicken-leg", "chicken-thigh", "chicken-thighs"]
canonicalize_items(df_winndixie, chicken_patterns, "chicken")

yogurt_patterns = ["chobani-yogrt-flip", "chobani-yogurt"]
canonicalize_items(df_winndixie, yogurt_patterns, "yogurt")


coke_patterns = ["coca-cola", "coca-cola-cola", "cocacola-soda"]
canonicalize_items(df_winndixie, coke_patterns, "coke")

hugbi_patterns = ["hugbi-pies", "-hugbi-pies"]
canonicalize_items(df_winndixie, hugbi_patterns, "hugbi-pies")

minute_maid_patterns = ["minute-maid-drink", "minute-maid-drinks", "minute-maid-lmnade"]
canonicalize_items(df_winndixie, minute_maid_patterns, "minute-maid-drink")



In [7]:
### CREATE ITEM IDs
unique_items = sorted(df_winndixie["item"].unique())
item_to_id = {item: idx for idx, item in enumerate(unique_items)}
id_to_item = {idx: item for item, idx in item_to_id.items()}
df_winndixie["itemId"] = df_winndixie["item"].map(item_to_id)
df_winndixie.reset_index(drop=True, inplace=True)
df_winndixie.info()
df_winndixie.head(100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856 entries, 0 to 855
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   source  856 non-null    object        
 1   date    856 non-null    datetime64[ns]
 2   time    856 non-null    object        
 3   item    856 non-null    object        
 4   itemId  856 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 33.6+ KB


Unnamed: 0,source,date,time,item,itemId
0,IMG_9764.txt,2024-11-15,8:13 PM,spaghettios-pasta,310
1,IMG_9764.txt,2024-11-15,8:13 PM,mars-chocolate,173
2,IMG_9764.txt,2024-11-15,8:13 PM,mandms-candies,172
3,IMG_9764.txt,2024-11-15,8:13 PM,mandarins,171
4,IMG_9764.txt,2024-11-15,8:13 PM,sparkling-ice-wtr,311
5,IMG_9764.txt,2024-11-15,8:13 PM,sparkling-ice-wtr,311
6,IMG_9764.txt,2024-11-15,8:13 PM,coke,66
7,IMG_9763.txt,2024-11-22,9:15 PM,hersheys-syrup,117
8,IMG_9763.txt,2024-11-22,9:15 PM,mayfield-icecream,176
9,IMG_9762.txt,2024-11-24,3:40 PM,bread,40


In [8]:
# ============================================================
# Build full receipt × item table WITHOUT using qty
# ============================================================

# 1. Mark actual purchases in the raw receipt rows
df_winndixie["didBuy"] = 1

# 2. Build complete grid
all_items = df_winndixie["itemId"].unique()
all_dates = df_winndixie["date"].unique()

full = (
    pd.MultiIndex.from_product(
        [all_dates, all_items], 
        names=["date", "itemId"]
    ).to_frame(index=False)
)

# 3. Merge raw purchases onto the full grid
df_full = full.merge(
    df_winndixie[["date", "itemId", "item", "source", "time", "didBuy"]],
    on=["date", "itemId"],
    how="left"
)

# 4. Fill missing purchases with didBuy=0
df_full["didBuy"] = df_full["didBuy"].fillna(0).astype(int)

# 5. NOW REPLACE df_winndixie with df_full
df_winndixie = df_full.copy()

df_winndixie.to_csv("df_fullreceipts.csv", index=False)


In [9]:
# 1. Build grouped table (one row per trip date)

grouped = ( df_winndixie[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)

# 2. daysSinceLastTrip
grouped["daysSinceLastTrip"] = grouped["date"].diff().dt.days.fillna(0)

# 3. Holiday / Birthday / School features
grouped["daysUntilNextHoliday"] = grouped["date"].apply(daysUntilNextHoliday)
grouped["daysSinceLastHoliday"] = grouped["date"].apply(daysSinceLastHoliday)
grouped["holidayProximityIndex"] = grouped["date"].apply(holidayProximityIndex)
grouped["daysUntilSchoolStart"] = grouped["date"].apply(daysUntilSchoolStart)
grouped["daysUntilSchoolEnd"]   = grouped["date"].apply(daysUntilSchoolEnd)
grouped["schoolSeasonIndex"]    = grouped["date"].apply(schoolSeasonIndex)

dt = grouped["date"]
grouped["year"]    = dt.dt.year
grouped["month"]   = dt.dt.month
grouped["day"]     = dt.dt.day
grouped["dow"]     = dt.dt.dayofweek
grouped["doy"]     = dt.dt.dayofyear
grouped["quarter"] = dt.dt.quarter



BIRTHDAYS = {
    "steve":  "03-05-1980",  # fill with your real dates
    "maggie": "03-03-2016",
    "mil":    "01-27-1962",
    "angie":  "08-11-1981",
}

grouped["daysUntilBirthday_steve"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["steve"]))
grouped["daysSinceBirthday_steve"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["steve"]))


grouped["daysUntilBirthday_maggie"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["maggie"]))
grouped["daysSinceBirthday_maggie"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["maggie"]))

grouped["daysUntilBirthday_mil"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["mil"]))
grouped["daysSinceBirthday_mil"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["mil"]))

grouped["daysUntilBirthday_angie"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["angie"]))
grouped["daysSinceBirthday_angie"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["angie"]))




# merge in weather
grouped = grouped.merge(df_weather, on="date", how="left")

df_winndixie = df_winndixie.merge(grouped, on="date", how="left")
df_winndixie.info()
df_winndixie.head(10)

df_winndixie.to_csv("df_merged_group_level_features.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34687 entries, 0 to 34686
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      34687 non-null  datetime64[ns]
 1   itemId                    34687 non-null  int64         
 2   item                      856 non-null    object        
 3   source                    856 non-null    object        
 4   time                      856 non-null    object        
 5   didBuy                    34687 non-null  int32         
 6   daysSinceLastTrip         34687 non-null  float64       
 7   daysUntilNextHoliday      34687 non-null  int64         
 8   daysSinceLastHoliday      34687 non-null  int64         
 9   holidayProximityIndex     34687 non-null  float64       
 10  daysUntilSchoolStart      34687 non-null  int64         
 11  daysUntilSchoolEnd        34687 non-null  int64         
 12  schoolSeasonIndex 

In [10]:
# ================================================
# FREQUENCY WINDOWS (7, 15, 30, 90, 365)
# Matching daysSinceLastPurchase row-by-row logic
# ================================================

freq_windows = [7, 15, 30, 90, 365]

for w in freq_windows:
    df_winndixie[f"freq_{w}"] = np.nan

def fill_freq(group):
    group = group.copy()

    # a rolling list of purchase dates for this item
    history = []

    # get col indices for speed
    col_date = group.columns.get_loc("date")
    col_buy = group.columns.get_loc("didBuy")

    col_freq = {w: group.columns.get_loc(f"freq_{w}") for w in freq_windows}

    for i in range(len(group)):
        cur_date = group.iat[i, col_date]

        # If purchase: add to history
        if group.iat[i, col_buy] == 1:
            history.append(cur_date)

        # Remove old purchases outside window
        for w in freq_windows:
            cutoff = cur_date - pd.Timedelta(days=w)
            history_w = [d for d in history if d >= cutoff]
            group.iat[i, col_freq[w]] = len(history_w)

        # Update history after cleanup
        history = history_w if len(freq_windows) == 1 else history

    return group

df_winndixie = df_winndixie.groupby("itemId", group_keys=False).apply(fill_freq)

  df_winndixie = df_winndixie.groupby("itemId", group_keys=False).apply(fill_freq)


In [11]:
# ============================================================
# INCREASING DAILY daysSinceLastPurchase (resets on purchase)
# ============================================================

df_winndixie = df_winndixie.sort_values(["itemId", "date"]).reset_index(drop=True)

# Start with NaN everywhere
df_winndixie["daysSinceLastPurchase"] = np.nan

# Set 0 on purchase days
df_winndixie.loc[df_winndixie["didBuy"] == 1, "daysSinceLastPurchase"] = 0

def fill_item(group):
    group = group.copy()
    # iterate row-by-row using positional index
    for i in range(1, len(group)):
        if pd.isna(group.iat[i, group.columns.get_loc("daysSinceLastPurchase")]):
            prev_val = group.iat[i-1, group.columns.get_loc("daysSinceLastPurchase")]
            trip_gap = group.iat[i, group.columns.get_loc("daysSinceLastTrip")]
            group.iat[i, group.columns.get_loc("daysSinceLastPurchase")] = prev_val + trip_gap
    return group

df_winndixie = df_winndixie.groupby("itemId", group_keys=False).apply(fill_item)

# Items with no purchase history get 999
df_winndixie["daysSinceLastPurchase"] = df_winndixie["daysSinceLastPurchase"].fillna(999)
df_winndixie.to_csv("daysSinceLastPurchase.csv", index=False)

  df_winndixie = df_winndixie.groupby("itemId", group_keys=False).apply(fill_item)


In [12]:
### NORMALIZE TO ENCODED_DF
freq_cols = [c for c in df_winndixie.columns if c.startswith("freq_")]
weather_cols = [c for c in df_winndixie.columns if c.endswith("_5day_avg")]
holiday_cols = [c for c in df_winndixie.columns if "holiday" in c.lower()]
school_cols = [c for c in df_winndixie.columns if "school" in c.lower()]
birthday_cols = [c for c in df_winndixie.columns if c.startswith("daysUntilBirthday_") or c.startswith("daysSinceBirthday_")]

daysSince_purchase_cols = ["daysSinceLastPurchase"]
daysSince_trip_cols     = ["daysSinceLastTrip"]

encoded_df = df_winndixie.copy()

encoded_df = normalizeAndDropCols(encoded_df, freq_cols)
encoded_df = normalizeAndDropCols(encoded_df, weather_cols)
encoded_df = normalizeAndDropCols(encoded_df, holiday_cols)
encoded_df = normalizeAndDropCols(encoded_df, school_cols)
encoded_df = normalizeAndDropCols(encoded_df, birthday_cols)
encoded_df = normalizeAndDropCols(encoded_df, daysSince_purchase_cols)
encoded_df = normalizeAndDropCols(encoded_df, daysSince_trip_cols)

encoded_df.info();
encoded_df.head(100)


<class 'pandas.core.frame.DataFrame'>
Index: 34687 entries, 0 to 34686
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   date                           34687 non-null  datetime64[ns]
 1   itemId                         34687 non-null  int64         
 2   item                           856 non-null    object        
 3   source                         856 non-null    object        
 4   time                           856 non-null    object        
 5   didBuy                         34687 non-null  int32         
 6   year                           34687 non-null  int32         
 7   month                          34687 non-null  int32         
 8   day                            34687 non-null  int32         
 9   dow                            34687 non-null  int32         
 10  doy                            34687 non-null  int32         
 11  quarter             

Unnamed: 0,date,itemId,item,source,time,didBuy,year,month,day,dow,doy,quarter,freq_7_norm,freq_15_norm,freq_30_norm,freq_90_norm,freq_365_norm,temp_5day_avg_norm,feelsLike_5day_avg_norm,dew_5day_avg_norm,humidity_5day_avg_norm,precip_5day_avg_norm,daysUntilNextHoliday_norm,daysSinceLastHoliday_norm,holidayProximityIndex_norm,daysUntilSchoolStart_norm,daysUntilSchoolEnd_norm,schoolSeasonIndex_norm,daysUntilBirthday_steve_norm,daysSinceBirthday_steve_norm,daysUntilBirthday_maggie_norm,daysSinceBirthday_maggie_norm,daysUntilBirthday_mil_norm,daysSinceBirthday_mil_norm,daysUntilBirthday_angie_norm,daysSinceBirthday_angie_norm,daysSinceLastPurchase_norm,daysSinceLastTrip_norm
0,2024-11-15,0,,,,0,2024,11,15,4,320,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.002059,-0.143288,0.190016,0.708576,-0.201862,-0.467567,-0.872987,-1.241597,0.707223,-0.012597,1.168973,-0.596128,0.596128,-0.63145,0.63145,-0.924515,0.962651,0.644747,-0.613726,0.0,-1.249006
1,2024-11-22,0,,,,0,2024,11,22,4,327,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.474907,-0.513929,-0.765328,-1.205962,1.160207,-0.805768,-0.525395,1.134769,0.643611,-0.085327,1.219174,-0.668556,0.668556,-0.703474,0.703474,-0.992493,1.030183,0.580557,-0.549165,0.0,1.006627
2,2024-11-24,0,,,,0,2024,11,24,6,329,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.994375,-0.933815,-1.334706,-1.55406,-0.775729,-0.902397,-0.426083,1.229824,0.625435,-0.106107,1.233517,-0.68925,0.68925,-0.724052,0.724052,-1.011915,1.049478,0.562217,-0.530719,0.0,-0.604539
3,2024-11-27,0,,,,0,2024,11,27,2,332,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.623089,-0.642228,-0.269315,1.230722,-0.694736,-1.047341,-0.277115,1.372406,0.598173,-0.137278,1.255032,-0.720291,0.720291,-0.754919,0.754919,-1.041049,1.078421,0.534707,-0.50305,0.0,-0.282306
4,2024-12-02,0,,,,0,2024,12,2,0,337,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-1.400625,-1.283721,-1.601054,-1.274068,-0.763876,0.015578,-0.872987,-1.241597,0.552735,-0.189228,1.29089,-0.772026,0.772026,-0.806365,0.806365,-1.089605,1.126658,0.488856,-0.456936,0.0,0.362161
5,2024-12-04,0,,,,0,2024,12,4,2,339,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-1.605414,-1.450898,-1.712697,-0.971375,-0.764864,-0.081051,-0.773675,-1.146542,0.53456,-0.210008,1.305234,-0.79272,0.79272,-0.826943,0.826943,-1.109027,1.145953,0.470516,-0.43849,0.0,-0.604539
6,2024-12-07,0,,,,0,2024,12,7,5,342,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-1.5871,-1.479409,-1.624977,-0.736787,0.487568,-0.225995,-0.624707,-1.00396,0.507297,-0.241178,1.326749,-0.823761,0.823761,-0.857811,0.857811,-1.138161,1.174896,0.443006,-0.410821,0.0,-0.282306
7,2024-12-12,0,,,,0,2024,12,12,3,347,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.916121,-0.904009,-0.720671,0.524437,0.090503,-0.467567,-0.376427,0.802078,0.46186,-0.293128,1.362607,-0.875495,0.875495,-0.909256,0.909256,-1.186717,1.223133,0.397156,-0.364706,0.0,0.362161
8,2024-12-28,0,,,,0,2024,12,28,5,363,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.531516,-0.561879,0.060829,2.315375,2.20818,-0.902397,-0.922643,-1.289124,0.316459,-0.459369,1.477353,-1.041046,1.041046,-1.073882,1.073882,-1.342096,1.377494,0.250436,-0.217138,0.0,3.906727
9,2024-12-29,0,,,,0,2024,12,29,6,364,4,-0.238218,-0.289156,-0.350584,-0.444741,-0.469532,-0.521526,-0.554103,0.086348,2.388526,2.257566,-0.950712,-0.872987,1.277351,0.307371,-0.469759,1.484525,-1.051393,1.051393,-1.084172,1.084172,-1.351807,1.387141,0.241265,-0.207915,0.0,-0.926773


In [13]:
# ============================================================
# 4. SINE COSINE on CYCLICAL FEATURES
# ============================================================

encoded_df["dow_sin"] =   np.sin(2 * np.pi * encoded_df["dow"] / 7.0)
encoded_df["dow_cos"] =   np.cos(2 * np.pi * encoded_df["dow"] / 7.0)
encoded_df["month_sin"] = np.sin(2 * np.pi * encoded_df["month"] / 12.0)
encoded_df["month_cos"] = np.cos(2 * np.pi * encoded_df["month"] / 12.0)
encoded_df["doy_sin"] =   np.sin(2 * np.pi * encoded_df["doy"] / 365.0)
encoded_df["doy_cos"] =   np.cos(2 * np.pi * encoded_df["doy"] / 365.0)

encoded_df = encoded_df.drop(columns=["dow","month","doy"])

## NON-CYCLIC TIME FEATURES
nonCycCols = ["year","day","quarter"]
encoded_df = normalizeAndDropCols(encoded_df, nonCycCols)
#

cols_to_drop = ["source","item",  "date"]
encoded_df = encoded_df.drop(columns=cols_to_drop, errors="ignore")

item_dummies = pd.get_dummies(encoded_df["itemId"], prefix="item")
encoded_df = pd.concat([encoded_df.drop(columns=["itemId"]), item_dummies], axis=1)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)
encoded_df.to_csv("encoded.csv", index=False)
encoded_df.info()





<class 'pandas.core.frame.DataFrame'>
Index: 34687 entries, 0 to 34686
Columns: 393 entries, didBuy to item_356
dtypes: bool(357), float64(35), int32(1)
memory usage: 21.5 MB


# TRAIN / BUILD MODEL

In [14]:
# ============================================================
# CLEAN PIPELINE — NO ONE-HOT — ITEMID AS NUMERIC FEATURE
# ============================================================

import os, json
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split


# ============================================================
# CALLBACK FOR TRAINING PROGRESS
# ============================================================
class EpochStatus(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(
            f"Epoch {epoch+1}: "
            f"loss={logs.get('loss'):.4f}  "
            f"val_loss={logs.get('val_loss'):.4f}  "
            f"acc={logs.get('accuracy'):.4f}"
        )


# ============================================================
# SAVE EXPERIMENT RESULTS
# ============================================================
def save_experiment(model, history, HP):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    name = f"exp_e{HP['epochs']}_b{HP['batch_size']}_lr{HP['learning_rate']}"
    exp_dir = os.path.join("experiments", f"{timestamp}_{name}")
    os.makedirs(exp_dir, exist_ok=True)

    with open(os.path.join(exp_dir, "hyperparams.json"), "w") as f:
        json.dump(HP, f, indent=4)

    pd.DataFrame(history.history).to_csv(
        os.path.join(exp_dir, "training_metrics.csv"), index=False
    )

    with open(os.path.join(exp_dir, "model_arch.json"), "w") as f:
        f.write(model.to_json())

    model.save_weights(os.path.join(exp_dir, "model.weights.h5"))

    print(f"\nSaved experiment to: {exp_dir}")
    return exp_dir


# ============================================================
# SINGLE-VALUE NORMALIZATION
# ============================================================
def normalize_single_value(df_raw, col_name, raw_value):
    col = df_raw[col_name].replace(999, np.nan)
    mean = col.mean()
    std = col.std() or 1.0
    if pd.isna(raw_value):
        return 0.0
    return (raw_value - mean) / std


# ============================================================
# TRIP FEATURE BUILDER (UNCHANGED)
# ============================================================
def build_today_trip_features(df_raw, encoded_df):
    today = pd.Timestamp.today().normalize()
    last_trip_date = df_raw["date"].max()
    idx = df_raw.index[df_raw["date"] == last_trip_date][0]

    weather_cols = [
        "temp_5day_avg_norm",
        "feelsLike_5day_avg_norm",
        "dew_5day_avg_norm",
        "humidity_5day_avg_norm",
        "precip_5day_avg_norm",
    ]
    weather_norm = {c: encoded_df.loc[idx, c] for c in weather_cols}

    days_since_last_trip_raw = (today - last_trip_date).days

    feats = {}
    feats["daysSinceLastTrip_norm"] = normalize_single_value(df_raw, "daysSinceLastTrip", days_since_last_trip_raw)

    # Time-based indexes
    d_unext = daysUntilNextHoliday(today)
    d_slast = daysSinceLastHoliday(today)
    h_index = holidayProximityIndex(today)
    feats["daysUntilNextHoliday_norm"] = normalize_single_value(df_raw, "daysUntilNextHoliday", d_unext)
    feats["daysSinceLastHoliday_norm"] = normalize_single_value(df_raw, "daysSinceLastHoliday", d_slast)
    feats["holidayProximityIndex_norm"] = normalize_single_value(df_raw, "holidayProximityIndex", h_index)

    # School season
    d_ss = daysUntilSchoolStart(today)
    d_se = daysUntilSchoolEnd(today)
    s_index = schoolSeasonIndex(today)
    feats["daysUntilSchoolStart_norm"] = normalize_single_value(df_raw, "daysUntilSchoolStart", d_ss)
    feats["daysUntilSchoolEnd_norm"] = normalize_single_value(df_raw, "daysUntilSchoolEnd", d_se)
    feats["schoolSeasonIndex_norm"] = normalize_single_value(df_raw, "schoolSeasonIndex", s_index)

    # Date components
    year_raw = today.year
    month_raw = today.month
    day_raw = today.day
    dow_raw = today.dayofweek
    doy_raw = today.dayofyear
    quarter_raw = (month_raw - 1)//3 + 1
    feats["year_norm"] = normalize_single_value(df_raw, "year", year_raw)
    feats["day_norm"] = normalize_single_value(df_raw, "day", day_raw)
    feats["quarter_norm"] = normalize_single_value(df_raw, "quarter", quarter_raw)

    # Cycles
    feats["dow_sin"] = np.sin(2*np.pi*dow_raw/7.0)
    feats["dow_cos"] = np.cos(2*np.pi*dow_raw/7.0)
    feats["month_sin"] = np.sin(2*np.pi*(month_raw-1)/12.0)
    feats["month_cos"] = np.cos(2*np.pi*(month_raw-1)/12.0)
    feats["doy_sin"] = np.sin(2*np.pi*(doy_raw-1)/365.0)
    feats["doy_cos"] = np.cos(2*np.pi*(doy_raw-1)/365.0)

    feats.update(weather_norm)
    return feats


# ============================================================
# ITEM FEATURES (freq + days since last purchase)
# ============================================================
def build_item_features_today(item_id, df_raw, encoded_df):
    mask = df_raw["itemId"] == item_id
    feats = {}

    if not mask.any():
        for w in [7, 15, 30, 90, 365]:
            feats[f"freq_{w}"] = 0.0
        feats["daysSinceLastPurchase_norm"] = 0.0
        return feats

    idx_last = df_raw.loc[mask, "date"].idxmax()
    row = encoded_df.loc[idx_last]

    for w in [7, 15, 30, 90, 365]:
        feats[f"freq_{w}"] = float(row[f"freq_{w}"])

    feats["daysSinceLastPurchase_norm"] = float(row["daysSinceLastPurchase_norm"])
    return feats


# ============================================================
# BUILD ROW FOR A SINGLE ITEM (NO ONE-HOT!)
# ============================================================
def build_feature_row_for_item_today(item_id, df_raw, encoded_df, trip_feats, feature_cols):
    values = {}

    # itemId STAYS NUMERIC
    values["itemId"] = float(item_id)

    # trip-level
    values.update(trip_feats)

    # item-level
    item_feats = build_item_features_today(item_id, df_raw, encoded_df)
    values.update(item_feats)

    # construct final row in exact column order
    return np.array([values.get(col, 0.0) for col in feature_cols], dtype=np.float32)


# ============================================================
# PREDICT FOR ALL ITEMS TODAY
# ============================================================
def predict_all_items_today(model, df_raw, encoded_df, feature_cols):
    item_lookup = df_raw[["itemId", "item"]].drop_duplicates().set_index("itemId")["item"].to_dict()

    trip_feats = build_today_trip_features(df_raw, encoded_df)
    item_ids = sorted(df_raw["itemId"].unique())

    X_rows = []
    for item_id in item_ids:
        row = build_feature_row_for_item_today(item_id, df_raw, encoded_df, trip_feats, feature_cols)
        X_rows.append(row)

    X = np.stack(X_rows)
    probs = model.predict(X).reshape(-1)

    return pd.DataFrame({
        "itemId": item_ids,
        "itemName": [item_lookup[i] for i in item_ids],
        "prob": probs
    }).sort_values("prob", ascending=False)


# ============================================================
# TRAINING PIPELINE — NO ONE HOT
# ============================================================
hidden_sizes = [10, 20, 50, 100, 200, 350, 512]

EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.0001

# all numeric columns except didBuy are features
feature_cols = [
    c for c in encoded_df.columns
    if c != "didBuy" and np.issubdtype(encoded_df[c].dtype, np.number)
]

X = encoded_df[feature_cols].to_numpy(np.float32)
y = encoded_df["didBuy"].to_numpy(np.float32)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

pos = y_train.sum()
neg = len(y_train) - pos
cw = min(50.0, neg/(pos+1e-6))
class_weights = {0: cw}

for h in hidden_sizes:
    print(f"\n=== Running model with hidden={h} ===")

    HP = {"epochs": EPOCHS, "batch_size": BATCH_SIZE, "learning_rate": LEARNING_RATE, "hidden": h}

    model = models.Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(h, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    history = model.fit(
        X_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        class_weight=class_weights,
        verbose=0,
        callbacks=[EpochStatus()]
    )

    exp_name = f"exp_hidden{h}"
    save_experiment(model, history, HP)

    preds = predict_all_items_today(model, df_winndixie, encoded_df, feature_cols)
    preds.to_csv(f"experiments/{exp_name}/predictions.csv", index=False)

    print(preds.head())



=== Running model with hidden=10 ===


InvalidArgumentError: Graph execution error:

indices[1] = 1 is not in [0, 1)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_914]

In [None]:
# ===================================================================
# FINAL TRAINING CELL — itemId via Embedding, all other numeric cols
# ===================================================================

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# ---------------------------------------------
# Fix: Separate numeric context features
# ---------------------------------------------
feature_cols = [
    c for c in encoded_df.columns
    if c not in ["didBuy", "itemId"] and np.issubdtype(encoded_df[c].dtype, np.number)
]

X_numeric = encoded_df[feature_cols].to_numpy(dtype=np.float32)
X_itemId  = encoded_df["itemId"].astype("int32").to_numpy()   # IMPORTANT
y         = encoded_df["didBuy"].astype("float32").to_numpy()

# ---------------------------------------------
# Train/val split
# ---------------------------------------------
Xn_train, Xn_test, Xi_train, Xi_test, y_train, y_test = train_test_split(
    X_numeric, X_itemId, y, test_size=0.2, random_state=42, shuffle=True
)

# ---------------------------------------------
# Class weights
# ---------------------------------------------
pos = y_train.sum()
neg = len(y_train) - pos
cw = min(50.0, neg / (pos + 1e-6))
class_weights = {0: cw}

# ---------------------------------------------
# Model hyperparameters
# ---------------------------------------------
hidden_sizes = [10, 20, 30, 40, 50, 75]
EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.0001

# Number of unique items (from your dataset)
NUM_ITEMS = 379
EMBED_DIM = 32  # can tune later

# ===================================================================
# LOOP OVER HIDDEN SIZES
# ===================================================================
for h in hidden_sizes:

    print("\n==============================")
    print(f" RUNNING EXPERIMENT: hidden={h}")
    print("==============================")

    # ---------------------------------------------------------------
    # Model definition
    # ---------------------------------------------------------------
    numeric_input = layers.Input(shape=(Xn_train.shape[1],), name="numeric_input")

    item_input = layers.Input(shape=(), dtype="int32", name="item_input")
    item_embed = layers.Embedding(input_dim=NUM_ITEMS, output_dim=EMBED_DIM)(item_input)
    item_embed = layers.Flatten()(item_embed)

    combined = layers.Concatenate()([numeric_input, item_embed])

    x = layers.Dense(h, activation="relu")(combined)
    output = layers.Dense(1, activation="sigmoid")(x)

    model = models.Model(inputs=[numeric_input, item_input], outputs=output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    # ---------------------------------------------------------------
    # Train
    # ---------------------------------------------------------------
    history = model.fit(
        {"numeric_input": Xn_train, "item_input": Xi_train},
        y_train,
        validation_split=0.1,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=0,
        class_weight=class_weights,
        callbacks=[EpochStatus()]
    )

    # Save experiment artifacts
    exp_name = f"exp_hidden{h}"
    save_experiment(model=model, history=history, HP={"hidden": h})

    print(f"Completed: {exp_name}")
