In [1]:

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import gc
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import os
from pandas.tseries.holiday import USFederalHolidayCalendar

from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from datetime import datetime

from winn_dixie_recpt_parser import WinnDixieRecptParser 

pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:.6f}")

print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

C:\Users\steve\OneDrive - NOLA Business IT\source\repos\grocery-ml
GPUs Available: []


In [2]:
def show_grouped(grouped, rows=10):
    # collect only the daysSinceLastPurchase_* columns
    feature_cols = [c for c in grouped.columns if c.startswith("daysSinceLastPurchase_")]

    for i in range(min(rows, len(grouped))):
        print("Row:", i)
        print("Date:", grouped.iloc[i]["date"])
        print("Time:", grouped.iloc[i]["time"])
        print("Items:", grouped.iloc[i]["itemCanonical"])
        print("------ daysSinceLastPurchase ------")

        for col in feature_cols:
            print(f"{col}: {grouped.iloc[i][col]}")

        print("-----------------------------------")

####################################################################
def show_encoded(encoded_df, rows=10):
    # Identify columns
    days_cols = [c for c in encoded_df.columns if c.startswith("daysSinceLastPurchase_")]
    weather_cols = [c for c in encoded_df.columns if c.endswith("_5day_avg")]
    item_cols = [
        c for c in encoded_df.columns 
        if c not in days_cols 
        and c not in weather_cols
        and c not in ["date", "time"]
    ]

    for i in range(min(rows, len(encoded_df))):
        print("Row:", i)
        print("Date:", encoded_df.iloc[i]["date"])
        print("Time:", encoded_df.iloc[i]["time"])

        # Show the items purchased (reverse one-hot)
        purchased_items = []
        row_vals = encoded_df.iloc[i]

        for item in item_cols:
            if row_vals[item] == 1:
                purchased_items.append(item)

        print("Items:", purchased_items)

        print("------ daysSinceLastPurchase ------")
        for col in days_cols:
            print(f"{col}: {encoded_df.iloc[i][col]}")

        print("------ weather (rolling windows) ------")
        for col in weather_cols:
            print(f"{col}: {encoded_df.iloc[i][col]}")

        print("-----------------------------------")
####################################################################

def remove_duplicate_receipt_files(df):
    """
    Remove whole source files that contain an identical receipt
    to another file with the same date+time.
    Minimal console output. Resets index at end.
    """

    df["__signature"] = (
        df["date"].astype(str) + "|" +
        df["time"].astype(str) + "|" +
        df["item"].astype(str) + "|"
        #df["qty"].astype(str) + "|" +
        #df["youPay"].astype(str) + "|" +
        #df["reg"].astype(str) + "|" +
        #df["reportedItemsSold"].astype(str) + "|" +
        #df["cashier"].astype(str) + "|" +
        #df["manager"].astype(str)
    )

    keep_sources = set()

    for (dt_date, dt_time), group in df.groupby(["date", "time"]):

        # Build signature per source
        source_signatures = {}
        for source, rows in group.groupby("source"):
            sig = tuple(sorted(rows["__signature"].tolist()))
            source_signatures[source] = sig

        # signature → list of sources
        signature_groups = {}
        for src, sig in source_signatures.items():
            signature_groups.setdefault(sig, []).append(src)

        # Handle duplicates
        for sig, sources in signature_groups.items():
            if len(sources) == 1:
                keep_sources.add(sources[0])
                continue

            sorted_sources = sorted(sources)
            kept = sorted_sources[0]
            removed = sorted_sources[1:]

            # Minimal output
            print(f"DUP: {dt_date} {dt_time} → keep {kept} ← drop {', '.join(removed)}")

            keep_sources.add(kept)

    # Filter and clean
    result = df[df["source"].isin(keep_sources)].copy()
    result.drop(columns=["__signature"], inplace=True)

    # ✔ Reset index here
    result.reset_index(drop=True, inplace=True)

    return result
####################################################################


def rolling_freq(df, window_days):
    # Pre-group all purchase dates by itemId
    item_dates = {}
    for idx, row in df.iterrows():
        item = row["itemId"]
        date = row["date"]
        if item not in item_dates:
            item_dates[item] = []
        item_dates[item].append(date)

    # Sort date lists
    for item in item_dates:
        item_dates[item].sort()

    out = []
    delta = pd.Timedelta(days=window_days)

    # For each row, count dates in the window
    for idx, row in df.iterrows():
        item = row["itemId"]
        now = row["date"]
        cutoff = now - delta

        dates = item_dates[item]

        # Count efficiently with binary search
        left = np.searchsorted(dates, cutoff, side="right")
        right = np.searchsorted(dates, now, side="left")
        out.append(right - left)

    return out


In [3]:
def daysUntilNextHoliday(d):
    d = pd.to_datetime(d)
    holidays = USFederalHolidayCalendar().holidays()
    diffs = (holidays - d).days
    diffs = diffs[diffs >= 0]
    return diffs.min() if len(diffs) > 0 else np.nan
####################################################################

def daysSinceLastHoliday(d):
    d = pd.to_datetime(d)
    holidays = USFederalHolidayCalendar().holidays()
    diffs = (d - holidays).days
    diffs = diffs[diffs >= 0]
    return diffs.min() if len(diffs) > 0 else np.nan
####################################################################

def holidayProximityIndex(d, scale=30):
    """
    Returns a smooth value between -1 and +1 depending on
    distance to holidays. Neural networks LOVE this.
    Negative = after holiday
    Positive = before holiday
    """
    before = daysUntilNextHoliday(d)
    after = daysSinceLastHoliday(d)

    if pd.isna(before) and pd.isna(after):
        return 0

    # choose the nearest side (before or after)
    if before <= after:
        return +max(0, (scale - before) / scale)
    else:
        return -max(0, (scale - after) / scale)
####################################################################

def daysUntilBirthday(d, bday):
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    this_year = pd.Timestamp(d.year, bday.month, bday.day)
    if d <= this_year:
        return (this_year - d).days
    else:
        next_year = pd.Timestamp(d.year + 1, bday.month, bday.day)
        return (next_year - d).days
####################################################################

def daysSinceBirthday(d, bday):
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    this_year = pd.Timestamp(d.year, bday.month, bday.day)
    if d >= this_year:
        return (d - this_year).days
    else:
        last_year = pd.Timestamp(d.year - 1, bday.month, bday.day)
        return (d - last_year).days
####################################################################

def tempDeviation(actualTemp, avgTemp):
    """Signed deviation (continuous). Neural-network gold."""
    return actualTemp - avgTemp
####################################################################

def humidityDeviation(actualHumidity, avgHumidity):
    return actualHumidity - avgHumidity
####################################################################

def precipDeviation(actual, avg):
    return actual - avg
####################################################################

def daysUntilSchoolStart(d):
    d = pd.to_datetime(d)
    start = pd.Timestamp(d.year, 8, 15)
    if d <= start:
        return (start - d).days
    else:
        next_start = pd.Timestamp(d.year + 1, 8, 15)
        return (next_start - d).days
####################################################################

def daysUntilSchoolEnd(d):
    d = pd.to_datetime(d)
    end = pd.Timestamp(d.year, 5, 31)
    if d <= end:
        return (end - d).days
    else:
        next_end = pd.Timestamp(d.year + 1, 5, 31)
        return (next_end - d).days
####################################################################

def schoolSeasonIndex(d):
    """
    Smooth 0→1 curve inside school season.
    <0 before season, >1 after.
    Good for neural nets.
    """
    d = pd.to_datetime(d)
    start = pd.Timestamp(d.year, 8, 15)
    end   = pd.Timestamp(d.year, 5, 31)

    # If date is after Dec, school season continues in Jan–May.
    if d < start:
        return -((start - d).days) / 365.0
    elif start <= d <= end:
        return (d - start).days / (end - start).days
    else:
        return (d - end).days / 365.0

####################################################################


def normalizeAndDropCols(df, cols):
    for col in cols:
        # Replace the sentinel 999 with NaN so it doesn't distort mean/std
        df[col] = df[col].replace(999, np.nan)

        # Compute mean/std ignoring NaN
        mean = df[col].mean()
        std  = df[col].std() or 1.0

        # Normalize
        df[col + "_norm"] = (df[col] - mean) / std

        # After normalization: missing values become 0 (neutral)
        df[col + "_norm"] = df[col + "_norm"].fillna(0.0)

    return df.drop(columns=cols)
####################################################################


def apply_canonical(df, patterns, canonical_name, item_col="item"):
    df["itemCanonical"] = df["itemCanonical"].where(
        ~df[item_col].isin(patterns),
        canonical_name
    )
    return df


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)

In [4]:
# --- WEATHER PREP ---
weatherCols=["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv("datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv", usecols=weatherCols)

df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime").sort_index()

df_weather["temp_5day_avg"] = df_weather["temp"].rolling(5, min_periods=1).mean()
df_weather["feelsLike_5day_avg"] = df_weather["feelslike"].rolling(5, min_periods=1).mean()
df_weather["dew_5day_avg"] = df_weather["dew"].rolling(5, min_periods=1).mean()
df_weather["humidity_5day_avg"] = df_weather["humidity"].rolling(5, min_periods=1).mean()
df_weather["precip_5day_avg"] = df_weather["precip"].rolling(5, min_periods=1).mean()

df_weather = df_weather.drop(columns=["temp", "humidity", "feelslike", "dew", "precip"])

# convert index to date for merging
df_weather["date"] = df_weather.index.date
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather = df_weather.set_index("date")


In [5]:
rows = []

recptParser  = WinnDixieRecptParser();

for p in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    for r in result["items"]:
        rows.append({
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            #"manager": result["manager"],
            #"cashier": result["cashier"],
            "item": r["item"],
            #"qty": r["qty"],
            #"reg": r["reg"],
            #"youPay": r["youPay"],
            #"reportedItemsSold": result["reported"],
            #"rowsMatchReported": result["validation"]["rowsMatchReported"],
            #"qtyMatchReported": result["validation"]["qtyMatchReported"],
        })

df_winndixie = pd.DataFrame(rows)

df_winndixie["date"] = pd.to_datetime(df_winndixie["date"])
df_winndixie["time"] = df_winndixie["time"].astype(str)
df_winndixie = remove_duplicate_receipt_files(df_winndixie)
df_winndixie = df_winndixie.sort_values(by=["date", "time"]).reset_index(drop=True)
df_winndixie.info()
df_winndixie["itemCanonical"] = df_winndixie["item"]
df_winndixie.info()

DUP: 2025-02-19 00:00:00 7:20 PM → keep IMG_9734.txt ← drop IMG_9735.txt
DUP: 2025-04-08 00:00:00 11:50 AM → keep IMG_9723.txt ← drop IMG_9724.txt
DUP: 2025-05-15 00:00:00 8:19 PM → keep IMG_9713.txt ← drop IMG_9714.txt
DUP: 2025-08-02 00:00:00 10:29 PM → keep IMG_9693.txt ← drop IMG_9694.txt
DUP: 2025-10-07 00:00:00 6:06 PM → keep IMG_0017.txt ← drop IMG_9669.txt
DUP: 2025-10-14 00:00:00 4:06 PM → keep IMG_0015.txt ← drop IMG_9667.txt
DUP: 2025-10-14 00:00:00 6:08 PM → keep IMG_0014.txt ← drop IMG_9666.txt
DUP: 2025-10-17 00:00:00 9:18 PM → keep IMG_0013.txt ← drop IMG_9664.txt
DUP: 2025-10-17 00:00:00 9:23 PM → keep IMG_0012.txt ← drop IMG_9663.txt
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856 entries, 0 to 855
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   source  856 non-null    object        
 1   date    856 non-null    datetime64[ns]
 2   time    856 non-null    object        
 3   item    856 

In [6]:
 milk_patterns = [
    "kleinpeter-milk",
    "know-and-love-milk",
    "fairlife-milk",
    #"goya-milk",
    "kandl-milk",
    "prairie-farm-milk",
    "kl-milk"
    #"milk-strw"
]

df_winndixie = apply_canonical(df_winndixie, milk_patterns, "milk")

In [7]:
### CREATE ITEM IDs


unique_items = sorted(df_winndixie["itemCanonical"].unique())
item_to_id = {item: idx for idx, item in enumerate(unique_items)}
id_to_item = {idx: item for item, idx in item_to_id.items()}
df_winndixie["itemId"] = df_winndixie["itemCanonical"].map(item_to_id)
df_winndixie.reset_index(drop=True, inplace=True)
df_winndixie.info()
df_winndixie.head(100)

df_winndixie = df_winndixie.drop(columns=["item"])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856 entries, 0 to 855
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   source         856 non-null    object        
 1   date           856 non-null    datetime64[ns]
 2   time           856 non-null    object        
 3   item           856 non-null    object        
 4   itemCanonical  856 non-null    object        
 5   itemId         856 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 40.2+ KB


In [8]:
# ============================================================
# Build full receipt × item table WITHOUT using qty
# ============================================================

# 1. Mark actual purchases in the raw receipt rows
df_winndixie["didBuy"] = 1

# 2. Build complete grid
all_items = df_winndixie["itemId"].unique()
all_dates = df_winndixie["date"].unique()

full = (
    pd.MultiIndex.from_product(
        [all_dates, all_items], 
        names=["date", "itemId"]
    ).to_frame(index=False)
)

# 3. Merge raw purchases onto the full grid
df_full = full.merge(
    df_winndixie[["date", "itemId", "itemCanonical", "source", "time", "didBuy"]],
    on=["date", "itemId"],
    how="left"
)

# 4. Fill missing purchases with didBuy=0
df_full["didBuy"] = df_full["didBuy"].fillna(0).astype(int)

# 5. NOW REPLACE df_winndixie with df_full
df_winndixie = df_full.copy()

df_winndixie.to_csv("df_fullreceipts.csv", index=False)


In [9]:
# %%
### Frequency per item

#df_winndixie["freq_7"]  = rolling_freq(df_winndixie, 7)
#df_winndixie["freq_15"]  = rolling_freq(df_winndixie, 15)
#df_winndixie["freq_30"] = rolling_freq(df_winndixie, 30)
#df_winndixie["freq_90"] = rolling_freq(df_winndixie, 90)
#df_winndixie["freq_180"] = rolling_freq(df_winndixie, 180)
#df_winndixie["freq_365"] = rolling_freq(df_winndixie, 365)

#df_winndixie.to_csv("df_winndixie_freq.csv", index=False)
#df_winndixie.info()
#df_winndixie.head(500)

In [10]:
# 1. Build grouped table (one row per trip date)

grouped = ( df_winndixie[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)

# 2. daysSinceLastTrip
grouped["daysSinceLastTrip"] = grouped["date"].diff().dt.days.fillna(0)

# 3. Holiday / Birthday / School features
grouped["daysUntilNextHoliday"] = grouped["date"].apply(daysUntilNextHoliday)
grouped["daysSinceLastHoliday"] = grouped["date"].apply(daysSinceLastHoliday)
grouped["holidayProximityIndex"] = grouped["date"].apply(holidayProximityIndex)
grouped["daysUntilSchoolStart"] = grouped["date"].apply(daysUntilSchoolStart)
grouped["daysUntilSchoolEnd"]   = grouped["date"].apply(daysUntilSchoolEnd)
grouped["schoolSeasonIndex"]    = grouped["date"].apply(schoolSeasonIndex)

dt = grouped["date"]
grouped["year"]    = dt.dt.year
grouped["month"]   = dt.dt.month
grouped["day"]     = dt.dt.day
grouped["dow"]     = dt.dt.dayofweek
grouped["doy"]     = dt.dt.dayofyear
grouped["quarter"] = dt.dt.quarter


BIRTHDAYS = {
    "steve":  "03-05-1980",  # fill with your real dates
    "maggie": "03-03-2016",
    "mil":    "01-27-1962",
    "angie":  "08-11-1981",
}

grouped["daysUntilBirthday_steve"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["steve"]))
grouped["daysSinceBirthday_steve"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["steve"]))
grouped["daysUntilBirthday_maggie"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["maggie"]))
grouped["daysSinceBirthday_maggie"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["maggie"]))
grouped["daysUntilBirthday_mil"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["mil"]))
grouped["daysSinceBirthday_mil"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["mil"]))
grouped["daysUntilBirthday_angie"] = grouped["date"].apply(lambda d: daysUntilBirthday(d, BIRTHDAYS["angie"]))
grouped["daysSinceBirthday_angie"] = grouped["date"].apply(lambda d: daysSinceBirthday(d, BIRTHDAYS["angie"]))

# merge in weather
grouped = grouped.merge(df_weather, on="date", how="left")

df_winndixie = df_winndixie.merge(grouped, on="date", how="left")
df_winndixie.info()
df_winndixie.head(10)
df_winndixie.to_csv("df_winndixie_group_level.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36428 entries, 0 to 36427
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      36428 non-null  datetime64[ns]
 1   itemId                    36428 non-null  int64         
 2   itemCanonical             856 non-null    object        
 3   source                    856 non-null    object        
 4   time                      856 non-null    object        
 5   didBuy                    36428 non-null  int32         
 6   daysSinceLastTrip         36428 non-null  float64       
 7   daysUntilNextHoliday      36428 non-null  int64         
 8   daysSinceLastHoliday      36428 non-null  int64         
 9   holidayProximityIndex     36428 non-null  float64       
 10  daysUntilSchoolStart      36428 non-null  int64         
 11  daysUntilSchoolEnd        36428 non-null  int64         
 12  schoolSeasonIndex 

In [11]:
# ============================================================
# INCREASING DAILY daysSinceLastPurchase (resets on purchase)
# ============================================================

df_winndixie = df_winndixie.sort_values(["itemId", "date"]).reset_index(drop=True)

# Start with NaN everywhere
df_winndixie["daysSinceLastPurchase"] = np.nan

# Set 0 on purchase days
df_winndixie.loc[df_winndixie["didBuy"] == 1, "daysSinceLastPurchase"] = 0

def fill_item(group):
    group = group.copy()
    # iterate row-by-row using positional index
    for i in range(1, len(group)):
        if pd.isna(group.iat[i, group.columns.get_loc("daysSinceLastPurchase")]):
            prev_val = group.iat[i-1, group.columns.get_loc("daysSinceLastPurchase")]
            trip_gap = group.iat[i, group.columns.get_loc("daysSinceLastTrip")]
            group.iat[i, group.columns.get_loc("daysSinceLastPurchase")] = prev_val + trip_gap
    return group

df_winndixie = df_winndixie.groupby("itemId", group_keys=False).apply(fill_item)

# Items with no purchase history get 999
df_winndixie["daysSinceLastPurchase"] = df_winndixie["daysSinceLastPurchase"].fillna(999)

df_winndixie.to_csv("df_winndixie_daysSinceLastPurchase.csv", index=False)


  df_winndixie = df_winndixie.groupby("itemId", group_keys=False).apply(fill_item)


In [13]:
### NORMALIZE TO ENCODED_DF
freq_cols = [c for c in df_winndixie.columns if c.startswith("freq_")]
weather_cols = [c for c in df_winndixie.columns if c.endswith("_5day_avg")]
holiday_cols = [c for c in df_winndixie.columns if "holiday" in c.lower()]
school_cols = [c for c in df_winndixie.columns if "school" in c.lower()]
birthday_cols = [c for c in df_winndixie.columns if c.startswith("daysUntilBirthday_") or c.startswith("daysSinceBirthday_")]

daysSince_purchase_cols = ["daysSinceLastPurchase"]
daysSince_trip_cols     = ["daysSinceLastTrip"]

encoded_df = df_winndixie.copy()

encoded_df = normalizeAndDropCols(encoded_df, freq_cols)
encoded_df = normalizeAndDropCols(encoded_df, weather_cols)
encoded_df = normalizeAndDropCols(encoded_df, holiday_cols)
encoded_df = normalizeAndDropCols(encoded_df, school_cols)
encoded_df = normalizeAndDropCols(encoded_df, birthday_cols)
encoded_df = normalizeAndDropCols(encoded_df, daysSince_purchase_cols)
encoded_df = normalizeAndDropCols(encoded_df, daysSince_trip_cols)

encoded_df.info();
encoded_df.head(100)

#encoded_df[["date", "itemId", "itemCanonical", "didBuy", "freq_7", "daysSinceLastPurchase_norm"]]

<class 'pandas.core.frame.DataFrame'>
Index: 36910 entries, 0 to 36909
Data columns (total 33 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   date                           36910 non-null  datetime64[ns]
 1   itemId                         36910 non-null  int64         
 2   item                           856 non-null    object        
 3   source                         856 non-null    object        
 4   time                           856 non-null    object        
 5   didBuy                         36910 non-null  int32         
 6   year                           36910 non-null  int32         
 7   month                          36910 non-null  int32         
 8   day                            36910 non-null  int32         
 9   dow                            36910 non-null  int32         
 10  doy                            36910 non-null  int32         
 11  quarter             

Unnamed: 0,date,itemId,item,source,time,didBuy,year,month,day,dow,doy,quarter,temp_5day_avg_norm,feelsLike_5day_avg_norm,dew_5day_avg_norm,humidity_5day_avg_norm,precip_5day_avg_norm,daysUntilNextHoliday_norm,daysSinceLastHoliday_norm,holidayProximityIndex_norm,daysUntilSchoolStart_norm,daysUntilSchoolEnd_norm,schoolSeasonIndex_norm,daysUntilBirthday_steve_norm,daysSinceBirthday_steve_norm,daysUntilBirthday_maggie_norm,daysSinceBirthday_maggie_norm,daysUntilBirthday_mil_norm,daysSinceBirthday_mil_norm,daysUntilBirthday_angie_norm,daysSinceBirthday_angie_norm,daysSinceLastPurchase_norm,daysSinceLastTrip_norm
0,2024-11-15,0,,,,0,2024,11,15,4,320,4,-0.002085,-0.143321,0.189949,0.708408,-0.201931,-0.467575,-0.872996,-1.241696,0.707388,-0.01257,1.16919,-0.596115,0.596115,-0.631447,0.631447,-0.924602,0.962751,0.644886,-0.613859,0.0,-1.248909
1,2024-11-22,0,,,,0,2024,11,22,4,327,4,-0.474928,-0.513951,-0.765386,-1.206119,1.159999,-0.805754,-0.525403,1.13476,0.643776,-0.085296,1.219392,-0.668539,0.668539,-0.703467,0.703467,-0.992577,1.030281,0.580697,-0.549299,0.0,1.006574
2,2024-11-24,0,,,,0,2024,11,24,6,329,4,-0.99439,-0.933826,-1.33476,-1.554215,-0.77574,-0.902376,-0.426091,1.229819,0.625601,-0.106074,1.233735,-0.689232,0.689232,-0.724044,0.724044,-1.011999,1.049575,0.562357,-0.530854,0.0,-0.604485
3,2024-11-27,0,,,,0,2024,11,27,2,332,4,-0.623108,-0.642246,-0.269377,1.230552,-0.694755,-1.04731,-0.277123,1.372406,0.598338,-0.137242,1.25525,-0.720271,0.720271,-0.754909,0.754909,-1.041131,1.078517,0.534848,-0.503185,0.0,-0.282274
4,2024-12-02,0,,,,0,2024,12,2,0,337,4,-1.400635,-1.283722,-1.601105,-1.274225,-0.763888,0.015538,-0.872996,-1.241696,0.552901,-0.189189,1.291108,-0.772002,0.772002,-0.806352,0.806352,-1.089685,1.126752,0.488998,-0.457071,0.0,0.36215
5,2024-12-04,0,,,,0,2024,12,4,2,339,4,-1.605423,-1.450894,-1.712747,-0.971533,-0.764876,-0.081085,-0.773684,-1.146638,0.534726,-0.209968,1.305452,-0.792695,0.792695,-0.826928,0.826928,-1.109107,1.146046,0.470658,-0.438626,0.0,-0.604485
6,2024-12-07,0,,,,0,2024,12,7,5,342,4,-1.587109,-1.479404,-1.625029,-0.736946,0.487428,-0.226018,-0.624716,-1.004051,0.507464,-0.241136,1.326966,-0.823734,0.823734,-0.857794,0.857794,-1.138239,1.174988,0.443149,-0.410957,0.0,-0.282274
7,2024-12-12,0,,,,0,2024,12,12,3,347,4,-0.916137,-0.90402,-0.72073,0.524271,0.090405,-0.467575,-0.376435,0.802056,0.462026,-0.293082,1.362825,-0.875465,0.875465,-0.909236,0.909236,-1.186793,1.223223,0.397299,-0.364843,0.0,0.36215
8,2024-12-28,0,,,,0,2024,12,28,5,363,4,-0.531536,-0.5619,0.060764,2.315199,2.207865,-0.902376,-0.922652,-1.289225,0.316627,-0.459312,1.477571,-1.041006,1.041006,-1.073852,1.073852,-1.342166,1.377578,0.250581,-0.217279,0.0,3.90648
9,2024-12-29,0,,,,0,2024,12,29,6,364,4,-0.521546,-0.554124,0.086282,2.388349,2.257246,-0.950687,-0.872996,1.277348,0.307539,-0.469701,1.484743,-1.051352,1.051352,-1.084141,1.084141,-1.351876,1.387225,0.241411,-0.208056,0.0,-0.926697


In [None]:
# ============================================================
# 4. SINE COSINE on CYCLICAL FEATURES
# ============================================================

encoded_df["dow_sin"] =   np.sin(2 * np.pi * encoded_df["dow"] / 7.0)
encoded_df["dow_cos"] =   np.cos(2 * np.pi * encoded_df["dow"] / 7.0)
encoded_df["month_sin"] = np.sin(2 * np.pi * encoded_df["month"] / 12.0)
encoded_df["month_cos"] = np.cos(2 * np.pi * encoded_df["month"] / 12.0)
encoded_df["doy_sin"] =   np.sin(2 * np.pi * encoded_df["doy"] / 365.0)
encoded_df["doy_cos"] =   np.cos(2 * np.pi * encoded_df["doy"] / 365.0)

encoded_df = encoded_df.drop(columns=["dow","month","doy"])

## NON-CYCLIC TIME FEATURES
nonCycCols = ["year","day","quarter"]
encoded_df = normalizeAndDropCols(encoded_df, nonCycCols)
#

cols_to_drop = ["source","manager","time", "cashier", "qty", "itemCanonical", "reg", "youPay", "reportedItemsSold", "qtyMatchReported", "shopper" ,"date"]
encoded_df = encoded_df.drop(columns=cols_to_drop, errors="ignore")

encoded_df.to_csv("encoded.csv", index=False)
encoded_df.info()
encoded_df.head(10)


# TRAIN / BUILD MODEL

In [None]:
# ============================================================
# FULLY SELF-CONTAINED TRAINING + SAVING METHODS (FINAL)
# ============================================================

import os, json
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split


# ============================================================
# TRAINING PROGRESS CALLBACK
# ============================================================
class EpochStatus(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(
            f"Epoch {epoch+1}: "
            f"loss={logs.get('loss'):.4f}  "
            f"val_loss={logs.get('val_loss'):.4f}  "
            f"acc={logs.get('accuracy'):.4f}"
        )

# ============================================================
# SAVE EXPERIMENT (MODEL, METRICS, HYPERPARAMS)
# ============================================================
def save_experiment(model, history, HP):

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    name = f"exp_e{HP['epochs']}_b{HP['batch_size']}_lr{HP['learning_rate']}"
    exp_dir = os.path.join("experiments", f"{timestamp}_{name}")
    os.makedirs(exp_dir, exist_ok=True)

    # Save hyperparameters
    with open(os.path.join(exp_dir, "hyperparams.json"), "w") as f:
        json.dump(HP, f, indent=4)

    # Save metrics CSV
    pd.DataFrame(history.history).to_csv(
        os.path.join(exp_dir, "training_metrics.csv"), index=False
    )

    # Save model architecture
    with open(os.path.join(exp_dir, "model_arch.json"), "w") as f:
        f.write(model.to_json())

    # Save weights
    model.save_weights(os.path.join(exp_dir, "model.weights.h5"))

    print(f"\nSaved experiment to: {exp_dir}")

    return exp_dir
##################################################################################

def get_last_trip_weather_norm(df_raw, encoded_df):
    """
    Returns a dict of { weather_norm_col: value } from the last trip.
    """
    last_trip_date = df_raw["date"].max()
    mask = df_raw["date"] == last_trip_date
    idx = df_raw.index[mask][0]

    weather_cols = [
        "temp_5day_avg_norm",
        "feelsLike_5day_avg_norm",
        "dew_5day_avg_norm",
        "humidity_5day_avg_norm",
        "precip_5day_avg_norm",
    ]

    weather = {}
    for col in weather_cols:
        weather[col] = encoded_df.loc[idx, col]

    return weather, last_trip_date
##################################################################################


def build_today_trip_features(df_raw, encoded_df):
    """
    Builds all trip-level features for *today*, normalized,
    using last trip for weather and daysSinceLastTrip.
    """
    today = pd.Timestamp.today().normalize()

    weather_norm, last_trip_date = get_last_trip_weather_norm(df_raw, encoded_df)

    # --- raw trip-level values for today ---
    days_since_last_trip_raw = (today - last_trip_date).days

    d_unext  = daysUntilNextHoliday(today)
    d_slast  = daysSinceLastHoliday(today)
    h_index  = holidayProximityIndex(today)

    d_ss     = daysUntilSchoolStart(today)
    d_se     = daysUntilSchoolEnd(today)
    s_index  = schoolSeasonIndex(today)

    year_raw     = today.year
    month_raw    = today.month
    day_raw      = today.day
    dow_raw      = today.dayofweek        # 0–6
    doy_raw      = today.dayofyear        # 1–366
    quarter_raw  = (month_raw - 1) // 3 + 1

    # birthdays
    d_steve_u = daysUntilBirthday(today, BIRTHDAYS["steve"])
    d_steve_s = daysSinceBirthday(today, BIRTHDAYS["steve"])
    d_mag_u   = daysUntilBirthday(today, BIRTHDAYS["maggie"])
    d_mag_s   = daysSinceBirthday(today, BIRTHDAYS["maggie"])
    d_mil_u   = daysUntilBirthday(today, BIRTHDAYS["mil"])
    d_mil_s   = daysSinceBirthday(today, BIRTHDAYS["mil"])
    d_ang_u   = daysUntilBirthday(today, BIRTHDAYS["angie"])
    d_ang_s   = daysSinceBirthday(today, BIRTHDAYS["angie"])

    # --- normalize using training stats ---
    feats = {}

    feats["daysSinceLastTrip_norm"]        = normalize_single_value(df_raw, "daysSinceLastTrip",       days_since_last_trip_raw)
    feats["daysUntilNextHoliday_norm"]     = normalize_single_value(df_raw, "daysUntilNextHoliday",    d_unext)
    feats["daysSinceLastHoliday_norm"]     = normalize_single_value(df_raw, "daysSinceLastHoliday",    d_slast)
    feats["holidayProximityIndex_norm"]    = normalize_single_value(df_raw, "holidayProximityIndex",   h_index)

    feats["daysUntilSchoolStart_norm"]     = normalize_single_value(df_raw, "daysUntilSchoolStart",    d_ss)
    feats["daysUntilSchoolEnd_norm"]       = normalize_single_value(df_raw, "daysUntilSchoolEnd",      d_se)
    feats["schoolSeasonIndex_norm"]        = normalize_single_value(df_raw, "schoolSeasonIndex",       s_index)

    feats["year_norm"]                     = normalize_single_value(df_raw, "year",                    year_raw)
    feats["day_norm"]                      = normalize_single_value(df_raw, "day",                     day_raw)
    feats["quarter_norm"]                  = normalize_single_value(df_raw, "quarter",                 quarter_raw)

    feats["daysUntilBirthday_steve_norm"]  = normalize_single_value(df_raw, "daysUntilBirthday_steve", d_steve_u)
    feats["daysSinceBirthday_steve_norm"]  = normalize_single_value(df_raw, "daysSinceBirthday_steve", d_steve_s)
    feats["daysUntilBirthday_maggie_norm"] = normalize_single_value(df_raw, "daysUntilBirthday_maggie", d_mag_u)
    feats["daysSinceBirthday_maggie_norm"] = normalize_single_value(df_raw, "daysSinceBirthday_maggie", d_mag_s)
    feats["daysUntilBirthday_mil_norm"]    = normalize_single_value(df_raw, "daysUntilBirthday_mil",    d_mil_u)
    feats["daysSinceBirthday_mil_norm"]    = normalize_single_value(df_raw, "daysSinceBirthday_mil",    d_mil_s)
    feats["daysUntilBirthday_angie_norm"]  = normalize_single_value(df_raw, "daysUntilBirthday_angie",  d_ang_u)
    feats["daysSinceBirthday_angie_norm"]  = normalize_single_value(df_raw, "daysSinceBirthday_angie",  d_ang_s)

    # cyclical encodings for today (no normalization)
    feats["dow_sin"]   = np.sin(2 * np.pi * dow_raw / 7.0)
    feats["dow_cos"]   = np.cos(2 * np.pi * dow_raw / 7.0)
    feats["month_sin"] = np.sin(2 * np.pi * (month_raw - 1) / 12.0)
    feats["month_cos"] = np.cos(2 * np.pi * (month_raw - 1) / 12.0)
    feats["doy_sin"]   = np.sin(2 * np.pi * (doy_raw - 1) / 365.0)
    feats["doy_cos"]   = np.cos(2 * np.pi * (doy_raw - 1) / 365.0)

    # copy over last-trip normalized weather as requested
    feats.update(weather_norm)

    return feats
##################################################################################


def build_item_features_today(item_id, df_raw):
    """
    Compute daysSinceLastPurchase_norm for this item, for *today*.
    """
    today = pd.Timestamp.today().normalize()

    mask = df_raw["itemId"] == item_id
    last_purchase_date = df_raw.loc[mask, "date"].max()

    days_since_last_purchase_raw = (today - last_purchase_date).days

    feats = {}
    feats["daysSinceLastPurchase_norm"] = normalize_single_value(
        df_raw,
        "daysSinceLastPurchase",
        days_since_last_purchase_raw
    )
    return feats
##################################################################################


def build_feature_row_for_item_today(item_id, df_raw, encoded_df, trip_feats, feature_cols):
    """
    Build a 1D np.array in the exact column order that the model was trained on.
    """
    item_feats = build_item_features_today(item_id, df_raw)

    values = {}

    # identity
    values["itemId"] = item_id

    # trip-level
    values.update(trip_feats)

    # item-level
    values.update(item_feats)

    # now order according to feature_cols
    row = []
    for col in feature_cols:
        if col == "didBuy":
            continue
        row.append(values[col])
    return np.array(row, dtype=np.float32)
##################################################################################


def predict_all_items_today(model, df_raw, encoded_df):
    """
    Runs EVERY itemId through the model for *today*
    and returns ALL items with item names and probabilities,
    sorted highest → lowest.
    """

    # build itemId -> itemName lookup
    item_lookup = (
        df_raw[["itemId", "itemCanonical"]]
        .drop_duplicates()
        .set_index("itemId")["itemCanonical"]
        .to_dict()
    )

    # exact training feature order
    feature_cols = [c for c in encoded_df.columns if c != "didBuy"]

    # shared trip features for today
    trip_feats = build_today_trip_features(df_raw, encoded_df)

    # all unique itemIds
    item_ids = sorted(df_raw["itemId"].unique())

    X_rows = []
    items = []

    for item_id in item_ids:
        row = build_feature_row_for_item_today(
            item_id,
            df_raw,
            encoded_df,
            trip_feats,
            feature_cols
        )
        X_rows.append(row)
        items.append(item_id)

    X = np.stack(X_rows, axis=0)

    # predict
    probs = model.predict(X).reshape(-1)

    # assemble output
    result = pd.DataFrame({
        "itemId": items,
        "itemName": [item_lookup[i] for i in items],
        "prob": probs
    })

    result = result.sort_values("prob", ascending=False)

    return result
##################################################################################

def predict_next_trip(model, encoded_df, input_feature_cols, frequent_items):
    """
    Build a single input row for prediction using the most recent trip in encoded_df.
    Returns a sorted DataFrame of predicted probabilities.
    """

    # ------------------------------
    # 1. Get the most recent row (latest trip)
    # ------------------------------
    last = encoded_df.iloc[-1]

    # ------------------------------
    # 2. Build a new row using last-known feature values
    # ------------------------------
    x = {}

    for col in input_feature_cols:
        if col in encoded_df.columns:
            x[col] = last[col]
        else:
            # safety: unknown column
            x[col] = 0.0

    # Convert to model input shape
    X_input = np.array([x[col] for col in input_feature_cols], dtype=np.float32)
    X_input = X_input.reshape(1, -1)

    # ------------------------------
    # 3. Predict probabilities
    # ------------------------------
    y_pred = model.predict(X_input)[0]   # shape: (num_items,)

    # ------------------------------
    # 4. Build labeled output table
    # ------------------------------
    result = pd.DataFrame({
        "itemCanonical": frequent_items,
        "probability": y_pred
    })

    # Sort highest-probability first
    result = result.sort_values(by="probability", ascending=False)

    return result
##################################################################################

def normalize_single_value(df_raw, col_name, raw_value):
    """
    Use the same mean/std logic as normalizeAndDropCols for a single scalar.
    """
    col = df_raw[col_name].replace(999, np.nan)
    mean = col.mean()
    std = col.std() or 1.0

    if pd.isna(raw_value):
        return 0.0

    return (raw_value - mean) / std
##################################################################################

def save_predictions(pred_df, experiment_name):
    out_dir = f"experiments/{experiment_name}"
    os.makedirs(out_dir, exist_ok=True)
    pred_df.to_csv(f"{out_dir}/predictions.csv", index=False)


In [None]:
hidden_sizes = [ 64,128,256, 512]

# --------------------------------
# Fixed hyperparameters
# --------------------------------
EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.0001

# Training data
feature_cols = [
    c for c in encoded_df.columns
    if c != "didBuy" and np.issubdtype(encoded_df[c].dtype, np.number)
]

X = encoded_df[feature_cols].to_numpy(dtype=np.float32)
y = encoded_df["didBuy"].to_numpy(dtype=np.float32)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Class weights for imbalance
pos = y_train.sum()
neg = len(y_train) - pos
cw = min(50.0, neg / (pos + 1e-6))
class_weights = {0: cw}

# ===================================================================
# LOOP OVER HIDDEN SIZES
# ===================================================================
for h in hidden_sizes:

    print("\n==============================")
    print(f" RUNNING EXPERIMENT: hidden={h}")
    print("==============================")

    # ---------------------------------------------------------------
    # HP for this experiment
    # ---------------------------------------------------------------
    HP = {
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "hidden": h
    }

    # ---------------------------------------------------------------
    # Build model
    # ---------------------------------------------------------------
    model = models.Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(h, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    # ---------------------------------------------------------------
    # Train
    # ---------------------------------------------------------------
    history = model.fit(
        X_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        class_weight=class_weights,
        verbose=0,
        callbacks=[EpochStatus()]
    )

    # ---------------------------------------------------------------
    # Save (model, history, HP)
    # ---------------------------------------------------------------
    exp_name = f"exp_hidden{h}"
    save_experiment(model=model, history=history, HP=HP)

    # ---------------------------------------------------------------
    # Run predictions on ALL items for TODAY
    # ---------------------------------------------------------------
    pred_df = predict_all_items_today(model, df_winndixie, encoded_df)

    # ---------------------------------------------------------------
    # Save predictions
    # ---------------------------------------------------------------
    save_predictions(pred_df, exp_name)

    print(f"Completed: {exp_name}")
    print(pred_df.head())
