In [None]:

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import gc
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import os
from pandas.tseries.holiday import USFederalHolidayCalendar

from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from datetime import datetime

from winn_dixie_recpt_parser import WinnDixieRecptParser 

import asyncio
# The selector event-loop policy class only exists in Windows builds of
# asyncio; guard the call so the notebook also runs on Linux/macOS.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Show complete frames in cell output (no row/column/width truncation) and
# print floats with six decimals.
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:.6f}")

# Relative data paths below are resolved against this directory.
print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

In [None]:
def remove_duplicate_receipt_files(df):
    """
    Drop whole source files whose receipt duplicates another file's receipt.

    Rows are grouped by (date, time). Inside each group every source file is
    reduced to the sorted tuple of its row signatures (date|time|item|).
    Sources sharing the same tuple are duplicates: the alphabetically first
    source is kept, the others are reported on stdout and dropped.

    Parameters
    ----------
    df : DataFrame with columns "source", "date", "time", "item".

    Returns
    -------
    DataFrame
        A new frame containing only rows of kept sources, with a fresh
        0..n-1 index. The input frame is left untouched (no helper column is
        added to it).
    """
    # Work on a copy that carries the helper column, so the caller's frame
    # never sees "__signature".
    work = df.assign(**{
        "__signature": (
            df["date"].astype(str) + "|" +
            df["time"].astype(str) + "|" +
            df["item"].astype(str) + "|"
        )
    })

    keep_sources = set()

    for (dt_date, dt_time), group in work.groupby(["date", "time"]):

        # signature tuple -> every source file that produced it
        signature_groups = {}
        for source, rows in group.groupby("source"):
            sig = tuple(sorted(rows["__signature"].tolist()))
            signature_groups.setdefault(sig, []).append(source)

        for sources in signature_groups.values():
            sorted_sources = sorted(sources)
            kept = sorted_sources[0]
            removed = sorted_sources[1:]

            # Minimal output, only when something is actually dropped
            if removed:
                print(f"DUP: {dt_date} {dt_time} → keep {kept} ← drop {', '.join(removed)}")

            keep_sources.add(kept)

    # Filter, remove the helper column, and reset the index
    result = work[work["source"].isin(keep_sources)].drop(columns=["__signature"])
    return result.reset_index(drop=True)
#################################################################

# def rolling_freq(df, window_days):
#     out = []
#     for idx, row in df.iterrows():
#         item = row["item"]
#         cutoff = row["date"] - pd.Timedelta(days=window_days)
#         count = df[(df["item"] == item) &
#                    (df["date"] > cutoff) &
#                    (df["date"] < row["date"])].shape[0]
#         out.append(count)
#     return out

In [None]:
def daysUntilNextHoliday(d):
    """
    Return the number of days from `d` to the nearest US federal holiday
    on or after it (0 when `d` itself is a holiday).

    Returns NaN when the calendar has no holiday on or after `d`.
    """
    when = pd.to_datetime(d)
    holiday_dates = USFederalHolidayCalendar().holidays()
    gaps = (holiday_dates - when).days
    upcoming = gaps[gaps >= 0]
    if len(upcoming) == 0:
        return np.nan
    return upcoming.min()
####################################################################

def daysSinceLastHoliday(d):
    """
    Return the number of days from the most recent US federal holiday
    on or before `d` up to `d` (0 when `d` itself is a holiday).

    Returns NaN when the calendar has no holiday on or before `d`.
    """
    when = pd.to_datetime(d)
    holiday_dates = USFederalHolidayCalendar().holidays()
    elapsed = (when - holiday_dates).days
    already_past = elapsed[elapsed >= 0]
    if len(already_past) == 0:
        return np.nan
    return already_past.min()
####################################################################

def holidayProximityIndex(d, scale=30):
    """
    Return a signed closeness-to-holiday value in [-1, +1].

    The nearest holiday side wins (ties go to "before"):
      positive = a holiday is coming up, value (scale - daysUntil) / scale
      negative = a holiday just passed,  value -(scale - daysSince) / scale
    The magnitude is clipped at 0 once the holiday is `scale` or more days
    away, and 0 is returned when neither side has a holiday.

    Parameters
    ----------
    d : anything accepted by daysUntilNextHoliday / daysSinceLastHoliday.
    scale : number of days over which the signal fades to 0.
    """
    before = daysUntilNextHoliday(d)
    after = daysSinceLastHoliday(d)

    no_upcoming = pd.isna(before)
    no_previous = pd.isna(after)

    if no_upcoming and no_previous:
        return 0

    # Comparing against NaN is always False, so a missing side must be
    # handled explicitly: if there is no previous holiday, the upcoming one
    # is by definition the nearest.
    if no_previous or (not no_upcoming and before <= after):
        return +max(0, (scale - before) / scale)
    return -max(0, (scale - after) / scale)
####################################################################

def daysUntilBirthday(d, bday):
    """
    Return the number of days from `d` to the next anniversary of `bday`
    (0 when `d` falls exactly on the anniversary).

    Only the month and day of `bday` are used. A Feb-29 birthday is
    celebrated on Feb 28 in non-leap years instead of raising ValueError.
    """
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    def _anniversary(year):
        # pd.Timestamp rejects Feb 29 in a non-leap year; fall back to Feb 28.
        try:
            return pd.Timestamp(year, bday.month, bday.day)
        except ValueError:
            return pd.Timestamp(year, 2, 28)

    upcoming = _anniversary(d.year)
    if d > upcoming:
        # This year's anniversary already passed; count to next year's.
        upcoming = _anniversary(d.year + 1)
    return (upcoming - d).days
####################################################################

def daysSinceBirthday(d, bday):
    """
    Return the number of days from the most recent anniversary of `bday`
    up to `d` (0 when `d` falls exactly on the anniversary).

    Only the month and day of `bday` are used. A Feb-29 birthday is
    celebrated on Feb 28 in non-leap years instead of raising ValueError.
    """
    d = pd.to_datetime(d)
    bday = pd.to_datetime(bday)

    def _anniversary(year):
        # pd.Timestamp rejects Feb 29 in a non-leap year; fall back to Feb 28.
        try:
            return pd.Timestamp(year, bday.month, bday.day)
        except ValueError:
            return pd.Timestamp(year, 2, 28)

    latest = _anniversary(d.year)
    if d < latest:
        # This year's anniversary is still ahead; measure from last year's.
        latest = _anniversary(d.year - 1)
    return (d - latest).days
####################################################################

def tempDeviation(actualTemp, avgTemp):
    """Return the signed gap between the observed and the average temperature
    (positive = warmer than average)."""
    deviation = actualTemp - avgTemp
    return deviation
####################################################################

def humidityDeviation(actualHumidity, avgHumidity):
    """Return the signed gap between the observed and the average humidity
    (positive = more humid than average)."""
    deviation = actualHumidity - avgHumidity
    return deviation
####################################################################

def precipDeviation(actual, avg):
    """Return the signed gap between the observed and the average
    precipitation (positive = wetter than average)."""
    deviation = actual - avg
    return deviation
####################################################################

def daysUntilSchoolStart(d):
    """
    Return the number of days from `d` to the next August 15
    (0 when `d` is August 15 itself).
    """
    when = pd.to_datetime(d)
    # August 15 is the hard-coded first day of school.
    target_year = when.year if when <= pd.Timestamp(when.year, 8, 15) else when.year + 1
    return (pd.Timestamp(target_year, 8, 15) - when).days
####################################################################

def daysUntilSchoolEnd(d):
    """
    Return the number of days from `d` to the next May 31
    (0 when `d` is May 31 itself).
    """
    when = pd.to_datetime(d)
    # May 31 is the hard-coded last day of school.
    target_year = when.year if when <= pd.Timestamp(when.year, 5, 31) else when.year + 1
    return (pd.Timestamp(target_year, 5, 31) - when).days
####################################################################

def schoolSeasonIndex(d):
    """
    Position of `d` relative to the school year (Aug 15 -> May 31 of the
    following calendar year).

    Returns
    -------
    float
        * inside the school year: linear progress from 0.0 (Aug 15) to
          1.0 (May 31);
        * during summer break (Jun 1 - Aug 14): a negative value,
          -(days until the next Aug 15) / 365.

    The school year spans two calendar years, so the start and end dates
    are anchored on the autumn in which the year began. Comparing against
    Aug 15 and May 31 of the same calendar year would never match any date.
    """
    day = pd.to_datetime(d).normalize()

    # Calendar year in which the current-or-most-recent school year started.
    if (day.month, day.day) >= (8, 15):
        season_year = day.year
    else:
        season_year = day.year - 1

    start = pd.Timestamp(season_year, 8, 15)
    end = pd.Timestamp(season_year + 1, 5, 31)

    if day <= end:
        # day >= start is guaranteed by the choice of season_year.
        return (day - start).days / (end - start).days

    # Summer break: count down to the next school start.
    next_start = pd.Timestamp(season_year + 1, 8, 15)
    return -((next_start - day).days) / 365.0

####################################################################


def normalizeAndDropCols(df, cols):
    """
    Z-normalize each column in `cols` into "<col>_norm" and drop the raw
    column.

    The value 999 is treated as a "missing" sentinel: it is excluded from
    the mean/std and, like any other missing value, becomes 0.0 (neutral)
    after normalization.

    Parameters
    ----------
    df : DataFrame containing every column named in `cols`.
    cols : list of column names to normalize.

    Returns
    -------
    DataFrame
        A new frame: the remaining columns first, followed by one "_norm"
        column per entry of `cols`. The input frame is not modified.
    """
    # drop() already returns a new frame, so writing the normalized columns
    # there leaves the caller's frame untouched.
    out = df.drop(columns=cols)

    for col in cols:
        # Replace the sentinel 999 with NaN so it doesn't distort mean/std
        values = df[col].replace(999, np.nan)

        # Mean/std ignore NaN. A NaN std (fewer than two values) is truthy,
        # so `std or 1.0` would not catch it; check explicitly.
        mean = values.mean()
        std = values.std()
        if pd.isna(std) or std == 0:
            std = 1.0

        # After normalization: missing values become 0 (neutral)
        out[col + "_norm"] = ((values - mean) / std).fillna(0.0)

    return out


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)



def canonicalize_items(df, patterns, canonical_name):
    """
    Rewrite df["item"] in place to `canonical_name` on every row whose item
    matches (case-insensitively) at least one entry of `patterns`.

    Rows with a missing item never match. Nothing is returned.
    """
    # Accumulate one combined mask, then assign once.
    matches_any = pd.Series(False, index=df.index)
    for pattern in patterns:
        matches_any |= df["item"].str.contains(pattern, case=False, na=False)
    df.loc[matches_any, "item"] = canonical_name


In [None]:
# --- WEATHER PREP ---
# Load the weather export, keep only the needed columns, and index it
# chronologically so the rolling windows below run over consecutive rows.
weatherCols=["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv("datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv", usecols=weatherCols)
df_weather = (
    df_weather
    .assign(datetime=pd.to_datetime(df_weather["datetime"]))
    .set_index("datetime")
    .sort_index()
)

# Trailing 5-row mean per measurement (partial windows allowed at the start).
_rolling_outputs = {
    "temp": "temp_5day_avg",
    "feelslike": "feelsLike_5day_avg",
    "dew": "dew_5day_avg",
    "humidity": "humidity_5day_avg",
    "precip": "precip_5day_avg",
}
for _raw_col, _avg_col in _rolling_outputs.items():
    df_weather[_avg_col] = df_weather[_raw_col].rolling(5, min_periods=1).mean()

# Only the smoothed series are kept.
df_weather = df_weather.drop(columns=["temp", "humidity", "feelslike", "dew", "precip"])

# Re-index by calendar date (time of day stripped) so the frame can be
# merged on "date".
df_weather["date"] = pd.to_datetime(df_weather.index.date)
df_weather = df_weather.set_index("date")

#grouped.to_csv("grouped.csv", index=False)
#grouped.info()


In [None]:
import os
import pandas as pd

def ImportWallMart(folder_path: str) -> pd.DataFrame:
    """
    Import all Walmart receipt CSV files from a folder.

    Every file whose name ends in ".csv" (case-insensitive) is read and gets
    a 'source' column set to its filename. Files are processed in sorted
    filename order so the combined row order is reproducible; os.listdir
    alone returns entries in arbitrary order.

    Parameters
    ----------
    folder_path : directory to scan (not recursive).

    Returns
    -------
    DataFrame
        All files concatenated with a fresh index, or an empty DataFrame
        when the folder contains no CSV files.
    """
    csv_names = sorted(
        name
        for name in os.listdir(folder_path)
        # Skip directories that merely happen to be named "*.csv".
        if name.lower().endswith(".csv")
        and os.path.isfile(os.path.join(folder_path, name))
    )

    dataframes = []
    for file_name in csv_names:
        dataframe = pd.read_csv(os.path.join(folder_path, file_name))
        dataframe["source"] = file_name
        dataframes.append(dataframe)

    if not dataframes:
        return pd.DataFrame()

    return pd.concat(dataframes, ignore_index=True)


In [None]:
# Parse every Winn-Dixie receipt text file into one row per purchased item.
recptParser = WinnDixieRecptParser()

rows = []
for receipt_path in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    parsed = recptParser.parse(receipt_path.read_text(encoding="utf-8", errors="ignore"))
    # Only source/date/time/item are kept. Other fields that earlier
    # revisions pulled from the parser output (manager, cashier, qty, reg,
    # youPay, reported, validation flags) are intentionally left out.
    rows.extend(
        {
            "source": receipt_path.name,
            "date": parsed["date"],
            "time": parsed["time"],
            "item": entry["item"],
        }
        for entry in parsed["items"]
    )

winndixie_df = pd.DataFrame(rows)
winndixie_df = winndixie_df.assign(
    date=pd.to_datetime(winndixie_df["date"]),
    time=winndixie_df["time"].astype(str),
)

# De-duplicate receipts, order chronologically, then drop "time", which is
# only needed for the duplicate check and the sort.
winndixie_df = (
    remove_duplicate_receipt_files(winndixie_df)
    .sort_values(by=["date", "time"])
    .reset_index(drop=True)
    .drop(columns=["time"])
)

In [None]:
wallmart_raw = ImportWallMart("./walmart")

# Strip store-brand prefixes so descriptions are comparable across brands.
_descriptions = wallmart_raw["Product Description"]
for _brand in ("Great Value", "Freshness Guaranteed"):
    _descriptions = _descriptions.str.replace(_brand, "", regex=False)
wallmart_raw["Product Description"] = _descriptions.str.strip()

## remove some non-food items
# A row is dropped when its description contains any of these keywords
# (case-insensitive, literal match; missing descriptions are kept).
_non_food = pd.Series(False, index=wallmart_raw.index)
for _keyword in ("Mainstays", "Sizes", "Pen+Gear", "Athletic"):
    _non_food |= wallmart_raw["Product Description"].str.contains(
        _keyword, case=False, na=False, regex=False
    )
wallmart_raw = wallmart_raw[~_non_food]

## rename cols
wallmart_df = (
    wallmart_raw[["Order Date","Product Description", "source"]]
    .copy()
    .rename(columns={"Order Date": "date", "Product Description": "item"})
)

wallmart_df["date"] = pd.to_datetime(wallmart_df["date"])
wallmart_df.to_csv("wallmart_df.csv", index=False)

winndixie_df["date"] = pd.to_datetime(winndixie_df["date"])

# Stack both stores into a single purchase log with columns
# date / item / source.
combined_df = pd.concat(
    [winndixie_df, wallmart_df[["date", "item", "source"]]],
    ignore_index=True
)

combined_df.info()
combined_df.head()
combined_df.to_csv("combined_df.csv", index=False)

In [None]:
# Canonical item name -> substrings that identify that item in the raw
# descriptions. Entries are applied in the order listed.
_CANONICAL_ITEM_PATTERNS = {
    "milk": ["know-and-love-milk", "kandl-milk", "prairie-farm-milk","kleinpeter-milk", "kl-milk", "Milk, Fat Free,", "Fat-Free Milk"],
    "bread": ["bunny-bread","se-grocers-bread","seg-sandwich-bread", "seg-white-bread"],
    "cheese": ["dandw-cheese", "kraft-cheese", "se-grocers-cheese", "know-and-love-cheese"],
    "mayo": ["blue-plate-mayo", "blue-plate-mynnase"],
    "chicken": ["chicken-cutlet", "chicken-leg", "chicken-thigh", "chicken-thighs"],
    "yogurt": ["chobani-yogrt-flip", "chobani-yogurt"],
    "coke": ["coca-cola", "coca-cola-cola", "cocacola-soda"],
    "hugbi-pies": ["hugbi-pies", "-hugbi-pies"],
    "minute-maid-drink": ["minute-maid-drink", "minute-maid-drinks", "minute-maid-lmnade"],
}

for _canonical, _patterns in _CANONICAL_ITEM_PATTERNS.items():
    canonicalize_items(combined_df, _patterns, _canonical)



In [None]:
### CREATE ITEM IDs
# Ids are the positions of the item names in alphabetical order, so the
# mapping is stable for a given set of items.
unique_items = sorted(combined_df["item"].unique())
id_to_item = dict(enumerate(unique_items))
item_to_id = {name: idx for idx, name in id_to_item.items()}

combined_df["itemId"] = combined_df["item"].map(item_to_id)
combined_df.reset_index(drop=True, inplace=True)
combined_df.info()
combined_df.head(100)

In [None]:
# ============================================================
# Build full receipt × item table WITHOUT using qty
# ============================================================

# 1. Mark actual purchases in the raw receipt rows
combined_df["didBuy"] = 1

# 2. Build complete grid: one row for every (trip date, item) pair
all_items = combined_df["itemId"].unique()
all_dates = combined_df["date"].unique()

full = (
    pd.MultiIndex.from_product(
        [all_dates, all_items],
        names=["date", "itemId"]
    ).to_frame(index=False)
)

# 3. Merge raw purchases onto the full grid.
#    An item bought more than once on the same date (several receipt lines
#    or several receipts) must still occupy exactly ONE grid cell, otherwise
#    the left merge duplicates that cell; keep the first occurrence.
purchases = (
    combined_df[["date", "itemId", "source", "didBuy"]]
    .drop_duplicates(subset=["date", "itemId"])
)
df_full = full.merge(purchases, on=["date", "itemId"], how="left")

# 4. Fill missing purchases with didBuy=0
df_full["didBuy"] = df_full["didBuy"].fillna(0).astype(int)

#    Give EVERY grid row its item name. Taking "item" from the merge would
#    leave it NaN on all non-purchase rows, which breaks later
#    itemId -> name lookups.
item_name_by_id = (
    combined_df.drop_duplicates(subset=["itemId"])
    .set_index("itemId")["item"]
)
df_full["item"] = df_full["itemId"].map(item_name_by_id)
df_full = df_full[["date", "itemId", "item", "source", "didBuy"]]

# 5. NOW REPLACE combined_df with df_full
combined_df = df_full.copy()

combined_df.to_csv("df_fullreceipts.csv", index=False)


In [None]:
# 1. Build grouped table (one row per trip date)
grouped = (
    combined_df[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)

# 2. daysSinceLastTrip (0 for the very first trip)
grouped["daysSinceLastTrip"] = grouped["date"].diff().dt.days.fillna(0)

# 3. Holiday / School features: column name -> function applied per date
_calendar_features = {
    "daysUntilNextHoliday": daysUntilNextHoliday,
    "daysSinceLastHoliday": daysSinceLastHoliday,
    "holidayProximityIndex": holidayProximityIndex,
    "daysUntilSchoolStart": daysUntilSchoolStart,
    "daysUntilSchoolEnd": daysUntilSchoolEnd,
    "schoolSeasonIndex": schoolSeasonIndex,
}
for _feature_name, _feature_fn in _calendar_features.items():
    grouped[_feature_name] = grouped["date"].apply(_feature_fn)

# 4. Plain calendar parts: column name -> .dt accessor attribute
dt = grouped["date"]
for _column, _accessor in (
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
    ("dow", "dayofweek"),
    ("doy", "dayofyear"),
    ("quarter", "quarter"),
):
    grouped[_column] = getattr(dt.dt, _accessor)

BIRTHDAYS = {
    "steve":  "03-05-1980",  # fill with your real dates
    "maggie": "03-03-2016",
    "mil":    "01-27-1962",
    "angie":  "08-11-1981",
}

# 5. Birthday features: an "until" and a "since" column per person
for _person in BIRTHDAYS:
    grouped[f"daysUntilBirthday_{_person}"] = grouped["date"].apply(
        lambda d, who=_person: daysUntilBirthday(d, BIRTHDAYS[who])
    )
    grouped[f"daysSinceBirthday_{_person}"] = grouped["date"].apply(
        lambda d, who=_person: daysSinceBirthday(d, BIRTHDAYS[who])
    )

# merge in weather
grouped = grouped.merge(df_weather, on="date", how="left")

# Broadcast the per-trip features onto every (date, item) row.
combined_df = combined_df.merge(grouped, on="date", how="left")
combined_df.info()
combined_df.head(10)

combined_df.to_csv("df_merged_group_level_features.csv", index=False)

In [None]:
# ================================================
# FREQUENCY WINDOWS (7, 15, 30, 90, 365)
# True rolling-window implementation
# ================================================
def fill_freq(group, windows=None):
    """
    Compute rolling purchase counts for one item's rows.

    For every row and every window length `w`, "freq_<w>" is the number of
    purchase rows (didBuy == 1) at or before this row whose date is no
    earlier than `row date - w days`. The current row's own purchase is
    included, and so is a purchase exactly `w` days back.

    Parameters
    ----------
    group : DataFrame with "date" and "didBuy" columns (one item's rows).
    windows : iterable of window lengths in days. Defaults to the
        module-level `freq_windows`.

    Returns
    -------
    DataFrame
        A date-sorted copy of `group` with a fresh 0..n-1 index and one
        float "freq_<w>" column per window. The input is not modified.
    """
    if windows is None:
        windows = freq_windows

    group = group.sort_values("date").reset_index(drop=True)

    dates = group["date"].to_numpy()
    bought = group["didBuy"].to_numpy() == 1

    # Rows are date-sorted, so the purchase dates are sorted as well and
    # can be binary-searched instead of re-scanned for every row.
    purchase_dates = dates[bought]
    purchases_so_far = np.cumsum(bought)

    for w in windows:
        cutoffs = dates - np.timedelta64(w, "D")
        # Purchases strictly older than the cutoff. They all lie before the
        # current row, so subtracting them from the running total leaves
        # exactly the purchases inside the window.
        too_old = np.searchsorted(purchase_dates, cutoffs, side="left")
        # float64 keeps the dtype of the NaN-initialised columns.
        group[f"freq_{w}"] = (purchases_so_far - too_old).astype("float64")

    return group
#######################################################
freq_windows = [7, 15, 30, 90, 365]
max_w = max(freq_windows)

# initialize columns
for w in freq_windows:
    combined_df[f"freq_{w}"] = np.nan

# Process each item separately and stitch the pieces back together.
# An explicit loop + concat is used instead of groupby(...).apply(...):
# apply() operating on the grouping column is deprecated, and newer pandas
# versions drop "itemId" from what the callback sees and returns.
_per_item = [fill_freq(item_rows) for _, item_rows in combined_df.groupby("itemId")]
if _per_item:
    combined_df = pd.concat(_per_item)


In [None]:
# ============================================================
# INCREASING DAILY daysSinceLastPurchase (resets on purchase)
# ============================================================
def fill_item(group):
    """
    Fill the gaps in "daysSinceLastPurchase" for one item's rows.

    Walking the rows top to bottom, every missing value (from the second
    row on) becomes the previous row's value plus this row's
    "daysSinceLastTrip". Existing values are left as they are, and rows
    before the first known value stay missing because NaN propagates.
    Returns a modified copy; the input is untouched.
    """
    filled = group.copy()
    if len(filled) < 2:
        return filled

    # Column positions are fixed for the whole walk.
    purchase_pos = filled.columns.get_loc("daysSinceLastPurchase")
    trip_pos = filled.columns.get_loc("daysSinceLastTrip")

    for row in range(1, len(filled)):
        if not pd.isna(filled.iat[row, purchase_pos]):
            continue
        carried = filled.iat[row - 1, purchase_pos]
        filled.iat[row, purchase_pos] = carried + filled.iat[row, trip_pos]

    return filled
##########################################################################################

combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

# Start with NaN everywhere
combined_df["daysSinceLastPurchase"] = np.nan

# Set 0 on purchase days
combined_df.loc[combined_df["didBuy"] == 1, "daysSinceLastPurchase"] = 0

# Fill the gaps item by item. An explicit loop + concat is used instead of
# groupby(...).apply(...): apply() operating on the grouping column is
# deprecated, and newer pandas versions drop "itemId" from the result.
# The frame is already sorted by itemId, so concatenating the groups in key
# order preserves row order and index.
_per_item = [fill_item(item_rows) for _, item_rows in combined_df.groupby("itemId")]
if _per_item:
    combined_df = pd.concat(_per_item)

# Items with no purchase history get 999
combined_df["daysSinceLastPurchase"] = combined_df["daysSinceLastPurchase"].fillna(999)
combined_df.to_csv("daysSinceLastPurchase.csv", index=False)

In [None]:
# ============================================================
# ITEM-LEVEL HABIT FEATURES (TF-IDF ANALOG)
# ============================================================
import numpy as np
import pandas as pd

def build_habit_features(df, tau_days=120):
    """
    Summarise each item's long-run buying habit in three numbers.

    Parameters
    ----------
    df : DataFrame with "date", "itemId" and "didBuy" columns.
    tau_days : decay constant (in days) for habitDecay.

    Returns
    -------
    DataFrame
        One row per itemId with columns
          habitFrequency : purchase rows / number of distinct dates in df
          habitSpan      : (last buy - first buy) / length of the timeline
          habitDecay     : exp(-days since last buy / tau_days), measured
                           from the latest date in df
        Items never bought get 0.0 for all three. The columns are always
        present, even for empty input, so the result can be merged on
        "itemId" without special-casing.
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    total_trips = df["date"].nunique()
    # Loop-invariant: the end of the observed timeline.
    timeline_end = df["date"].max()
    timeline_days = (timeline_end - df["date"].min()).days or 1

    rows = []

    for itemId, g in df.groupby("itemId"):
        buys = g.loc[g["didBuy"] == 1, "date"]

        if buys.empty:
            rows.append({
                "itemId": itemId,
                "habitFrequency": 0.0,
                "habitSpan": 0.0,
                "habitDecay": 0.0,
            })
            continue

        first = buys.min()
        last = buys.max()
        days_since_last = (timeline_end - last).days

        rows.append({
            "itemId": itemId,
            "habitFrequency": len(buys) / total_trips,
            "habitSpan": (last - first).days / timeline_days,
            "habitDecay": np.exp(-days_since_last / tau_days),
        })

    return pd.DataFrame(
        rows,
        columns=["itemId", "habitFrequency", "habitSpan", "habitDecay"],
    )
###############################################################################


def compute_due_score(
    df,
    itemId=None,
    use_sigmoid=True,
    normalize=False,
    weights=None
):
    """
    Compute due_score from RAW (non-normalized) features.

    Required columns:
      - itemId
      - daysSinceLastPurchase
      - freq_30
      - freq_90

    Parameters
    ----------
    df : DataFrame

    itemId : int | None
        If provided, compute only for this itemId.
        If None, compute for all items.

    use_sigmoid : bool
        Apply sigmoid → (0,1)

    normalize : bool
        Z-normalize instead (ignored if use_sigmoid=True)

    weights : dict | None
        Optional override for feature weights. May be partial: any feature
        not listed keeps its default weight.

    Returns
    -------
    DataFrame
        A copy of the (optionally filtered) input with two added columns,
        "due_score_raw" (weighted sum) and "due_score" (final value).
        The input frame is not modified.
    """
    default_weights = {
        "daysSinceLastPurchase": 1.5,
        "freq_30": 1.0,
        "freq_90": 0.5
    }
    # Overlay the caller's overrides on the defaults so that passing only
    # some of the keys no longer raises KeyError.
    if weights is None:
        weights = default_weights
    else:
        weights = {**default_weights, **weights}

    # --------------------------------------------------------
    # Optional itemId filter
    # --------------------------------------------------------
    if itemId is not None:
        df = df[df["itemId"] == itemId].copy()
    else:
        df = df.copy()

    # --------------------------------------------------------
    # RAW linear score (pre-normalization)
    # --------------------------------------------------------
    df["due_score_raw"] = (
        weights["daysSinceLastPurchase"] * df["daysSinceLastPurchase"]
      + weights["freq_30"]              * df["freq_30"]
      + weights["freq_90"]              * df["freq_90"]
    )

    # --------------------------------------------------------
    # Final due_score
    # --------------------------------------------------------
    if use_sigmoid:
        df["due_score"] = 1 / (1 + np.exp(-df["due_score_raw"]))

    elif normalize:
        mean = df["due_score_raw"].mean()
        std = df["due_score_raw"].std()
        # A NaN std (fewer than two rows) is truthy, so `std or 1.0` would
        # not catch it; check explicitly.
        if pd.isna(std) or std == 0:
            std = 1.0
        df["due_score"] = (df["due_score_raw"] - mean) / std

    else:
        df["due_score"] = df["due_score_raw"]

    return df
###############################################################################


# ============================================================
# MERGE HABIT FEATURES
# ============================================================
# The per-item habit summary is broadcast onto every row of that item.
habit_df = build_habit_features(combined_df)
combined_df = combined_df.merge(habit_df, on="itemId", how="left")

# Items without a habit row fall back to 0.0.
_habit_fill_cols = ["habitFrequency", "habitSpan", "habitDecay"]
combined_df[_habit_fill_cols] = combined_df[_habit_fill_cols].fillna(0.0)



In [None]:
## trim fat
# find rows with freq_365 of 1 or less

In [None]:
# ============================================================
# NORMALIZE TO ENCODED_DF
# ============================================================

# Column groups are discovered by naming convention on combined_df.
_all_columns = list(combined_df.columns)

freq_cols = [c for c in _all_columns if c.startswith("freq_")]
weather_cols = [c for c in _all_columns if c.endswith("_5day_avg")]
holiday_cols = [c for c in _all_columns if "holiday" in c.lower()]
school_cols = [c for c in _all_columns if "school" in c.lower()]
birthday_cols = [
    c for c in _all_columns
    if c.startswith(("daysUntilBirthday_", "daysSinceBirthday_"))
]

daysSince_purchase_cols = ["daysSinceLastPurchase"]
daysSince_trip_cols     = ["daysSinceLastTrip"]

habit_cols = ["habitFrequency", "habitSpan", "habitDecay"]

# Normalize group by group; the order fixes the order of the "_norm"
# columns in encoded_df.
encoded_df = combined_df.copy()
for _column_group in (
    freq_cols,
    weather_cols,
    holiday_cols,
    school_cols,
    birthday_cols,
    daysSince_purchase_cols,
    daysSince_trip_cols,
    habit_cols,
):
    encoded_df = normalizeAndDropCols(encoded_df, _column_group)

encoded_df.info()
encoded_df.head(100)


In [None]:

import numpy as np

# ---------- CYCLICAL FEATURES ----------
# Each periodic calendar field is mapped onto the unit circle with a
# sin/cos pair: (column, period).
for _name, _period in (("dow", 7.0), ("month", 12.0), ("doy", 365.0)):
    _angle = 2 * np.pi * encoded_df[_name] / _period
    encoded_df[f"{_name}_sin"] = np.sin(_angle)
    encoded_df[f"{_name}_cos"] = np.cos(_angle)

encoded_df = encoded_df.drop(columns=["dow", "month", "doy"], errors="ignore")

# ---------- NON-CYCLIC TIME FEATURES ----------
nonCycCols = ["year", "day", "quarter"]
encoded_df = normalizeAndDropCols(encoded_df, nonCycCols)

# ---------- DROP NON-MODEL COLS ----------
cols_to_drop = ["source", "item", "date"]
encoded_df = encoded_df.drop(columns=cols_to_drop, errors="ignore")

# ---------- FINAL CHECK ----------
for _option, _value in (("display.max_columns", None), ("display.width", 2000)):
    pd.set_option(_option, _value)

encoded_df.info()

In [None]:
# Training target: weighted sum of already-normalized features.
# (A 0.5 * freq_90_norm term and a sigmoid squash of the sum were tried
# earlier and are currently disabled.)
_due_weights = {
    "daysSinceLastPurchase_norm": 1.5,
    "freq_30_norm": 1.0,
}
encoded_df["due_score"] = sum(
    _weight * encoded_df[_column] for _column, _weight in _due_weights.items()
)

encoded_df.info()
encoded_df.head()

encoded_df.to_csv("encoded.csv", index=False)

# TRAIN / BUILD MODEL

In [None]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
def save_experiment( model, history, predictions, params, numeric_cols, item_id_to_idx, base_dir="experiments"):
    """
    Saves experiment artifacts with a clean, readable folder name.
    Folder name encodes only primary structural decisions:
      - embedding size
      - hidden layer layout
      - epoch count
    Parts whose key is missing from `params` are skipped; with none of them
    present the folder is called "exp_unlabeled".

    Parameters
    ----------
    model : object exposing save(path) and save_weights(path).
    history : object whose `.history` attribute is JSON-serializable.
    predictions : object exposing to_csv(path, index=False).
    params : dict of hyperparameters (must be JSON-serializable).
    numeric_cols : list of numeric feature names.
    item_id_to_idx : mapping itemId -> embedding index.
    base_dir : parent directory for all experiment folders.

    Files written into <base_dir>/<experiment name>/:
      model, weights.h5, history.json, numeric_features.json,
      item_id_to_idx.json, hyperparams.json, predictions.csv
    An existing folder of the same name is reused and its files overwritten.
    """
    # Imported here so the function does not depend on a later notebook
    # cell having executed `import json` first.
    import json

    name_parts = []

    if "embedding_dim" in params:
        name_parts.append(f"emb{params['embedding_dim']}")
    if "hiddenLayers" in params:
        hl = "-".join(str(x) for x in params["hiddenLayers"])
        name_parts.append(f"hl{hl}")
    if "epochs" in params:
        name_parts.append(f"ep{params['epochs']}")

    exp_name = "__".join(name_parts) if name_parts else "exp_unlabeled"
    exp_dir = os.path.join(base_dir, exp_name)
    os.makedirs(exp_dir, exist_ok=True)

    # ------------------------------------------------------------
    # Save artifacts
    # ------------------------------------------------------------
    # NOTE(review): Keras 3 appears to require a ".keras"/".h5" suffix for
    # model.save() and a ".weights.h5" suffix for save_weights(); confirm
    # against the installed TensorFlow/Keras version before upgrading.
    model.save(os.path.join(exp_dir, "model"))
    model.save_weights(os.path.join(exp_dir, "weights.h5"))

    with open(os.path.join(exp_dir, "history.json"), "w") as f:
        json.dump(history.history, f, indent=2)

    with open(os.path.join(exp_dir, "numeric_features.json"), "w") as f:
        json.dump(numeric_cols, f, indent=2)

    # JSON object keys must be strings; int() also unwraps numpy integers.
    with open(os.path.join(exp_dir, "item_id_to_idx.json"), "w") as f:
        json.dump(
            {str(int(k)): int(v) for k, v in item_id_to_idx.items()},
            f,
            indent=2
        )

    with open(os.path.join(exp_dir, "hyperparams.json"), "w") as f:
        json.dump(params, f, indent=2)

    predictions.to_csv(os.path.join(exp_dir, "predictions.csv"), index=False)

    print("Saved experiment →", exp_dir)
##########################################################################################

def build_and_compile_model( num_numeric_features, num_items, params):
    """
    Build and compile the two-input model.

    Inputs: a numeric feature vector of length `num_numeric_features` and a
    scalar int32 item index, which is embedded. The two are concatenated,
    passed through one ReLU Dense layer per entry of params["hiddenLayers"],
    and reduced to a single output unit.

    `params` keys read:
      embedding_dim, hiddenLayers (required);
      output_activation ("sigmoid"), optimizer ("adam"), loss ("mse"),
      metrics (["mae"]) are optional, with the defaults shown.
    """
    numeric_input = layers.Input(shape=(num_numeric_features,))
    item_input = layers.Input(shape=(), dtype="int32")

    item_vector = layers.Embedding(
        input_dim=num_items,
        output_dim=params["embedding_dim"]
    )(item_input)
    item_vector = layers.Flatten()(item_vector)

    hidden = layers.Concatenate()([numeric_input, item_vector])
    for width in params["hiddenLayers"]:
        hidden = layers.Dense(width, activation="relu")(hidden)

    output_activation = params.get("output_activation", "sigmoid")
    prediction = layers.Dense(1, activation=output_activation)(hidden)

    model = models.Model([numeric_input, item_input], prediction)
    model.compile(
        optimizer=params.get("optimizer", "adam"),
        loss=params.get("loss", "mse"),
        metrics=params.get("metrics", ["mae"])
    )
    return model
##########################################################################################

In [None]:


# Hyperparameters consumed by build_and_compile_model / save_experiment.
params = {
    "embedding_dim": 64,
    "hiddenLayers": [4096],
    "loss": "mse",
    "optimizer": "adam",
    "metrics": ["mae"],
    "output_activation": "sigmoid"
}


# NOTE(review): num_numeric_features, num_items, history and predictions
# are not assigned anywhere above this cell; presumably the cell is meant to
# run after training has produced them — confirm before running in order.
model = build_and_compile_model(num_numeric_features, num_items, params)

# The original call ended in a stray ":" which is a SyntaxError.
save_experiment(model, history, predictions, params, numeric_cols, item_id_to_idx, base_dir="experiments")

In [None]:
# ============================================================
# DATE-AWARE PREDICTION (TIGHTER PARAMS + ITEM NAMES)
# ============================================================

import numpy as np
import pandas as pd

def run_predictions( model, encoded_df, combined_df, feature_stats, birthdays, predict_date=None):
    """
    Build one prediction row per item using:
    - latest encoded feature state (encoded_df)
    - raw timeline + names (combined_df)
    - recomputed calendar features at predict_date

    Parameters
    ----------
    model : object exposing predict([numeric_matrix, item_index_vector]).
    encoded_df : DataFrame with "itemId", "itemIdx" and the "*_norm"
        feature columns; the LAST row of each item is used as its state.
    combined_df : DataFrame with "itemId", "item", "date" and (optionally)
        "didBuy".
    feature_stats : dict raw feature name -> {"mean": ..., "std": ...} used
        to normalize the recomputed features. Features without an entry keep
        the value copied from encoded_df.
    birthdays : dict name -> birthday (as accepted by daysUntilBirthday).
    predict_date : date to predict for; defaults to today.

    Returns
    -------
    DataFrame with columns itemId, item, due_intensity, sorted by
    due_intensity descending.
    """

    if predict_date is None:
        predict_date = pd.Timestamp.today().normalize()
    else:
        predict_date = pd.to_datetime(predict_date).normalize()

    # --------------------------------------------------------
    # Discover numeric features (single source: encoded_df)
    # --------------------------------------------------------
    numeric_cols = [
        c for c in encoded_df.columns
        if c.endswith("_norm") and c != "due_score"
    ]
    numeric_set = set(numeric_cols)

    def _normalized(raw_name, value):
        # A zero std would divide by zero; treat it as 1.0 (no scaling).
        stats = feature_stats[raw_name]
        return (value - stats["mean"]) / (stats["std"] or 1.0)

    # --------------------------------------------------------
    # Lookups from combined_df (single source of truth)
    # --------------------------------------------------------
    # When combined_df is the full date × item grid, every item has a row
    # on every trip date, so "last date" must be taken from purchase rows
    # only; otherwise it is simply the last trip date for every item.
    purchase_rows = combined_df
    if "didBuy" in combined_df.columns:
        purchase_rows = combined_df[combined_df["didBuy"] == 1]
    last_date_by_item = purchase_rows.groupby("itemId")["date"].max()

    # Rows with a missing name are dropped first so that a NaN can never
    # overwrite the real name of an item in the lookup.
    item_lookup = (
        combined_df[["itemId", "item"]]
        .dropna(subset=["item"])
        .drop_duplicates(subset=["itemId"])
        .set_index("itemId")["item"]
        .to_dict()
    )

    # --------------------------------------------------------
    # DATE-SENSITIVE features that are identical for every item:
    # compute and normalize them ONCE, outside the item loop
    # --------------------------------------------------------
    calendar_raw = {
        "daysUntilNextHoliday": daysUntilNextHoliday(predict_date),
        "daysSinceLastHoliday": daysSinceLastHoliday(predict_date),
        "holidayProximityIndex": holidayProximityIndex(predict_date),
        "daysUntilSchoolStart": daysUntilSchoolStart(predict_date),
        "daysUntilSchoolEnd": daysUntilSchoolEnd(predict_date),
        "schoolSeasonIndex": schoolSeasonIndex(predict_date),
        "year": predict_date.year,
        "day": predict_date.day,
        "quarter": predict_date.quarter
    }
    for name, bday in birthdays.items():
        calendar_raw[f"daysUntilBirthday_{name}"] = daysUntilBirthday(predict_date, bday)
        calendar_raw[f"daysSinceBirthday_{name}"] = daysSinceBirthday(predict_date, bday)

    calendar_norm = {
        raw + "_norm": _normalized(raw, val)
        for raw, val in calendar_raw.items()
        if raw + "_norm" in numeric_set and raw in feature_stats
    }

    recency_raw = "daysSinceLastPurchase"
    recency_usable = (
        recency_raw + "_norm" in numeric_set and recency_raw in feature_stats
    )

    rows = []

    for itemId, hist in encoded_df.groupby("itemId"):
        last = hist.iloc[-1]

        row = {
            "itemId": itemId,
            "item": item_lookup.get(itemId, "UNKNOWN"),
            "itemIdx": int(last["itemIdx"])
        }

        # Copy model-stable numeric features (already normalized)
        for col in numeric_cols:
            row[col] = last[col]

        # Overwrite the shared calendar features
        row.update(calendar_norm)

        # Per-item recency. Items without any recorded purchase keep the
        # normalized value copied from their latest encoded row.
        if recency_usable and itemId in last_date_by_item.index:
            last_date = pd.to_datetime(last_date_by_item.loc[itemId]).normalize()
            row[recency_raw + "_norm"] = _normalized(
                recency_raw, (predict_date - last_date).days
            )

        rows.append(row)

    pred_df = pd.DataFrame(rows)

    Xn = pred_df[numeric_cols].to_numpy(np.float32)
    Xi = pred_df["itemIdx"].to_numpy(np.int32)

    scores = model.predict([Xn, Xi], verbose=0).ravel()

    pred_df["due_intensity"] = scores

    return (
        pred_df[["itemId", "item", "due_intensity"]]
        .sort_values("due_intensity", ascending=False)
        .reset_index(drop=True)
    )
###############################################################################

# ============================================================
# TRAIN + PREDICT + SAVE (AUDITABLE, FINAL) — UPDATED
# ============================================================

import os
import json
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from datetime import datetime

# Notebook cell: train the due-score model on `encoded_df`, evaluate it on a
# held-out split, score every item with `run_predictions`, and persist the
# run with `save_experiment`.
#
# Names expected from earlier cells: `encoded_df`, `combined_df`,
# `run_predictions`, `save_experiment`.
# Names left in the notebook namespace for later cells: `model`, `history`,
# `numeric_cols`, `item_id_to_idx`, `NUM_ITEMS`, `feature_stats`,
# `birthdays`, `predictions`.

# Drop any Keras state left over from a previous run of this cell.
tf.keras.backend.clear_session()


# ------------------------------------------------------------
# ENSURE itemIdx
# ------------------------------------------------------------
# Map each distinct itemId to a dense 0..NUM_ITEMS-1 index so it can be used
# as an Embedding lookup key. NOTE: this adds/overwrites the "itemIdx" column
# of `encoded_df` in place.
item_ids = sorted(encoded_df["itemId"].unique())
item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
NUM_ITEMS = len(item_ids)

# ------------------------------------------------------------
# FEATURES / TARGET
# ------------------------------------------------------------
# Every "*_norm" column is a numeric model input; "due_score" is the target.
numeric_cols = [
    c for c in encoded_df.columns
    if c.endswith("_norm") and c != "due_score"
]

Xn = encoded_df[numeric_cols].to_numpy(np.float32)
Xi = encoded_df["itemIdx"].to_numpy(np.int32)
y  = encoded_df["due_score"].to_numpy(np.float32)

# ------------------------------------------------------------
# SPLIT
# ------------------------------------------------------------
# 80/20 random split with a fixed seed so runs are repeatable.
Xn_tr, Xn_te, Xi_tr, Xi_te, y_tr, y_te = train_test_split(
    Xn, Xi, y, test_size=0.2, random_state=42
)

# ------------------------------------------------------------
# MODEL
# ------------------------------------------------------------
# Two inputs: the normalized numeric features and a scalar item index that is
# turned into a 64-d embedding. Both are concatenated and passed through one
# wide hidden layer; the sigmoid output keeps predictions in (0, 1).
num_in = layers.Input(shape=(Xn_tr.shape[1],))
itm_in = layers.Input(shape=(), dtype="int32")

emb = layers.Embedding(NUM_ITEMS, 64)(itm_in)
emb = layers.Flatten()(emb)

x = layers.Concatenate()([num_in, emb])
x = layers.Dense(4096, activation="relu")(x)
#x = layers.Dense(2048, activation="relu")(x)
out = layers.Dense(1, activation="sigmoid")(x)

model = models.Model([num_in, itm_in], out)
model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss="mse", metrics=["mae"])

history = model.fit(
    [Xn_tr, Xi_tr],
    y_tr,
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    verbose=1
)

# ------------------------------------------------------------
# HELD-OUT EVALUATION
# ------------------------------------------------------------
# The 20% test split was previously created but never used; report it so the
# run has a generalization number that is independent of the validation split.
test_loss, test_mae = model.evaluate([Xn_te, Xi_te], y_te, verbose=0)
print(f"Held-out test  mse={test_loss:.6f}  mae={test_mae:.6f}")

# ------------------------------------------------------------
# BIRTHDAYS
# ------------------------------------------------------------
# Strings are month-day-year; parse with an explicit format so the result
# never depends on pandas' format inference.
BIRTHDAYS = { "steve":  "03-05-1980", "maggie": "03-03-2016","mil": "01-27-1962", "angie":  "08-11-1981"}
birthdays = {k: pd.to_datetime(v, format="%m-%d-%Y") for k, v in BIRTHDAYS.items()}

# ------------------------------------------------------------
# FEATURE STATS (ONLY recomputed features)
# ------------------------------------------------------------
# `run_predictions` recomputes these features in RAW units for the prediction
# date and writes `(raw - mean) / std` into the matching "*_norm" column.
# The stats therefore have to describe the RAW column. Taking them from the
# already-normalized "*_norm" column (mean ~0, std ~1) would leave the
# recomputed values effectively unscaled at inference time.
# NOTE(review): this assumes the "*_norm" columns were produced by z-scoring
# the same-named raw columns of `encoded_df` - confirm against the cell that
# builds them.
feature_stats = {}
RECOMPUTED = [
    "daysSinceLastPurchase",
    "daysUntilNextHoliday",
    "daysSinceLastHoliday",
    "holidayProximityIndex",
    "daysUntilSchoolStart",
    "daysUntilSchoolEnd",
    "schoolSeasonIndex",
    "year", "day", "quarter",
]
# Birthday features are derived from BIRTHDAYS so the two cannot drift apart.
for _name in BIRTHDAYS:
    RECOMPUTED += [f"daysUntilBirthday_{_name}", f"daysSinceBirthday_{_name}"]

_stats_from_norm = []
for raw in RECOMPUTED:
    col = raw + "_norm"
    if col not in encoded_df.columns:
        continue  # feature is not a model input; nothing to renormalize

    if raw in encoded_df.columns and pd.api.types.is_numeric_dtype(encoded_df[raw]):
        stats_col = raw
    else:
        # Raw column unavailable: keep the previous behaviour as a fallback.
        stats_col = col
        _stats_from_norm.append(raw)

    std = float(encoded_df[stats_col].std())
    feature_stats[raw] = {
        "mean": float(encoded_df[stats_col].mean()),
        # Guard against zero AND NaN std (constant or single-row column).
        "std": std if np.isfinite(std) and std > 0 else 1.0
    }

if _stats_from_norm:
    print(
        "WARNING: raw columns missing from encoded_df; stats taken from the "
        "*_norm columns instead, so recomputed values will not be rescaled "
        "correctly for: " + ", ".join(_stats_from_norm)
    )

# ------------------------------------------------------------
# PREDICT (UPDATED CALL)
# ------------------------------------------------------------
# predict_date=None presumably lets run_predictions pick its default date -
# verify against its definition.
predictions = run_predictions(model=model, encoded_df=encoded_df, combined_df=combined_df, feature_stats=feature_stats, birthdays=birthdays, predict_date=None)

# ------------------------------------------------------------
# SAVE
# ------------------------------------------------------------
# NOTE(review): params is saved empty, so the hyperparameters used above
# (embedding size, layer width, learning rate, epochs, batch size) are not
# recorded with the experiment.
save_experiment( model=model, history=history, predictions=predictions, params={}, numeric_cols=numeric_cols, item_id_to_idx=item_id_to_idx)

predictions.head(50)


In [None]:
# import os
# import json
# import multiprocessing as mp
# from datetime import datetime
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras import layers, models, callbacks


# # ============================================================
# # EXPERIMENT CONFIG
# # ============================================================

# # RECOMMENDATION:
# # Keep embedding sizes small. If 16 shows no benefit over 8, stop.
# embedding_sizes = [4, 8, 16]

# # RECOMMENDATION:
# # Grocery behavior is shallow. 2 layers usually wins.
# architectures = [
#     [32, 16],
#     [64, 32],
#     [128, 64]
# ]

# # RECOMMENDATION:
# # Use ReLU unless you observe dead neurons.
# activations = ["relu"]

# # RECOMMENDATION:
# # Sigmoid for single-item probability.
# output_activation = "sigmoid"

# # RECOMMENDATION:
# # Epoch count is NOT a design decision. Early stopping decides.
# max_epochs = 50

# base_experiment_dir = "experiments"



# # ============================================================
# # SINGLE EXPERIMENT RUN
# # ============================================================

# def run_experiment(config):
#     (
#         embedding_dim,
#         dense_layers,
#         activation,
#         X_train_num,
#         X_val_num,
#         X_train_item,
#         X_val_item,
#         y_train,
#         y_val,
#         num_items
#     ) = config

#     exp_name = (
#         f"emb{embedding_dim}"
#         f"_layers{'-'.join(map(str, dense_layers))}"
#         f"_act{activation}"
#         f"_epochs{max_epochs}"
#     )

#     exp_dir = os.path.join(base_experiment_dir, exp_name)
#     os.makedirs(exp_dir, exist_ok=True)

#     model = build_model(
#         num_numeric_features=X_train_num.shape[1],
#         num_items=num_items,
#         embedding_dim=embedding_dim,
#         dense_layers=dense_layers,
#         activation=activation
#     )

#     early_stop = callbacks.EarlyStopping(
#         monitor="val_loss",
#         patience=5,
#         restore_best_weights=True
#     )

#     history = model.fit(
#         [X_train_num, X_train_item],
#         y_train,
#         validation_data=([X_val_num, X_val_item], y_val),
#         epochs=max_epochs,
#         callbacks=[early_stop],
#         verbose=0
#     )

#     # ============================================================
#     # SAVE EVERYTHING NEEDED FOR FORENSICS OR REUSE
#     # ============================================================

#     model.save(os.path.join(exp_dir, "model"))
#     model.save_weights(os.path.join(exp_dir, "weights.h5"))

#     with open(os.path.join(exp_dir, "history.json"), "w") as f:
#         json.dump(history.history, f, indent=2)

#     with open(os.path.join(exp_dir, "hyperparams.json"), "w") as f:
#         json.dump(
#             {
#                 "embedding_dim": embedding_dim,
#                 "dense_layers": dense_layers,
#                 "activation": activation,
#                 "output_activation": output_activation,
#                 "max_epochs": max_epochs
#             },
#             f,
#             indent=2
#         )

#     with open(os.path.join(exp_dir, "summary.txt"), "w") as f:
#         model.summary(print_fn=lambda x: f.write(x + "\n"))

#     return exp_name


# # ============================================================
# # PARALLEL ORCHESTRATION
# # ============================================================

# def run_all_experiments(
#     X_train_num,
#     X_val_num,
#     X_train_item,
#     X_val_item,
#     y_train,
#     y_val,
#     num_items
# ):
#     os.makedirs(base_experiment_dir, exist_ok=True)

#     jobs = []

#     for embedding_dim in embedding_sizes:
#         for dense_layers in architectures:
#             for activation in activations:
#                 jobs.append(
#                     (
#                         embedding_dim,
#                         dense_layers,
#                         activation,
#                         X_train_num,
#                         X_val_num,
#                         X_train_item,
#                         X_val_item,
#                         y_train,
#                         y_val,
#                         num_items
#                     )
#                 )

#     # RECOMMENDATION:
#     # Use processes, not threads: each worker process gets its own
#     # TensorFlow runtime state, so concurrent runs stay isolated.
#     with mp.Pool(processes=mp.cpu_count()) as pool:
#         results = pool.map(run_experiment, jobs)

#     return results


# # ============================================================
# # ENTRY POINT
# # ============================================================

# # Example call (you already have these objects):
# # run_all_experiments(
# #     X_train_num,
# #     X_val_num,
# #     X_train_item,
# #     X_val_item,
# #     y_train,
# #     y_val,
# #     num_items
# # )
