In [1]:


import pandas as pd
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import os
from datetime import datetime
import asyncio
import json

import gc
import tensorflow as tf
from tensorflow.keras import layers, models

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from temporal_features import TemporalFeatures
from holiday_features import HolidayFeatures
from wallmart_rcpt_parser import WallmartRecptParser
from winn_dixie_recpt_parser import WinnDixieRecptParser 
from hidden_layer_param_builder import HiddenLayerParamSetBuilder
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", lambda x: f"{x:.6f}")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)

print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

C:\Users\steve\source\repos\grocery-ml
GPUs Available: []


In [2]:

def export_df_to_excel_table(df, file_path, sheet_name="Data"):
    """
    Export a pandas DataFrame to an Excel file as a proper Excel Table
    with no duplicated header rows.
    """
    from openpyxl import load_workbook
    from openpyxl.worksheet.table import Table, TableStyleInfo

    df.to_excel(file_path, sheet_name=sheet_name, index=False)

    workbook = load_workbook(file_path)
    worksheet = workbook[sheet_name]

    end_row = worksheet.max_row
    end_col = worksheet.max_column
    end_col_letter = worksheet.cell(row=1, column=end_col).column_letter

    table_ref = f"A1:{end_col_letter}{end_row}"
    table = Table(displayName="DataTable", ref=table_ref)

    style = TableStyleInfo(
        name="TableStyleMedium9",
        showFirstColumn=False,
        showLastColumn=False,
        showRowStripes=True,
        showColumnStripes=False
    )

    table.tableStyleInfo = style
    worksheet.add_table(table)

    workbook.save(file_path)

###########################################################################################




def normalizeAndDropCols(df, cols):
    for col in cols:
        # Replace the sentinel 999 with NaN so it doesn't distort mean/std
        df[col] = df[col].replace(999, np.nan)

        # Compute mean/std ignoring NaN
        mean = df[col].mean()
        std  = df[col].std() or 1.0

        # Normalize
        df[col + "_norm"] = (df[col] - mean) / std

        # After normalization: missing values become 0 (neutral)
        df[col + "_norm"] = df[col + "_norm"].fillna(0.0)

    return df.drop(columns=cols)


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)



def canonicalize_items(df, patterns, canonical_name):
    """
    For each pattern in `patterns`, find rows where `item` contains the pattern
    and replace df['item'] with `canonical_name`.
    """
    for p in patterns:
        mask = df["item"].str.contains(p, case=False, na=False)
        df.loc[mask, "item"] = canonical_name


In [3]:
# --- WEATHER PREP ---
weatherCols=["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv("datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv", usecols=weatherCols)

df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime").sort_index()

df_weather["temp_5day_avg"] = df_weather["temp"].rolling(5, min_periods=1).mean()
df_weather["feelsLike_5day_avg"] = df_weather["feelslike"].rolling(5, min_periods=1).mean()
df_weather["dew_5day_avg"] = df_weather["dew"].rolling(5, min_periods=1).mean()
df_weather["humidity_5day_avg"] = df_weather["humidity"].rolling(5, min_periods=1).mean()
df_weather["precip_5day_avg"] = df_weather["precip"].rolling(5, min_periods=1).mean()

df_weather = df_weather.drop(columns=["temp", "humidity", "feelslike", "dew", "precip"])

# convert index to date for merging
df_weather["date"] = df_weather.index.date
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather = df_weather.set_index("date")


In [4]:
rows = []

recptParser  = WinnDixieRecptParser();

for p in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    for r in result["items"]:
        rows.append({
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            #"manager": result["manager"],
            #"cashier": result["cashier"],
            "item": r["item"]
            #"qty": r["qty"],
            #"reg": r["reg"],
            #"youPay": r["youPay"],
            #"reportedItemsSold": result["reported"],
            #"rowsMatchReported": result["validation"]["rowsMatchReported"],
            #"qtyMatchReported": result["validation"]["qtyMatchReported"],
        })

winndixie_df = pd.DataFrame(rows)

winndixie_df["date"] = pd.to_datetime(winndixie_df["date"])
winndixie_df["time"] = winndixie_df["time"].astype(str)

winndixie_df = WinnDixieRecptParser.remove_duplicate_receipt_files(winndixie_df)

winndixie_df = winndixie_df.sort_values(by=["date", "time"]).reset_index(drop=True)
winndixie_df = winndixie_df.drop(columns=["time"])

DUP: 2025-08-02 00:00:00 10:29 PM → keep IMG_9693.txt ← drop IMG_9694.txt
DUP: 2025-10-07 00:00:00 6:06 PM → keep IMG_0017.txt ← drop IMG_9669.txt
DUP: 2025-10-14 00:00:00 4:06 PM → keep IMG_0015.txt ← drop IMG_9667.txt
DUP: 2025-10-14 00:00:00 6:08 PM → keep IMG_0014.txt ← drop IMG_9666.txt
DUP: 2025-10-17 00:00:00 9:18 PM → keep IMG_0013.txt ← drop IMG_9664.txt


In [5]:

wallmart_raw = WallmartRecptParser.ImportWallMart("./walmart")

## rename cols
wallmart_df = wallmart_raw[["Order Date","Product Description", "source"]].copy()
wallmart_df = wallmart_df.rename(columns={
    "Order Date": "date",
    "Product Description": "item"
})

wallmart_df["date"] = pd.to_datetime(wallmart_df["date"])
winndixie_df["date"] = pd.to_datetime(winndixie_df["date"])

combined_df = pd.concat(
    [winndixie_df, wallmart_df[["date", "item", "source"]]],
    ignore_index=True
)

# remove - 
combined_df["item"] = (combined_df["item"]
        .str.replace(r"^\s*[-–—]\s*", "", regex=True)
        .str.strip()
)



In [6]:
milk_patterns = ["know-and-love-milk", "kandl-milk", "prairie-farm-milk","kleinpeter-milk", "kl-milk", "Milk, Fat Free,", "Fat-Free Milk"]
canonicalize_items(combined_df, milk_patterns, "milk")

bread_patterns = ["bunny-bread","se-grocers-bread","seg-sandwich-bread", "seg-white-bread"]
canonicalize_items(combined_df, bread_patterns, "bread")

cheese_patterns = ["dandw-cheese", "kraft-cheese", "se-grocers-cheese", "know-and-love-cheese"]
canonicalize_items(combined_df, cheese_patterns, "cheese")

mayo_patterns = ["blue-plate-mayo", "blue-plate-mynnase"]
canonicalize_items(combined_df, mayo_patterns, "mayo")

chicken_patterns = ["chicken-cutlet", "chicken-leg", "chicken-thigh", "chicken-thighs"]
canonicalize_items(combined_df, chicken_patterns, "chicken")

yogurt_patterns = ["chobani-yogrt-flip", "chobani-yogurt"]
canonicalize_items(combined_df, yogurt_patterns, "yogurt")

coke_patterns = ["coca-cola", "coca-cola-cola", "cocacola-soda", "coke", "cola"]
canonicalize_items(combined_df, coke_patterns, "coke")

hugbi_patterns = ["hugbi-pies", "-hugbi-pies"]
canonicalize_items(combined_df, hugbi_patterns, "hugbi-pies")

ceralPaterns  = ["ceral"]
canonicalize_items(combined_df, ceralPaterns, "ceral")

minute_maid_patterns = ["minute-maid-drink", "minute-maid-drinks", "minute-maid-lmnade"]
canonicalize_items(combined_df, minute_maid_patterns, "minute-maid-drink")

eggs_pattern = ["egglands-best-egg", "egglands-best-eggs", "eggs"]
canonicalize_items(combined_df, eggs_pattern, "eggs")



In [7]:
### CREATE ITEM IDs
unique_items = sorted(combined_df["item"].unique())
item_to_id = {item: idx for idx, item in enumerate(unique_items)}
id_to_item = {idx: item for item, idx in item_to_id.items()}
combined_df["itemId"] = combined_df["item"].map(item_to_id)
combined_df.reset_index(drop=True, inplace=True)
combined_df.info()
combined_df.head(100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   source  1436 non-null   object        
 1   date    1436 non-null   datetime64[ns]
 2   item    1436 non-null   object        
 3   itemId  1436 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 45.0+ KB


Unnamed: 0,source,date,item,itemId
0,IMG_9764.txt,2024-11-15,spaghettios-pasta,695
1,IMG_9764.txt,2024-11-15,coke,439
2,IMG_9764.txt,2024-11-15,mandms-candies,548
3,IMG_9764.txt,2024-11-15,mandarins,547
4,IMG_9764.txt,2024-11-15,sparkling-ice-wtr,696
5,IMG_9764.txt,2024-11-15,sparkling-ice-wtr,696
6,IMG_9764.txt,2024-11-15,coke,439
7,IMG_9763.txt,2024-11-22,hersheys-syrup,491
8,IMG_9763.txt,2024-11-22,mayfield-icecream,551
9,IMG_9762.txt,2024-11-24,bread,411


In [8]:
# ============================================================
# Build full receipt × item table WITHOUT using qty
# ============================================================

# 1. Mark actual purchases in the raw receipt rows
combined_df["didBuy"] = 1

# 2. Build complete grid
all_items = combined_df["itemId"].unique()
all_dates = combined_df["date"].unique()

full = (
    pd.MultiIndex.from_product(
        [all_dates, all_items], 
        names=["date", "itemId"]
    ).to_frame(index=False)
)

# 3. Merge raw purchases onto the full grid
df_full = full.merge(
    combined_df[["date", "itemId", "item", "source", "didBuy"]],
    on=["date", "itemId"],
    how="left"
)

# 4. Fill missing purchases with didBuy=0
df_full["didBuy"] = df_full["didBuy"].fillna(0).astype(int)

# 5. NOW REPLACE combined_df with df_full
combined_df = df_full.copy()

In [9]:
# 1. Build grouped table (one row per trip date)

grouped = ( combined_df[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)



grouped["daysSinceLastTrip"] = TemporalFeatures.DaysSinceLastTrip(grouped)
grouped["avgDaysBetweenTrips"] = TemporalFeatures.AvgDaysBetweenTrips(grouped)

# 3. Holiday / School features
grouped["daysUntilNextHoliday"] = grouped["date"].apply(HolidayFeatures.daysUntilNextHoliday)
grouped["daysSinceLastHoliday"] = grouped["date"].apply(HolidayFeatures.daysSinceLastHoliday)
grouped["holidayProximityIndex"] = grouped["date"].apply(HolidayFeatures.holidayProximityIndex)
grouped["daysUntilSchoolStart"] = grouped["date"].apply(HolidayFeatures.daysUntilSchoolStart)
grouped["daysUntilSchoolEnd"]   = grouped["date"].apply(HolidayFeatures.daysUntilSchoolEnd)
grouped["schoolSeasonIndex"]    = grouped["date"].apply(HolidayFeatures.schoolSeasonIndex)


grouped = TemporalFeatures.CreateDateFeatures(grouped)

# merge in weather
grouped = grouped.merge(df_weather, on="date", how="left")

combined_df = combined_df.merge(grouped, on="date", how="left")
combined_df.info()
combined_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121240 entries, 0 to 121239
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   date                   121240 non-null  datetime64[ns]
 1   itemId                 121240 non-null  int64         
 2   item                   1436 non-null    object        
 3   source                 1436 non-null    object        
 4   didBuy                 121240 non-null  int32         
 5   daysSinceLastTrip      121240 non-null  float64       
 6   avgDaysBetweenTrips    121240 non-null  float64       
 7   daysUntilNextHoliday   121240 non-null  int64         
 8   daysSinceLastHoliday   121240 non-null  int64         
 9   holidayProximityIndex  121240 non-null  float64       
 10  daysUntilSchoolStart   121240 non-null  int64         
 11  daysUntilSchoolEnd     121240 non-null  int64         
 12  schoolSeasonIndex      121240 non-null  floa

Unnamed: 0,date,itemId,item,source,didBuy,daysSinceLastTrip,avgDaysBetweenTrips,daysUntilNextHoliday,daysSinceLastHoliday,holidayProximityIndex,daysUntilSchoolStart,daysUntilSchoolEnd,schoolSeasonIndex,year,month,day,dow,doy,quarter,temp_5day_avg,feelsLike_5day_avg,dew_5day_avg,humidity_5day_avg,precip_5day_avg
0,2024-11-15,695,spaghettios-pasta,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
1,2024-11-15,439,coke,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
2,2024-11-15,439,coke,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
3,2024-11-15,548,mandms-candies,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
4,2024-11-15,547,mandarins,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
5,2024-11-15,696,sparkling-ice-wtr,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
6,2024-11-15,696,sparkling-ice-wtr,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
7,2024-11-15,491,,,0,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
8,2024-11-15,551,,,0,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
9,2024-11-15,411,,,0,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162


In [10]:
 def fill_freq(group):
        group = group.copy()
        group = group.sort_values("date").reset_index(drop=True)
    
        history = []
    
        col_date = group.columns.get_loc("date")
        col_buy = group.columns.get_loc("didBuy")
        col_freq = {w: group.columns.get_loc(f"freq_{w}") for w in freq_windows}
    
        for i in range(len(group)):
            cur_date = group.iat[i, col_date]
    
            # record purchase
            if group.iat[i, col_buy] == 1:
                history.append(cur_date)
    
            # prune history ONCE using largest window
            cutoff_max = cur_date - pd.Timedelta(days=max_w)
            history = [d for d in history if d >= cutoff_max]
    
            # compute windowed counts
            for w in freq_windows:
                cutoff = cur_date - pd.Timedelta(days=w)
                count = 0
                for d in history:
                    if d >= cutoff:
                        count += 1
                group.iat[i, col_freq[w]] = count
    
        return group
####################################################################

freq_windows = [7, 15, 30, 90, 365]
max_w = max(freq_windows)

# initialize columns
for w in freq_windows:
    combined_df[f"freq_{w}"] = np.nan

combined_df = (
    combined_df
    .groupby("itemId", group_keys=False)
    .apply(fill_freq)
)


  .apply(fill_freq)


In [12]:
combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

combined_df["daysSinceLastPurchase"] = np.nan

# Track last purchase date per item
last_purchase_date = {}

for i in range(len(combined_df)):
    itemId = combined_df.at[i, "itemId"]
    current_date = combined_df.at[i, "date"]

    if itemId in last_purchase_date:
        combined_df.at[i, "daysSinceLastPurchase"] = (
            current_date - last_purchase_date[itemId]
        ).days
    else:
        combined_df.at[i, "daysSinceLastPurchase"] = 0  # or 999 if you prefer

    if combined_df.at[i, "didBuy"] == 1:
        last_purchase_date[itemId] = current_date


####################

combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

# Purchase-to-purchase gaps
purchase_gap = (
    combined_df
        .where(combined_df["didBuy"] == 1)
        .groupby("itemId")["date"]
        .diff()
        .dt.days
)

# Expanding (lifetime-so-far) average per item
avg_gap = (
    purchase_gap
        .groupby(combined_df["itemId"])
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
)

# Attach + forward-fill so ALL rows have a value
combined_df["avgDaysBetweenPurchases"] = (
    avg_gap
        .groupby(combined_df["itemId"])
        .ffill()
        .fillna(0)
)

export_df_to_excel_table(combined_df, "combined_df.xlsx", "combined_df") 
##combined_df.to_csv("combined_df.csv")

In [16]:
# # ============================================================
# # INCREASING DAILY daysSinceLastPurchase (resets on purchase)
# # ============================================================
# def fill_item(group):
#     group = group.copy()
#     # iterate row-by-row using positional index
#     for i in range(1, len(group)):
#         if pd.isna(group.iat[i, group.columns.get_loc("daysSinceLastPurchase")]):
#             prev_val = group.iat[i-1, group.columns.get_loc("daysSinceLastPurchase")]
#             trip_gap = group.iat[i, group.columns.get_loc("daysSinceLastTrip")]
#             group.iat[i, group.columns.get_loc("daysSinceLastPurchase")] = prev_val + trip_gap
#     return group
# ##########################################################################################

# combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

# # Start with NaN everywhere
# combined_df["daysSinceLastPurchase"] = np.nan

# # Set 0 on purchase days
# combined_df.loc[combined_df["didBuy"] == 1, "daysSinceLastPurchase"] = 0
# combined_df = combined_df.groupby("itemId", group_keys=False).apply(fill_item)

# # Items with no purchase history get 999
# combined_df["daysSinceLastPurchase"] = combined_df["daysSinceLastPurchase"].fillna(999)
# combined_df.to_csv("daysSinceLastPurchase.csv", index=False)

In [17]:
# ============================================================
# ITEM-LEVEL HABIT FEATURES (TF-IDF ANALOG)
# ============================================================
import numpy as np
import pandas as pd

def build_habit_features(df, tau_days=120):
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    total_trips = df["date"].nunique()
    timeline_days = (df["date"].max() - df["date"].min()).days or 1

    rows = []

    for itemId, g in df.groupby("itemId"):
        buys = g[g["didBuy"] == 1]["date"]

        if len(buys) == 0:
            rows.append({
                "itemId": itemId,
                "habitFrequency": 0.0,
                "habitSpan": 0.0,
                "habitDecay": 0.0,
            })
            continue

        first = buys.min()
        last = buys.max()

        habitFrequency = len(buys) / total_trips
        habitSpan = (last - first).days / timeline_days
        days_since_last = (df["date"].max() - last).days
        habitDecay = np.exp(-days_since_last / tau_days)

        rows.append({
            "itemId": itemId,
            "habitFrequency": habitFrequency,
            "habitSpan": habitSpan,
            "habitDecay": habitDecay,
        })

    return pd.DataFrame(rows)
###############################################################################


def compute_due_score(df,itemId=None,use_sigmoid=True,normalize=False, weights=None):
    """
    Compute due_score from RAW (non-normalized) features.

    Required columns:
      - itemId
      - daysSinceLastPurchase
      - freq_30
      - freq_90

    Parameters
    ----------
    df : DataFrame

    itemId : int | None
        If provided, compute only for this itemId.
        If None, compute for all items.

    use_sigmoid : bool
        Apply sigmoid → (0,1)

    normalize : bool
        Z-normalize instead (ignored if use_sigmoid=True)

    weights : dict | None
        Optional override for feature weights
    """

    if weights is None:
        weights = {
            "daysSinceLastPurchase": 1.5,
            "freq_30": 1.0,
            "freq_90": 0.5
        }

    # --------------------------------------------------------
    # Optional itemId filter
    # --------------------------------------------------------
    if itemId is not None:
        df = df[df["itemId"] == itemId].copy()
    else:
        df = df.copy()

    # --------------------------------------------------------
    # RAW linear score (pre-normalization)
    # --------------------------------------------------------
    df["due_score_raw"] = (
        weights["daysSinceLastPurchase"] * df["daysSinceLastPurchase"]
      + weights["freq_30"]              * df["freq_30"]
      + weights["freq_90"]              * df["freq_90"]
    )

    # --------------------------------------------------------
    # Final due_score
    # --------------------------------------------------------
    if use_sigmoid:
        df["due_score"] = 1 / (1 + np.exp(-df["due_score_raw"]))

    elif normalize:
        mean = df["due_score_raw"].mean()
        std  = df["due_score_raw"].std() or 1.0
        df["due_score"] = (df["due_score_raw"] - mean) / std

    else:
        df["due_score"] = df["due_score_raw"]

    return df
###############################################################################


# ============================================================
# MERGE HABIT FEATURES
# ============================================================
habit_df = build_habit_features(combined_df)

combined_df = combined_df.merge(habit_df, on="itemId",how="left")

combined_df[["habitFrequency", "habitSpan", "habitDecay"]] = (
    combined_df[["habitFrequency", "habitSpan", "habitDecay"]].fillna(0.0)
)



In [22]:
def compute_due_ratio(df, cap=3.0):
    ratio = df["daysSinceLastPurchase"] / df["avgDaysBetweenPurchases"]
    ratio = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
    return ratio.clip(0, cap)
###############################################################################

combined_df["item_due_ratio"] = compute_due_ratio(combined_df)

combined_df["purchaseToTripRatio"] = combined_df["daysSinceLastPurchase"] / combined_df["avgDaysBetweenPurchases"]




In [23]:
# ============================================================
# NORMALIZE TO ENCODED_DF
# ============================================================

freq_cols = [c for c in combined_df.columns if c.startswith("freq_")]
weather_cols = [c for c in combined_df.columns if c.endswith("_5day_avg")]
holiday_cols = [c for c in combined_df.columns if "holiday" in c.lower()]
school_cols = [c for c in combined_df.columns if "school" in c.lower()]

daysSince_purchase_cols = [c for c in combined_df.columns if "days" in c.lower() and "purchase" in c.lower()]
daysSince_trip_cols     = [c for c in combined_df.columns if "days" in c.lower() and "trip" in c.lower()]

days_cols = daysSince_purchase_cols + daysSince_trip_cols


habit_cols = ["habitFrequency", "habitSpan", "habitDecay"]

encoded_df = combined_df.copy()

encoded_df = normalizeAndDropCols(encoded_df, ["item_due_ratio"])
encoded_df = normalizeAndDropCols(encoded_df, freq_cols)
encoded_df = normalizeAndDropCols(encoded_df, weather_cols)
encoded_df = normalizeAndDropCols(encoded_df, holiday_cols)
encoded_df = normalizeAndDropCols(encoded_df, school_cols)
encoded_df = normalizeAndDropCols(encoded_df, days_cols)
encoded_df = normalizeAndDropCols(encoded_df, habit_cols)

encoded_df.info()
encoded_df.head(100)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121240 entries, 0 to 121239
Data columns (total 36 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   date                          121240 non-null  datetime64[ns]
 1   itemId                        121240 non-null  int64         
 2   item                          1436 non-null    object        
 3   source                        1436 non-null    object        
 4   didBuy                        121240 non-null  int32         
 5   year                          121240 non-null  int32         
 6   month                         121240 non-null  int32         
 7   day                           121240 non-null  int32         
 8   dow                           121240 non-null  int32         
 9   doy                           121240 non-null  int32         
 10  quarter                       121240 non-null  int32         
 11  purchaseToTri

Unnamed: 0,date,itemId,item,source,didBuy,year,month,day,dow,doy,quarter,purchaseToTripRatio,item_due_ratio_norm,freq_7_norm,freq_15_norm,freq_30_norm,freq_90_norm,freq_365_norm,temp_5day_avg_norm,feelsLike_5day_avg_norm,dew_5day_avg_norm,humidity_5day_avg_norm,precip_5day_avg_norm,daysUntilNextHoliday_norm,daysSinceLastHoliday_norm,holidayProximityIndex_norm,daysUntilSchoolStart_norm,daysUntilSchoolEnd_norm,schoolSeasonIndex_norm,daysSinceLastPurchase_norm,avgDaysBetweenPurchases_norm,daysSinceLastTrip_norm,avgDaysBetweenTrips_norm,habitFrequency_norm,habitSpan_norm,habitDecay_norm
0,2023-01-13,0,,,0,2023,1,13,4,13,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,-0.966915,-0.948081,-0.892273,-0.018198,-0.745327,-0.929893,-0.504067,1.273771,0.238735,-0.585512,-1.501621,-0.661973,-0.202883,-0.613197,-2.856321,-0.202605,-0.395637,-0.933647
1,2023-01-25,0,,,0,2023,1,25,2,25,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,-1.554575,-1.427519,-1.340791,0.261472,-0.504926,0.094047,-0.60389,-0.973518,0.125935,-0.707701,-1.416669,-0.661973,-0.202883,0.514116,0.228435,-0.202605,-0.395637,-0.933647
2,2023-02-17,0,,,0,2023,2,17,4,48,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,-0.878942,-0.9,-0.634036,0.636885,-0.401612,-0.929893,0.544073,1.273771,-0.090266,-0.941897,-1.253845,-0.661973,-0.202883,1.547486,1.642281,-0.202605,-0.395637,-0.933647
3,2023-03-01,0,,,0,2023,3,1,2,60,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,0.206644,0.038268,0.432893,0.851047,-0.745327,2.898753,-0.60389,-0.973518,-0.203067,-1.064086,-1.168894,-0.661973,-0.202883,0.514116,1.170999,-0.202605,-0.395637,-0.933647
4,2023-03-02,0,,,0,2023,3,2,3,61,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,0.248871,0.07948,0.468571,0.803176,-0.745327,2.854233,-0.553978,-0.9267,-0.212467,-1.074269,-1.161815,-0.661973,-0.202883,-0.519255,0.228435,-0.202605,-0.395637,-0.933647
5,2023-03-04,0,,,0,2023,3,4,5,63,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,0.079963,-0.056521,0.07272,-0.038355,-0.733407,2.765195,-0.454155,-0.833063,-0.231267,-1.094633,-1.147656,-0.661973,-0.202883,-0.425312,-0.285691,-0.202605,-0.395637,-0.933647
6,2023-03-09,0,,,0,2023,3,9,3,68,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,0.099317,-0.038662,0.254505,0.583975,-0.745327,2.5426,-0.204598,-0.59897,-0.278267,-1.145546,-1.11226,-0.661973,-0.202883,-0.143484,-0.49991,-0.202605,-0.395637,-0.933647
7,2023-03-17,0,,,0,2023,3,17,4,76,1,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,-1.088318,-1.015395,-1.497093,-1.681103,0.43284,2.186446,0.194693,-0.224422,-0.353467,-1.227005,-1.055625,-0.661973,-0.202883,0.138345,-0.542754,-0.202605,-0.395637,-0.933647
8,2023-04-07,0,,,0,2023,4,7,4,97,2,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,0.435373,0.285542,0.720012,1.055131,-0.57049,1.251545,1.242832,0.009671,-0.550868,-1.440836,-0.90696,-0.661973,-0.202883,1.359601,-0.157159,-0.202605,-0.395637,-0.933647
9,2023-05-27,0,,,0,2023,5,27,5,147,2,,-0.23653,-0.160067,-0.194098,-0.232129,-0.287334,-0.301619,0.563814,0.31439,0.115193,-1.424108,-0.694664,-0.974412,3.738402,1.32059,-1.02087,-1.949957,-0.552995,-0.661973,-0.202883,4.083941,0.971061,-0.202605,-0.395637,-0.933647


In [24]:
# ---------- CYCLICAL FEATURES ----------
# encoded_df["dow_sin"]   = np.sin(2 * np.pi * encoded_df["dow"] / 7.0)
# encoded_df["dow_cos"]   = np.cos(2 * np.pi * encoded_df["dow"] / 7.0)
# encoded_df["month_sin"] = np.sin(2 * np.pi * encoded_df["month"] / 12.0)
# encoded_df["month_cos"] = np.cos(2 * np.pi * encoded_df["month"] / 12.0)
# encoded_df["doy_sin"]   = np.sin(2 * np.pi * encoded_df["doy"] / 365.0)
# encoded_df["doy_cos"]   = np.cos(2 * np.pi * encoded_df["doy"] / 365.0)

encoded_df["dow_sin"], encoded_df["dow_cos"] = TemporalFeatures.encode_sin_cos( encoded_df["dow"], 7.0)
encoded_df["month_sin"], encoded_df["month_cos"] = TemporalFeatures.encode_sin_cos(encoded_df["month"], 12.0)
encoded_df["doy_sin"], encoded_df["doy_cos"] = TemporalFeatures.encode_sin_cos(encoded_df["doy"], 365.0)

encoded_df = encoded_df.drop(columns=["dow", "month", "doy"], errors="ignore")

# ---------- NON-CYCLIC TIME FEATURES ----------
nonCycCols = ["year", "day", "quarter"]
encoded_df = normalizeAndDropCols(encoded_df, nonCycCols)

# ---------- DROP NON-MODEL COLS ----------
cols_to_drop = ["source", "item", "date"]
encoded_df = encoded_df.drop(columns=cols_to_drop, errors="ignore")

encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121240 entries, 0 to 121239
Data columns (total 36 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   itemId                        121240 non-null  int64  
 1   didBuy                        121240 non-null  int32  
 2   purchaseToTripRatio           68196 non-null   float64
 3   item_due_ratio_norm           121240 non-null  float64
 4   freq_7_norm                   121240 non-null  float64
 5   freq_15_norm                  121240 non-null  float64
 6   freq_30_norm                  121240 non-null  float64
 7   freq_90_norm                  121240 non-null  float64
 8   freq_365_norm                 121240 non-null  float64
 9   temp_5day_avg_norm            121240 non-null  float64
 10  feelsLike_5day_avg_norm       121240 non-null  float64
 11  dew_5day_avg_norm             121240 non-null  float64
 12  humidity_5day_avg_norm        121240 non-nul

In [25]:
encoded_df["due_score"] = (
    1.5 * encoded_df["daysSinceLastPurchase_norm"]
  + 1.0 * encoded_df["freq_30_norm"]
  ##+ 0.5 * encoded_df["freq_90_norm"]
)

####encoded_df["due_score"] = 1 / (1 + np.exp(-encoded_df["due_score"]))

encoded_df.info()

export_df_to_excel_table(encoded_df, "encoded_df.xlsx", "encoded_df") 
#encoded_df.to_csv("encoded.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121240 entries, 0 to 121239
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   itemId                        121240 non-null  int64  
 1   didBuy                        121240 non-null  int32  
 2   purchaseToTripRatio           68196 non-null   float64
 3   item_due_ratio_norm           121240 non-null  float64
 4   freq_7_norm                   121240 non-null  float64
 5   freq_15_norm                  121240 non-null  float64
 6   freq_30_norm                  121240 non-null  float64
 7   freq_90_norm                  121240 non-null  float64
 8   freq_365_norm                 121240 non-null  float64
 9   temp_5day_avg_norm            121240 non-null  float64
 10  feelsLike_5day_avg_norm       121240 non-null  float64
 11  dew_5day_avg_norm             121240 non-null  float64
 12  humidity_5day_avg_norm        121240 non-nul

# TRAIN / BUILD MODEL

In [30]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

###############################################

def export_df(dataframes, dir):
    for name, df in dataframes.items():
        csv_path = os.path.join(dir, f"{name}.csv")
        xlsxPath = os.path.join(dir, f"{name}.xlsx")
        df.to_csv(csv_path, index=True)
        export_df_to_excel_table(df, xlsxPath, sheet_name=f"{name}")


###############################################

def save_experiment(model, history, dataframes, build_params, train_params, feature_cols, item_id_to_idx, base_dir):
    name_parts = []

    if "embedding_dim" in build_params:
        name_parts.append(f"emb{build_params['embedding_dim']}")

    if "layers" in build_params:
        hl = "-".join(str(x) for x in build_params["layers"])
        name_parts.append(f"hl{hl}")

    if "epochs" in train_params:
        name_parts.append(f"ep{train_params['epochs']}")

    exp_name = "__".join(name_parts) if name_parts else "exp_unlabeled"
    exp_dir = os.path.join(base_dir, exp_name)
    os.makedirs(exp_dir, exist_ok=True)

    export_df(dataframes, exp_dir)

    model.save(os.path.join(exp_dir, "model"))
    model.save_weights(os.path.join(exp_dir, "weights.h5"))

    history_path = os.path.join(exp_dir, "history.json")
    history_file = open(history_path, "w")
    json.dump(history.history, history_file, indent=2)
    history_file.close()

    feature_path = os.path.join(exp_dir, "feature_cols.json")
    feature_file = open(feature_path, "w")
    json.dump(feature_cols, feature_file, indent=2)
    feature_file.close()

    item_map_path = os.path.join(exp_dir, "item_id_to_idx.json")
    item_map_file = open(item_map_path, "w")
    json.dump({str(int(k)): int(v) for k, v in item_id_to_idx.items()}, item_map_file, indent=2)
    item_map_file.close()

    build_params_path = os.path.join(exp_dir, "build_params.json")
    build_params_file = open(build_params_path, "w")
    json.dump(build_params, build_params_file, indent=2)
    build_params_file.close()

    train_params_path = os.path.join(exp_dir, "train_params.json")
    train_params_file = open(train_params_path, "w")
    json.dump(train_params, train_params_file, indent=2)
    train_params_file.close()

    print("Saved experiment →", exp_dir)

###############################################

def build_and_compile_model(featColsCount, itemCount, params):
    num_in = layers.Input(shape=(featColsCount,))
    item_in = layers.Input(shape=(), dtype="int32")

    emb = layers.Embedding(
        input_dim=itemCount,
        output_dim=params["embedding_dim"]
    )(item_in)

    x = layers.Concatenate()([num_in, layers.Flatten()(emb)])
    
    for neuronCount in params["layers"]:
        x = layers.Dense(neuronCount, activation=params["activation"])(x)

    out = layers.Dense(1, activation=params["output_activation"])(x)

    model = models.Model([num_in, item_in], out)

    optimizer_name = params.get("optimizer", "adam")
    learning_rate = params.get("learning_rate", 0.001)

    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == "adamw":
        optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    model.compile(
        optimizer=optimizer,
        loss=params.get("loss", "mse"),
        metrics=params.get("metrics", ["mae"])
    )

    return model

###############################################

def get_feature_cols(encoded_df):
    feature_cols = []
    for c in encoded_df.columns:
        if c == "due_score":
            continue
        if c.endswith("_norm") or c.endswith("_sin") or c.endswith("_cos"):
            feature_cols.append(c)
    return feature_cols

###############################################

def train_model(model, encoded_df, feature_cols, params):

    featureCols = encoded_df[feature_cols].to_numpy(np.float32)
    indexCol = encoded_df["itemIdx"].to_numpy(np.int32)
    targetVar = encoded_df["due_score"].to_numpy(np.float32)

    featureCols_tr, featureCols_te, indexCol_tr, indexCol_te, targetVar_tr, targetVar_te = train_test_split(
        featureCols, indexCol, targetVar, test_size=0.2, random_state=42
    )

    history = model.fit(
        [featureCols_tr, indexCol_tr],
        targetVar_tr,
        validation_split=params["validation_split"],
        epochs=params["epochs"],
        batch_size=params["batch_size"],
        verbose=1
    )

    return history

###############################################

def build_prediction_df(encoded_df, combined_df, predict_date):

    if predict_date is None:
        predict_date = pd.Timestamp.today().normalize()
    else:
        predict_date = pd.to_datetime(predict_date).normalize()

    last_trip_date_by_item = (
        combined_df.sort_values("date")
        .groupby("itemId")["date"]
        .last()
    )

    last_purchase_date_by_item = (
        combined_df[combined_df["didBuy"] == 1]
        .sort_values("date")
        .groupby("itemId")["date"]
        .last()
    )

    item_lookup = (
    combined_df
    .dropna(subset=["item"])
    .groupby("itemId")["item"]
    .last()
    .to_dict()
    )

    rows = []

    for itemId, hist in encoded_df.groupby("itemId"):
        last = hist.iloc[-1].copy()
        row = last.to_dict()

        row["itemId"] = itemId
        row["item"] = item_lookup.get(itemId, "UNKNOWN")

        last_trip_date = pd.to_datetime(last_trip_date_by_item.loc[itemId]).normalize()
        row["daysSinceLastTrip_norm"] = (predict_date - last_trip_date).days

        if itemId in last_purchase_date_by_item.index:
            last_purchase_date = pd.to_datetime(last_purchase_date_by_item.loc[itemId]).normalize()
            row["daysSinceLastPurchase_norm"] = (predict_date - last_purchase_date).days

        row["daysUntilNextHoliday_norm"] = HolidayFeatures.daysUntilNextHoliday(predict_date)
        row["daysSinceLastHoliday_norm"] = HolidayFeatures.daysSinceLastHoliday(predict_date)
        row["holidayProximityIndex_norm"] = HolidayFeatures.holidayProximityIndex(predict_date)
        row["daysUntilSchoolStart_norm"] = HolidayFeatures.daysUntilSchoolStart(predict_date)
        row["daysUntilSchoolEnd_norm"] = HolidayFeatures.daysUntilSchoolEnd(predict_date)
        row["schoolSeasonIndex_norm"] = HolidayFeatures.schoolSeasonIndex(predict_date)

        row["year_norm"] = float(predict_date.year)
        row["day_norm"] = float(predict_date.day)
        row["quarter_norm"] = float(predict_date.quarter)

        row["dow_sin"], row["dow_cos"] = TemporalFeatures.encode_sin_cos(predict_date.weekday(), 7)
        row["month_sin"], row["month_cos"] = TemporalFeatures.encode_sin_cos(predict_date.month, 12)
        row["doy_sin"], row["doy_cos"] = TemporalFeatures.encode_sin_cos(predict_date.dayofyear, 365)

        rows.append(row)

    return pd.DataFrame(rows)

###############################################

def run_predictions(model, encoded_df, combined_df, feature_cols, predict_date=None):

    pred_df = build_prediction_df(encoded_df, combined_df, predict_date)

    featureCols = pred_df[feature_cols].to_numpy(np.float32)
    indexCol = pred_df["itemIdx"].to_numpy(np.int32)

    scores = model.predict([featureCols, indexCol], verbose=0).ravel()
    pred_df["due_intensity"] = scores

    return (
        pred_df[["itemId", "item", "due_intensity"]]
        .sort_values("due_intensity", ascending=False)
        .reset_index(drop=True)
    )

###############################################

def runExp(combined_df, encoded_df, buildParams, trainParams, baseDir, tripDate=None):
    item_ids = sorted(encoded_df["itemId"].unique())
    item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
    encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")

    feature_cols = get_feature_cols(encoded_df)

    model = build_and_compile_model(len(feature_cols), len(item_ids), buildParams)
    history = train_model(model, encoded_df, feature_cols, trainParams)

    predictions = run_predictions( model, encoded_df, combined_df, feature_cols, predict_date=tripDate)

    dataframes = {
        "predictions": predictions,
        "encoded_features": encoded_df,
        "combined_df": combined_df
    }

    save_experiment(model, history, dataframes, buildParams, trainParams, feature_cols, item_id_to_idx, base_dir=baseDir)

###############################################

def runPredictionOnly(modelDir, combined_df, encoded_df, predict_date, baseDir):

    model = tf.keras.models.load_model(os.path.join(modelDir, "model"))
    feature_cols = get_feature_cols(encoded_df)
    item_map_path = os.path.join(modelDir, "item_id_to_idx.json")
    with open(item_map_path, "r") as f:
        item_id_to_idx = {int(k): int(v) for k, v in json.load(f).items()}
    encoded_df = encoded_df.copy()
    encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
    
    predictions = run_predictions(model,encoded_df,combined_df,feature_cols,predict_date=predict_date)

    #exp_dir = os.path.join(baseDir, f"predict_{predict_date}")
    predict_date_str = pd.to_datetime(predict_date).strftime("%Y-%m-%d")
    exp_dir = os.path.join(baseDir, f"predict_{predict_date_str}")
    os.makedirs(exp_dir, exist_ok=True)

    predictions.to_csv(os.path.join(exp_dir, "predictions.csv"), index=False)

    print("Saved prediction →", exp_dir)


In [None]:
# ENTRY POINT

modelParamsList = []

params1 = {
    "trainParams": {
        "epochs": 30,
        "batch_size": 32,
        "validation_split": 0.1
    },
    "buildParams": {
        "embedding_dim": 32,
        "layers": [128, 64, 32],
        "activation": "relu",
        "output_activation": "linear",
        "optimizer": "adam",
        "learning_rate": 0.0001,
        "loss": "mse",
        "metrics": ["mae"]
    }
}

modelParamsList.append(params1)

params2 = {
    "trainParams": {
        "epochs": 30,
        "batch_size": 32,
        "validation_split": 0.1
    },
    "buildParams": {
        "embedding_dim": 5,
        "layers": [128, 64, 32],
        "activation": "relu",
        "output_activation": "linear",
        "optimizer": "adam",
        "learning_rate": 0.0001,
        "loss": "mse",
        "metrics": ["mae"]
    }
}

modelParamsList.append(params2)


params = {
    "trainParams": {
        "epochs": 50,
        "batch_size": 32,
        "validation_split": 0.1
    },
    "buildParams": {
        "embedding_dim": 32,
        "layers": [128, 64, 32],
        "activation": "relu",
        "output_activation": "linear",
        "optimizer": "adam",
        "learning_rate": 0.0001,
        "loss": "mse",
        "metrics": ["mae"]
    }
}
modelParamsList.append(params)

ts = datetime.now().strftime("%Y%m%d_%H%M%S")
for modelParams in modelParamsList:
    print(f"{modelParams['buildParams']['layers']}")
    runExp(combined_df, encoded_df, modelParams["buildParams"], modelParams["trainParams"], f"exp/keras/{ts}")


[128, 64, 32]
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30

In [None]:


runPredictionOnly(
    "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
    combined_df = combined_df,
    encoded_df = encoded_df,
    predict_date = "12-19-2025",
    baseDir = "pred"
)

runPredictionOnly(
    "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
    combined_df = combined_df,
    encoded_df = encoded_df,
    predict_date = "12-20-2025",
    baseDir = "pred"
)

# runPredictionOnly(
#     "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
#     combined_df = combined_df,
#     encoded_df = encoded_df,
#     predict_date = "12/21/2025",
#     baseDir = "pred"
# )

# runPredictionOnly(
#     "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
#     combined_df = combined_df,
#     encoded_df = encoded_df,
#     predict_date = "12/22/2025",
#     baseDir = "pred"
# )

In [None]:
# import os
# import json
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from tensorflow.keras import layers, models
# from sklearn.model_selection import train_test_split

# ####################################################################################################

# def export_df(dataframes, dir):
#     for name, df in dataframes.items():
#         csv_path = os.path.join(dir, f"{name}.csv")
#         df.to_csv(csv_path, index=True)
# ##################################################################################

# def save_experiment(model, history, dataframes, build_params, train_params, feature_cols, item_id_to_idx, base_dir):
#     name_parts = []

#     if "embedding_dim" in build_params:
#         name_parts.append(f"emb{build_params['embedding_dim']}")

#     if "hiddenLayers" in build_params:
#         hl = "-".join(str(x) for x in build_params["hiddenLayers"])
#         name_parts.append(f"hl{hl}")

#     if "epochs" in train_params:
#         name_parts.append(f"ep{train_params['epochs']}")

#     exp_name = "__".join(name_parts) if name_parts else "exp_unlabeled"
#     exp_dir = os.path.join(base_dir, exp_name)
#     os.makedirs(exp_dir, exist_ok=True)

#     export_df(dataframes, exp_dir)

#     model.save(os.path.join(exp_dir, "model"))
#     model.save_weights(os.path.join(exp_dir, "weights.h5"))

#     history_path = os.path.join(exp_dir, "history.json")
#     history_file = open(history_path, "w")
#     json.dump(history.history, history_file, indent=2)
#     history_file.close()

#     feature_path = os.path.join(exp_dir, "feature_cols.json")
#     feature_file = open(feature_path, "w")
#     json.dump(feature_cols, feature_file, indent=2)
#     feature_file.close()

#     item_map_path = os.path.join(exp_dir, "item_id_to_idx.json")
#     item_map_file = open(item_map_path, "w")
#     json.dump({str(int(k)): int(v) for k, v in item_id_to_idx.items()}, item_map_file, indent=2)
#     item_map_file.close()

#     build_params_path = os.path.join(exp_dir, "build_params.json")
#     build_params_file = open(build_params_path, "w")
#     json.dump(build_params, build_params_file, indent=2)
#     build_params_file.close()

#     train_params_path = os.path.join(exp_dir, "train_params.json")
#     train_params_file = open(train_params_path, "w")
#     json.dump(train_params, train_params_file, indent=2)
#     train_params_file.close()

#     print("Saved experiment →", exp_dir)
# ##########################################################################################

# def build_and_compile_model(featColsCount, itemCount, params):
#     num_in = layers.Input(shape=(featColsCount,))
#     item_in = layers.Input(shape=(), dtype="int32")

#     emb = layers.Embedding(
#         input_dim=itemCount,
#         output_dim=params["embedding_dim"]
#     )(item_in)

#     x = layers.Concatenate()([num_in, layers.Flatten()(emb)])

#     for units in params["hiddenLayers"]:
#         x = layers.Dense(units, activation="relu")(x)

#     out = layers.Dense(
#         1,
#         activation=params.get("output_activation", "sigmoid")
#     )(x)

#     model = models.Model([num_in, item_in], out)

#     optimizer = tf.keras.optimizers.Adam(
#         learning_rate=params.get("learning_rate", 0.001)
#     )

#     model.compile(
#         optimizer=optimizer,
#         loss=params.get("loss", "mse"),
#         metrics=params.get("metrics", ["mae"])
#     )

#     return model
# ##########################################################################################

# def get_feature_cols(encoded_df):
#     feature_cols = []
#     for c in encoded_df.columns:
#         if c == "due_score":
#             continue
#         if c.endswith("_norm") or c.endswith("_sin") or c.endswith("_cos"):
#             feature_cols.append(c)
#     return feature_cols
# ##########################################################################################

# def train_model(model, encoded_df, feature_cols, params):

#     featureCols = encoded_df[feature_cols].to_numpy(np.float32)
#     indexCol = encoded_df["itemIdx"].to_numpy(np.int32)
#     targetVar = encoded_df["due_score"].to_numpy(np.float32)

#     featureCols_tr, featureCols_te, indexCol_tr, indexCol_te, targetVar_tr, targetVar_te = train_test_split(
#         featureCols, indexCol, targetVar, test_size=0.2, random_state=42
#     )

#     history = model.fit(
#         [featureCols_tr, indexCol_tr],
#         targetVar_tr,
#         validation_split=params["validation_split"],
#         epochs=params["epochs"],
#         batch_size=params["batch_size"],
#         verbose=1
#     )

#     return history
# ##########################################################################################

# def build_prediction_df(encoded_df, combined_df, predict_date):

#     if predict_date is None:
#         predict_date = pd.Timestamp.today().normalize()
#     else:
#         predict_date = pd.to_datetime(predict_date).normalize()

#     last_trip_date_by_item = (combined_df.sort_values("date").groupby("itemId")["date"].last())

#     last_purchase_date_by_item = (combined_df[combined_df["didBuy"] == 1].sort_values("date").groupby("itemId")["date"].last() )

#     item_lookup = (combined_df[["itemId", "item"]].drop_duplicates().set_index("itemId")["item"].to_dict())

#     rows = []

#     for itemId, hist in encoded_df.groupby("itemId"):
#         last = hist.iloc[-1].copy()
#         row = last.to_dict()

#         row["itemId"] = itemId
#         row["item"] = item_lookup.get(itemId, "UNKNOWN")

#         last_trip_date = pd.to_datetime(last_trip_date_by_item.loc[itemId]).normalize()
#         row["daysSinceLastTrip_norm"] = (predict_date - last_trip_date).days

#         if itemId in last_purchase_date_by_item.index:
#             last_purchase_date = pd.to_datetime(last_purchase_date_by_item.loc[itemId]).normalize()
#             row["daysSinceLastPurchase_norm"] = (predict_date - last_purchase_date).days

#         row["daysUntilNextHoliday_norm"] = HolidayFeatures.daysUntilNextHoliday(predict_date)
#         row["daysSinceLastHoliday_norm"] = HolidayFeatures.daysSinceLastHoliday(predict_date)
#         row["holidayProximityIndex_norm"] = HolidayFeatures.holidayProximityIndex(predict_date)
#         row["daysUntilSchoolStart_norm"] = HolidayFeatures.daysUntilSchoolStart(predict_date)
#         row["daysUntilSchoolEnd_norm"] = HolidayFeatures.daysUntilSchoolEnd(predict_date)
#         row["schoolSeasonIndex_norm"] = HolidayFeatures.schoolSeasonIndex(predict_date)

#         row["year_norm"] = float(predict_date.year)
#         row["day_norm"] = float(predict_date.day)
#         row["quarter_norm"] = float(predict_date.quarter)

#         row["dow_sin"], row["dow_cos"] = TemporalFeatures.encode_sin_cos(predict_date.weekday(), 7)
#         row["month_sin"], row["month_cos"] = TemporalFeatures.encode_sin_cos(predict_date.month, 12)
#         row["doy_sin"], row["doy_cos"] = TemporalFeatures.encode_sin_cos(predict_date.dayofyear, 365)

#         rows.append(row)

#     return pd.DataFrame(rows)
# ##########################################################################################

# def run_predictions(model, encoded_df, combined_df, feature_cols, predict_date=None):

#     pred_df = build_prediction_df(encoded_df, combined_df, predict_date)

#     featureCols = pred_df[feature_cols].to_numpy(np.float32)
#     indexCol = pred_df["itemIdx"].to_numpy(np.int32)

#     scores = model.predict([featureCols, indexCol], verbose=0).ravel()
#     pred_df["due_intensity"] = scores

#     return (
#         pred_df[["itemId", "item", "due_intensity"]]
#         .sort_values("due_intensity", ascending=False)
#         .reset_index(drop=True)
#     )
# ##########################################################################################

# def runExp(feature_stats, combined_df, encoded_df, buildParams, trainParams, baseDir, tripDate=None):
#     #
#     item_ids = sorted(encoded_df["itemId"].unique())
#     item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
#     encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
#     itemCount = len(item_ids)
#     #
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]
#     featColsCount = len(numeric_cols)
#     #
#     model = build_and_compile_model(featColsCount, itemCount, buildParams)
#     #
#     history = train_model(model, encoded_df, trainParams)
#     #
#     predictions = run_predictions(
#         model,
#         encoded_df,
#         combined_df,
#         feature_stats,
#         predict_date=tripDate
#     )
#     #
#     dataframes = {
#         "predictions": predictions,
#         "encoded_features": encoded_df,
#         "combined_df": combined_df
#     }
#     save_experiment(model,history, dataframes, buildParams,trainParams,
#         numeric_cols,
#         item_id_to_idx,
#         base_dir=baseDir
#     )
# ####################################################################################################

# OLD OLD OLD OLD

In [None]:
# # ---------------- ENTRY POINT ----------------
# trainParams = {
#     "loss": "mse",
#     "optimizer": "adam",
#     "learning_rate": 0.0001,
#     "metrics": ["mae"],
#     "epochs": 40,
#     "batch_size": 32,
#     "validation_split": 0.1
# }

# buildParams_neurons_sigmoid = {
#     "embedding_dim": 32,
#     "hiddenLayers": [1],
#     "output_activation": "sigmoid"
# }

# paramSets = HiddenLayerParamSetBuilder.BuildHiddenLayerSizeSets(buildParams_neurons_sigmoid, 10, 5, 512)

# for eachBuildParams in paramSets:
#     print(f"{eachBuildParams['hiddenLayers']}")
#     runExp(combined_df, encoded_df, eachBuildParams, trainParams, "exp/keras/nuerons_sizes")


In [None]:
# feature_stats = {}
# RECOMPUTED = [
#     "daysSinceLastTrip",
#     "avgDaysBetweenTrips",
#     "daysUntilNextHoliday",
#     "daysSinceLastHoliday",
#     "holidayProximityIndex",
#     "daysUntilSchoolStart",
#     "daysUntilSchoolEnd",
#     "schoolSeasonIndex",
#     "year", "day", "quarter"
# ]

# for raw in RECOMPUTED:
#     col = raw + "_norm"
#     if col in encoded_df.columns:
#         std = encoded_df[col].std()
#         feature_stats[raw] = {
#             "mean": encoded_df[col].mean(),
#             "std": std if std != 0 else 1.0
#         }


In [None]:
# def export_df(dataframes, dir):
#     for name, df in dataframes.items():
#         csv_path = os.path.join(dir, f"{name}.csv")
#         df.to_csv(csv_path, index=True)
# ##################################################################################

# def save_experiment( model, history,  dataframes,  build_params, train_params, numeric_cols,item_id_to_idx,base_dir):
#     name_parts = []

#     if "embedding_dim" in build_params:
#         name_parts.append(f"emb{build_params['embedding_dim']}")

#     if "hiddenLayers" in build_params:
#         hl = "-".join(str(x) for x in build_params["hiddenLayers"])
#         name_parts.append(f"hl{hl}")

#     if "epochs" in train_params:
#         name_parts.append(f"ep{train_params['epochs']}")

#     exp_name = "__".join(name_parts) if name_parts else "exp_unlabeled"
#     exp_dir = os.path.join(base_dir, exp_name)
#     os.makedirs(exp_dir, exist_ok=True)

#     export_df(dataframes, exp_dir)
#     # ------------------------------------------------------------
#     # Save model
#     # ------------------------------------------------------------
#     model.save(os.path.join(exp_dir, "model"))
#     model.save_weights(os.path.join(exp_dir, "weights.h5"))

#     # ------------------------------------------------------------
#     # Save history
#     # ------------------------------------------------------------
#     history_path = os.path.join(exp_dir, "history.json")
#     history_file = open(history_path, "w")
#     json.dump(history.history, history_file, indent=2)
#     history_file.close()

#     # ------------------------------------------------------------
#     # Save numeric features
#     # ------------------------------------------------------------
#     numeric_path = os.path.join(exp_dir, "numeric_features.json")
#     numeric_file = open(numeric_path, "w")
#     json.dump(numeric_cols, numeric_file, indent=2)
#     numeric_file.close()

#     # ------------------------------------------------------------
#     # Save item index mapping
#     # ------------------------------------------------------------
#     item_map_path = os.path.join(exp_dir, "item_id_to_idx.json")
#     item_map_file = open(item_map_path, "w")
#     json.dump(
#         {str(int(k)): int(v) for k, v in item_id_to_idx.items()},
#         item_map_file,
#         indent=2
#     )
#     item_map_file.close()

#     # ------------------------------------------------------------
#     # Save params
#     # ------------------------------------------------------------
#     build_params_path = os.path.join(exp_dir, "build_params.json")
#     build_params_file = open(build_params_path, "w")
#     json.dump(build_params, build_params_file, indent=2)
#     build_params_file.close()

#     train_params_path = os.path.join(exp_dir, "train_params.json")
#     train_params_file = open(train_params_path, "w")
#     json.dump(train_params, train_params_file, indent=2)
#     train_params_file.close()

#     # ------------------------------------------------------------
#     # Save predictions
#     # ------------------------------------------------------------
#     ##predictions.to_csv(os.path.join(exp_dir, "predictions.csv"), index=False)

#     print("Saved experiment →", exp_dir)
# ##########################################################################################

# def build_and_compile_model(featColsCount, itemCount, params):
#     num_in = layers.Input(shape=(featColsCount,))
#     item_in = layers.Input(shape=(), dtype="int32")

#     emb = layers.Embedding(
#         input_dim=itemCount,
#         output_dim=params["embedding_dim"]
#     )(item_in)

#     x = layers.Concatenate()([num_in, layers.Flatten()(emb)])

#     for units in params["hiddenLayers"]:
#         x = layers.Dense(units, activation="relu")(x)

#     out = layers.Dense(
#         1,
#         activation=params.get("output_activation", "sigmoid")
#     )(x)

#     model = models.Model([num_in, item_in], out)

#     optimizer_name = params.get("optimizer", "adam")
#     learning_rate = params.get("learning_rate", 0.001)

#     if optimizer_name == "adam":
#         optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     else:
#         optimizer = optimizer_name

#     model.compile(
#         optimizer=optimizer,
#         loss=params.get("loss", "mse"),
#         metrics=params.get("metrics", ["mae"])
#     )

#     return model
# ##########################################################################################

# def train_model(model, encoded_df, params):
   
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]

#     featureCols = encoded_df[numeric_cols].to_numpy(np.float32)
#     indexCol = encoded_df["itemIdx"].to_numpy(np.int32)
#     targetVar  = encoded_df["due_score"].to_numpy(np.float32)

#     featuresCols_train, featuresCols_test, indexCol_train, indexCol_test, targetVar_tr, targetVar_te = train_test_split(featureCols, indexCol, targetVar, test_size=0.2, random_state=42)

#     history = model.fit(
#         [featuresCols_train, indexCol_train],
#         targetVar_tr,
#         validation_split=0.1,
#         epochs=params["epochs"],
#         batch_size=32,
#         verbose=1
#     )

#     return history
# ##########################################################################################

# def build_prediction_df(encoded_df, combined_df, feature_stats, predict_date):

#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]

#     last_trip_date_by_item = (combined_df.sort_values("date").groupby("itemId")["date"].last())

#     last_purchase_date_by_item = (combined_df[combined_df["didBuy"] == 1].sort_values("date").groupby("itemId")["date"].last())

#     item_lookup = (combined_df[["itemId", "item"]].drop_duplicates().set_index("itemId")["item"].to_dict())

#     rows = []

#     for itemId, hist in encoded_df.groupby("itemId"):
#         last = hist.iloc[-1]

#         last_trip_date = pd.to_datetime(
#             last_trip_date_by_item.loc[itemId]
#         ).normalize()

#         last_purchase_date = pd.to_datetime(
#             last_purchase_date_by_item.loc[itemId]
#         ).normalize()

#         row = {
#             "itemId": itemId,
#             "item": item_lookup.get(itemId, "UNKNOWN"),
#             "itemIdx": int(last["itemIdx"])
#         }

#         # carry-forward observed features
#         for col in numeric_cols:
#             if (
#                 "temp_" in col or
#                 "feelsLike_" in col or
#                 "dew_" in col or
#                 "humidity_" in col or
#                 "precip_" in col or
#                 "freq_" in col or
#                 "habit" in col or
#                 "avgDaysBetweenPurchases" in col
#             ):
#                 row[col] = last[col]

#         raw_updates = {
#             "daysSinceLastTrip": (predict_date - last_trip_date).days,
#             "daysSinceLastPurchase": (predict_date - last_purchase_date).days,
#             "daysUntilNextHoliday": HolidayFeatures.daysUntilNextHoliday(predict_date),
#             "daysSinceLastHoliday": HolidayFeatures.daysSinceLastHoliday(predict_date),
#             "holidayProximityIndex": HolidayFeatures.holidayProximityIndex(predict_date),
#             "daysUntilSchoolStart": HolidayFeatures.daysUntilSchoolStart(predict_date),
#             "daysUntilSchoolEnd": HolidayFeatures.daysUntilSchoolEnd(predict_date),
#             "schoolSeasonIndex": HolidayFeatures.schoolSeasonIndex(predict_date),
#             "year": predict_date.year,
#             "day": predict_date.day,
#             "quarter": predict_date.quarter,
#             "dow_sin": np.sin(2 * np.pi * predict_date.weekday() / 7),
#             "dow_cos": np.cos(2 * np.pi * predict_date.weekday() / 7),
#             "month_sin": np.sin(2 * np.pi * predict_date.month / 12),
#             "month_cos": np.cos(2 * np.pi * predict_date.month / 12),
#             "doy_sin": np.sin(2 * np.pi * predict_date.dayofyear / 365),
#             "doy_cos": np.cos(2 * np.pi * predict_date.dayofyear / 365),
#         }

#         for raw, val in raw_updates.items():
#             norm_col = raw + "_norm"
#             if norm_col in numeric_cols and raw in feature_stats:
#                 stats = feature_stats[raw]
#                 row[norm_col] = (val - stats["mean"]) / stats["std"]

#         rows.append(row)

#     return pd.DataFrame(rows)
#     ######################################################################################
    
# def run_predictions(model, encoded_df, combined_df, feature_stats, predict_date=None):

#         if predict_date is None:
#             predict_date = pd.Timestamp.today().normalize()
#         else:
#             predict_date = pd.to_datetime(predict_date).normalize()
    
#         pred_df = build_prediction_df(
#             encoded_df, combined_df, feature_stats, predict_date
#         )
    
#         numeric_cols = [
#             c for c in pred_df.columns
#             if c.endswith("_norm")
#         ]
    
#         featureCols = pred_df[numeric_cols].to_numpy(np.float32)
#         indexCol = pred_df["itemIdx"].to_numpy(np.int32)
    
#         scores = model.predict([featureCols, indexCol], verbose=0).ravel()
#         pred_df["due_intensity"] = scores
    
#         return (
#             pred_df[["itemId", "item", "due_intensity"]]
#             .sort_values("due_intensity", ascending=False)
#             .reset_index(drop=True)
#         )
# #####################################################################################


# def BuildParamSets( baseline_params, property_name, start, step, stop):
#     """
#     Creates multiple fully independent parameter dictionaries by varying one property.
#     Each iteration produces a brand-new baseline object.
#     """
#     import copy
#     results = []

#     value = start
#     while value <= stop:
#         params_copy = copy.deepcopy(baseline_params)
#         params_copy[property_name] = value
#         results.append(params_copy)
#         value += step

#     return results
# ###############################################################################

# def runExp(feature_stats, combined_df, encoded_df, buildParams, trainParams, baseDir):
#     #
#     # item index
#     item_ids = sorted(encoded_df["itemId"].unique())
#     item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
#     encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
#     itemCount = len(item_ids)
#     #
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]
#     featColsCount = len(numeric_cols)
#     #
#     model = build_and_compile_model(featColsCount, itemCount, buildParams)
#     #
#     history = train_model(model, encoded_df, trainParams)
#     #
              
#     predictions = run_predictions(model, encoded_df, combined_df, feature_stats)
#     # 
#     dataframes = {
#         "predictions": predictions,
#         "encoded_features": encoded_df,
#         "combined_df": combined_df
#     }
#     save_experiment(model, history, dataframes, buildParams, trainParams, numeric_cols, item_id_to_idx, base_dir= baseDir)
# ################################################################################################################################

# import multiprocessing as mp
# def run_param_sets_multiprocess(buildParamsSets, trainParams, max_parallel, feature_stats, combined_df,encoded_df, baseDir ):
#     #
#     processes = []

#     for buildParams in buildParamsSets:
#         p = mp.Process(
#             target=runExp,
#             args=(feature_stats, combined_df, encoded_df, buildParams, trainParams, baseDir)
#         )
#         p.start()
#         processes.append(p)

#         # limit concurrency
#         if len(processes) >= max_parallel:
#             for proc in processes:
#                 proc.join()
#             processes = []

#     # wait for remaining
#     for proc in processes:
#         proc.join()
# ################################################################################################################################

In [None]:
# buildParams_embeddingsTest = {
#     "embedding_dim": 1,
#     "hiddenLayers": [512],
#     "output_activation": "sigmoid"
# }

# # buildParams_embeddingsTest_relu = {
# #     "embedding_dim": 1,
# #     "hiddenLayers": [1024],
# #     "output_activation": "relu"
# # }


# trainParams = {
#     "loss": "mse",
#     "optimizer": "adam",
#     "learning_rate": 0.0001,
#     "metrics": ["mae"],
#     "epochs": 40,
#     "batch_size": 32,
#     "validation_split": 0.1
# }

# # build sets
# paramSets = BuildParamSets(buildParams_embeddingsTest, "embedding_dim", 33, 2, 64)
# # run
# run_param_sets_multiprocess(paramSets, trainParams, 4, feature_stats, combined_df,encoded_df, "exp_mp")
# #paramSets_embeddingeTest_relu = BuildParamSets(buildParams_embeddingsTest_relu, "embedding_dim", 1, 2, 32)

    

In [None]:



# def run_predictions( model, encoded_df, combined_df, feature_stats, predict_date=None):
#     """
#     Build one prediction row per item using:
#     - latest encoded feature state (encoded_df)
#     - raw timeline + names (combined_df)
#     - recomputed calendar features at predict_date
#     """

#     if predict_date is None:
#         predict_date = pd.Timestamp.today().normalize()
#     else:
#         predict_date = pd.to_datetime(predict_date).normalize()

#     # --------------------------------------------------------
#     # Discover numeric features (single source: encoded_df)
#     # --------------------------------------------------------
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]

#     # --------------------------------------------------------
#     # Lookups from combined_df (single source of truth)
#     # --------------------------------------------------------
#     last_date_by_item = (
#         combined_df
#         .sort_values("date")
#         .groupby("itemId")["date"]
#         .last()
#     )

#     item_lookup = (
#         combined_df[["itemId", "item"]]
#         .drop_duplicates()
#         .set_index("itemId")["item"]
#         .to_dict()
#     )

#     rows = []

#     for itemId, hist in encoded_df.groupby("itemId"):
#         last = hist.iloc[-1]
#         last_date = pd.to_datetime(last_date_by_item.loc[itemId]).normalize()

#         row = {
#             "itemId": itemId,
#             "item": item_lookup.get(itemId, "UNKNOWN"),
#             "itemIdx": int(last["itemIdx"])
#         }

#         # ----------------------------------------------------
#         # Copy model-stable numeric features (already normalized)
#         # ----------------------------------------------------
#         for col in numeric_cols:
#             row[col] = last[col]

#         # ----------------------------------------------------
#         # Recompute DATE-SENSITIVE features
#         # ----------------------------------------------------
#         raw_updates = {
#             "daysSinceLastPurchase": (predict_date - last_date).days,
#             "daysUntilNextHoliday": HolidayFeatures.daysUntilNextHoliday(predict_date),
#             "daysSinceLastHoliday": HolidayFeatures.daysSinceLastHoliday(predict_date),
#             "holidayProximityIndex": HolidayFeatures.holidayProximityIndex(predict_date),
#             "daysUntilSchoolStart": HolidayFeatures.daysUntilSchoolStart(predict_date),
#             "daysUntilSchoolEnd": HolidayFeatures.daysUntilSchoolEnd(predict_date),
#             "schoolSeasonIndex": HolidayFeatures.schoolSeasonIndex(predict_date),
#             "year": predict_date.year,
#             "day": predict_date.day,
#             "quarter": predict_date.quarter
#         }

  
#         # ----------------------------------------------------
#         # Normalize recomputed features
#         # ----------------------------------------------------
#         for raw, val in raw_updates.items():
#             norm_col = raw + "_norm"
#             if norm_col in numeric_cols and raw in feature_stats:
#                 stats = feature_stats[raw]
#                 row[norm_col] = (val - stats["mean"]) / stats["std"]

#         rows.append(row)

#     pred_df = pd.DataFrame(rows)

#     Xn = pred_df[numeric_cols].to_numpy(np.float32)
#     Xi = pred_df["itemIdx"].to_numpy(np.int32)

#     scores = model.predict([Xn, Xi], verbose=0).ravel()

#     pred_df["due_intensity"] = scores

#     return (
#         pred_df[["itemId", "item", "due_intensity"]]
#         .sort_values("due_intensity", ascending=False)
#         .reset_index(drop=True)
#      )
# ###############################################################################



# tf.keras.backend.clear_session()


# # ------------------------------------------------------------
# # ENSURE itemIdx
# # ------------------------------------------------------------
# item_ids = sorted(encoded_df["itemId"].unique())
# item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
# encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
# NUM_ITEMS = len(item_ids)

# # ------------------------------------------------------------
# # FEATURES / TARGET
# # ------------------------------------------------------------
# numeric_cols = [
#     c for c in encoded_df.columns
#     if c.endswith("_norm") and c != "due_score"
# ]

# Xn = encoded_df[numeric_cols].to_numpy(np.float32)
# Xi = encoded_df["itemIdx"].to_numpy(np.int32)
# y  = encoded_df["due_score"].to_numpy(np.float32)

# # ------------------------------------------------------------
# # SPLIT
# # ------------------------------------------------------------
# Xn_tr, Xn_te, Xi_tr, Xi_te, y_tr, y_te = train_test_split(
#     Xn, Xi, y, test_size=0.2, random_state=42
# )

# # ------------------------------------------------------------
# # MODEL
# # ------------------------------------------------------------
# num_in = layers.Input(shape=(Xn_tr.shape[1],))
# itm_in = layers.Input(shape=(), dtype="int32")

# emb = layers.Embedding(NUM_ITEMS, 64)(itm_in)
# emb = layers.Flatten()(emb)

# x = layers.Concatenate()([num_in, emb])
# x = layers.Dense(4096, activation="relu")(x)
# #x = layers.Dense(2048, activation="relu")(x)
# out = layers.Dense(1, activation="sigmoid")(x)

# model = models.Model([num_in, itm_in], out)
# model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss="mse", metrics=["mae"])

# history = model.fit(
#     [Xn_tr, Xi_tr],
#     y_tr,
#     validation_split=0.1,
#     epochs=10,
#     batch_size=32,
#     verbose=1
# )

# # ------------------------------------------------------------
# # FEATURE STATS (ONLY recomputed features)
# # (NOTE: stats are for *_norm columns since inference writes *_norm)
# # ------------------------------------------------------------
# feature_stats = {}
# RECOMPUTED = [
#     "daysSinceLastPurchase",
#     "daysUntilNextHoliday",
#     "daysSinceLastHoliday",
#     "holidayProximityIndex",
#     "daysUntilSchoolStart",
#     "daysUntilSchoolEnd",
#     "schoolSeasonIndex",
#     "year", "day", "quarter",
#     "daysUntilBirthday_steve", "daysSinceBirthday_steve",
#     "daysUntilBirthday_maggie", "daysSinceBirthday_maggie",
#     "daysUntilBirthday_mil", "daysSinceBirthday_mil",
#     "daysUntilBirthday_angie", "daysSinceBirthday_angie",
# ]

# for raw in RECOMPUTED:
#     col = raw + "_norm"
#     if col in encoded_df.columns:
#         std = encoded_df[col].std()
#         feature_stats[raw] = {
#             "mean": encoded_df[col].mean(),
#             "std": std if std != 0 else 1.0
#         }

# # ------------------------------------------------------------
# # BIRTHDAYS
# # ------------------------------------------------------------
# BIRTHDAYS = { "steve":  "03-05-1980", "maggie": "03-03-2016","mil": "01-27-1962", "angie":  "08-11-1981"}
# birthdays = {k: pd.to_datetime(v) for k, v in BIRTHDAYS.items()}

# # ------------------------------------------------------------
# # PREDICT (UPDATED CALL)
# # ------------------------------------------------------------
# predictions = run_predictions(model=model, encoded_df=encoded_df, combined_df=combined_df, feature_stats=feature_stats, birthdays=birthdays, predict_date=None)

# # ------------------------------------------------------------
# # SAVE
# # ------------------------------------------------------------
# save_experiment( model=model, history=history, predictions=predictions, params={}, numeric_cols=numeric_cols, item_id_to_idx=item_id_to_idx)

# predictions.head(50)
