In [1]:


import pandas as pd
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import os
from datetime import datetime
import asyncio
import json

import gc
import tensorflow as tf
from tensorflow.keras import layers, models

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from temporal_features import TemporalFeatures
from holiday_features import HolidayFeatures
from wallmart_rcpt_parser import WallmartRecptParser
from winn_dixie_recpt_parser import WinnDixieRecptParser 
from hidden_layer_param_builder import HiddenLayerParamSetBuilder
# The selector event-loop policy class only exists in Windows builds of asyncio;
# guard the call so the notebook also starts on Linux/macOS kernels instead of
# failing with AttributeError.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Notebook-wide pandas display settings: no truncation of rows, columns or cell
# contents, floats rendered with six decimals, and a very wide console.
_DISPLAY_OPTIONS = {
    "display.max_rows": None,
    "display.max_colwidth": None,
    "display.float_format": lambda x: f"{x:.6f}",
    "display.max_columns": None,
    "display.width": 2000,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Report where the kernel is running and which GPUs TensorFlow can see.
print(os.getcwd())
print("GPUs Available:", tf.config.list_physical_devices('GPU'))
#tf.debugging.set_log_device_placement(True)

C:\Users\steve\source\repos\grocery-ml
GPUs Available: []


In [2]:

def export_df_to_excel_table(df, file_path, sheet_name="Data"):
    """
    Export a pandas DataFrame to an Excel file as a proper Excel Table
    with no duplicated header rows.

    The sheet is written and decorated in a single pass through one
    ``pd.ExcelWriter`` (openpyxl engine), so the workbook is saved once instead
    of being written, re-read and written again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; the index is not written.
    file_path : str or path-like
        Destination ``.xlsx`` file; an existing file is overwritten.
    sheet_name : str, default "Data"
        Name of the worksheet that receives the data.

    Notes
    -----
    The table is always named ``DataTable``. For an empty frame the sheet is
    still written but no table is attached, because a table reference covering
    only the header row is not a usable Excel table.
    NOTE(review): openpyxl expects table header cells to be strings; frames
    with non-string column labels may need renaming first - confirm.
    """
    from openpyxl.utils import get_column_letter
    from openpyxl.worksheet.table import Table, TableStyleInfo

    with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)
        worksheet = writer.sheets[sheet_name]

        # Only attach a table when there is at least one data row below the header.
        if not df.empty:
            end_row = worksheet.max_row
            end_col_letter = get_column_letter(worksheet.max_column)

            table = Table(displayName="DataTable", ref=f"A1:{end_col_letter}{end_row}")
            table.tableStyleInfo = TableStyleInfo(
                name="TableStyleMedium9",
                showFirstColumn=False,
                showLastColumn=False,
                showRowStripes=True,
                showColumnStripes=False
            )
            worksheet.add_table(table)

    ###########################################################################################




def normalizeAndDropCols(df, cols):
    """
    Z-score the given columns and replace each with a ``<col>_norm`` column.

    For every column in ``cols`` the sentinel value 999 is treated as missing,
    mean and standard deviation are computed over the remaining values, and
    the standardised result is stored as ``col + "_norm"``. Missing values
    become 0.0 (the post-normalisation mean), and a zero or undefined standard
    deviation falls back to 1.0 so constant columns map to 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is NOT modified; the previous implementation wrote
        the sentinel replacement and the ``_norm`` columns into the caller's
        frame as a side effect.
    cols : iterable of str
        Names of the columns to normalise and drop.

    Returns
    -------
    pandas.DataFrame
        New frame without ``cols`` and with the ``_norm`` columns appended in
        the order of ``cols``.
    """
    cols = list(cols)
    result = df.drop(columns=cols)

    for col in cols:
        # Replace the sentinel 999 with NaN so it doesn't distort mean/std
        values = df[col].replace(999, np.nan)

        mean = values.mean()
        std = values.std()
        # `std or 1.0` does not catch NaN (NaN is truthy), so test explicitly.
        if not np.isfinite(std) or std == 0:
            std = 1.0

        # After normalization: missing values become 0 (neutral)
        result[col + "_norm"] = ((values - mean) / std).fillna(0.0)

    return result


#def normalizeAndDropCols(df, cols):
#    for col in cols:
#        std = df[col].std() or 1.0
#        df[col + "_norm"] = (df[col] - df[col].mean()) / std
#    return df.drop(columns=cols)



def canonicalize_items(df, patterns, canonical_name):
    """
    Collapse item-name variants onto one canonical name, in place.

    Each entry of ``patterns`` is matched case-insensitively against the
    ``item`` column with ``Series.str.contains`` (pandas' default regex
    matching; missing items never match). Every matching row has its ``item``
    overwritten with ``canonical_name``. The frame is modified directly and
    nothing is returned.
    """
    for needle in patterns:
        matching_rows = df["item"].str.contains(needle, case=False, na=False)
        df.loc[matching_rows, "item"] = canonical_name


In [3]:
# --- WEATHER PREP ---
weatherCols = ["datetime", "temp", "humidity", "feelslike", "dew", "precip"]

# Load the observations and order them chronologically so the rolling windows
# below look backwards in time.
df_weather = pd.read_csv("datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv", usecols=weatherCols)
df_weather = (
    df_weather
    .assign(datetime=pd.to_datetime(df_weather["datetime"]))
    .set_index("datetime")
    .sort_index()
)

# Raw measurement -> name of its trailing 5-row mean feature (the first rows
# average over however many observations are available).
_weather_feature_names = {
    "temp": "temp_5day_avg_feat",
    "feelslike": "feelsLike_5day_avg_feat",
    "dew": "dew_5day_avg_feat",
    "humidity": "humidity_5day_avg_feat",
    "precip": "precip_5day_avg_feat",
}
df_weather = (
    df_weather[list(_weather_feature_names)]
    .rolling(5, min_periods=1)
    .mean()
    .rename(columns=_weather_feature_names)
)

# Re-key on a midnight-normalised "date" index so the frame can be merged
# with the per-trip table on "date".
df_weather.index = pd.to_datetime(df_weather.index.date)
df_weather.index.name = "date"


In [4]:
recptParser = WinnDixieRecptParser()

# Flatten every parsed receipt text file into one row per purchased item.
# Only source/date/time/item are kept; the parser result also offers
# manager, cashier, qty, reg, youPay, reported and validation fields.
rows = []
for p in Path("winndixie rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    rows.extend(
        {
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            "item": r["item"],
        }
        for r in result["items"]
    )

winndixie_df = pd.DataFrame(rows)
winndixie_df = winndixie_df.assign(
    date=pd.to_datetime(winndixie_df["date"]),
    time=winndixie_df["time"].astype(str),
)

# Drop receipts that were photographed more than once (project helper).
winndixie_df = WinnDixieRecptParser.remove_duplicate_receipt_files(winndixie_df)

# "time" is only needed for de-duplication and ordering; remove it afterwards.
winndixie_df = (
    winndixie_df
    .sort_values(by=["date", "time"])
    .reset_index(drop=True)
    .drop(columns=["time"])
)

DUP: 2025-08-02 00:00:00 10:29 PM → keep IMG_9693.txt ← drop IMG_9694.txt
DUP: 2025-10-07 00:00:00 6:06 PM → keep IMG_0017.txt ← drop IMG_9669.txt
DUP: 2025-10-14 00:00:00 4:06 PM → keep IMG_0015.txt ← drop IMG_9667.txt
DUP: 2025-10-14 00:00:00 6:08 PM → keep IMG_0014.txt ← drop IMG_9666.txt
DUP: 2025-10-17 00:00:00 9:18 PM → keep IMG_0013.txt ← drop IMG_9664.txt


In [5]:

wallmart_raw = WallmartRecptParser.ImportWallMart("./walmart")

# Keep only the order date, product description and source file, renamed to
# the column names used by the Winn-Dixie frame.
wallmart_df = (
    wallmart_raw[["Order Date", "Product Description", "source"]]
    .rename(columns={"Order Date": "date", "Product Description": "item"})
)

# Make sure both stores use datetime64 dates before stacking them.
wallmart_df["date"] = pd.to_datetime(wallmart_df["date"])
winndixie_df["date"] = pd.to_datetime(winndixie_df["date"])

combined_df = pd.concat(
    [winndixie_df, wallmart_df[["date", "item", "source"]]],
    ignore_index=True,
)

# Strip a leading hyphen / en dash / em dash (with surrounding whitespace)
# from item names, then trim what is left.
_item_names = combined_df["item"].str.replace(r"^\s*[-–—]\s*", "", regex=True)
combined_df["item"] = _item_names.str.strip()



In [6]:
# Name fragments that identify each product family.
milk_patterns = ["know-and-love-milk", "kandl-milk", "prairie-farm-milk","kleinpeter-milk", "kl-milk", "Milk, Fat Free,", "Fat-Free Milk"]
bread_patterns = ["bunny-bread","se-grocers-bread","seg-sandwich-bread", "seg-white-bread"]
cheese_patterns = ["dandw-cheese", "kraft-cheese", "se-grocers-cheese", "know-and-love-cheese"]
mayo_patterns = ["blue-plate-mayo", "blue-plate-mynnase"]
chicken_patterns = ["chicken-cutlet", "chicken-leg", "chicken-thigh", "chicken-thighs"]
yogurt_patterns = ["chobani-yogrt-flip", "chobani-yogurt"]
coke_patterns = ["coca-cola", "coca-cola-cola", "cocacola-soda", "coke", "cola"]
hugbi_patterns = ["hugbi-pies", "-hugbi-pies"]
ceralPaterns  = ["ceral"]
minute_maid_patterns = ["minute-maid-drink", "minute-maid-drinks", "minute-maid-lmnade"]
eggs_pattern = ["egglands-best-egg", "egglands-best-eggs", "eggs"]

# (patterns, canonical item name) pairs, applied in this order to combined_df.
_canonical_rules = [
    (milk_patterns, "milk"),
    (bread_patterns, "bread"),
    (cheese_patterns, "cheese"),
    (mayo_patterns, "mayo"),
    (chicken_patterns, "chicken"),
    (yogurt_patterns, "yogurt"),
    (coke_patterns, "coke"),
    (hugbi_patterns, "hugbi-pies"),
    (ceralPaterns, "ceral"),
    (minute_maid_patterns, "minute-maid-drink"),
    (eggs_pattern, "eggs"),
]
for _patterns, _canonical_name in _canonical_rules:
    canonicalize_items(combined_df, _patterns, _canonical_name)



In [7]:
from dataset_utils import DatasetUtils

# DatasetUtils.CreateItemId is a project helper (not shown here); the later cells
# rely on combined_df carrying an integer `itemId` column after this call.
# NOTE(review): id_to_item is presumably the itemId -> item-name lookup - confirm.
combined_df, id_to_item = DatasetUtils.CreateItemId(combined_df)


In [8]:
# ============================================================
# Build full receipt × item table WITHOUT using qty
# ============================================================

# 1. Mark actual purchases in the raw receipt rows
# Every raw receipt row is an actual purchase.
combined_df["didBuy_target"] = 1

# Cartesian grid: every trip date paired with every known item.
all_items = combined_df["itemId"].unique()
all_dates = combined_df["date"].unique()
full = pd.MultiIndex.from_product(
    [all_dates, all_items], names=["date", "itemId"]
).to_frame(index=False)

# Left-join the purchases onto the grid. An item bought several times on one
# date keeps one row per receipt line, so (date, itemId) is not unique.
_purchase_cols = ["date", "itemId", "item", "source", "didBuy_target"]
df_full = pd.merge(full, combined_df[_purchase_cols], how="left", on=["date", "itemId"])

# Grid cells without a matching purchase become explicit negatives.
df_full = df_full.assign(didBuy_target=df_full["didBuy_target"].fillna(0).astype(int))

# From here on combined_df IS the full grid.
combined_df = df_full.copy()

In [9]:
# 1. Build grouped table (one row per trip date)

grouped = (
    combined_df[["date"]]
    .drop_duplicates()
    .sort_values("date")
    .reset_index(drop=True)
)

# Trip cadence features (project helpers operating on the per-trip table).
grouped["daysSinceLastTrip_feat"] = TemporalFeatures.DaysSinceLastTrip(grouped)
grouped["avgDaysBetweenTrips_feat"] = TemporalFeatures.AvgDaysBetweenTrips(grouped)

# Holiday / school calendar features, each computed per date.
_calendar_features = {
    "daysUntilNextHoliday_feat": HolidayFeatures.daysUntilNextHoliday,
    "daysSinceLastHoliday_feat": HolidayFeatures.daysSinceLastHoliday,
    "holidayProximityIndex_feat": HolidayFeatures.holidayProximityIndex,
    "daysUntilSchoolStart_feat": HolidayFeatures.daysUntilSchoolStart,
    "daysUntilSchoolEnd_feat": HolidayFeatures.daysUntilSchoolEnd,
    "schoolSeasonIndex_feat": HolidayFeatures.schoolSeasonIndex,
}
for _feature_name, _feature_fn in _calendar_features.items():
    grouped[_feature_name] = grouped["date"].apply(_feature_fn)

grouped = TemporalFeatures.CreateDateFeatures(grouped)

# Attach the rolling weather averages for each trip date.
grouped = grouped.merge(df_weather, on="date", how="left")

# Broadcast the per-trip features onto every (date, item) row.
combined_df = combined_df.merge(grouped, on="date", how="left")
combined_df.info()
combined_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121240 entries, 0 to 121239
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   date                        121240 non-null  datetime64[ns]
 1   itemId                      121240 non-null  int64         
 2   item                        1436 non-null    object        
 3   source                      1436 non-null    object        
 4   didBuy_target               121240 non-null  int32         
 5   daysSinceLastTrip_feat      121240 non-null  float64       
 6   avgDaysBetweenTrips_feat    121240 non-null  float64       
 7   daysUntilNextHoliday_feat   121240 non-null  int64         
 8   daysSinceLastHoliday_feat   121240 non-null  int64         
 9   holidayProximityIndex_feat  121240 non-null  float64       
 10  daysUntilSchoolStart_feat   121240 non-null  int64         
 11  daysUntilSchoolEnd_feat     121240 non-

Unnamed: 0,date,itemId,item,source,didBuy_target,daysSinceLastTrip_feat,avgDaysBetweenTrips_feat,daysUntilNextHoliday_feat,daysSinceLastHoliday_feat,holidayProximityIndex_feat,daysUntilSchoolStart_feat,daysUntilSchoolEnd_feat,schoolSeasonIndex_feat,year_feat,month_cyc_feat,day_cyc_feat,dow_cyc_feat,doy_feat,quarter_feat,temp_5day_avg_feat,feelsLike_5day_avg_feat,dew_5day_avg_feat,humidity_5day_avg_feat,precip_5day_avg_feat
0,2024-11-15,695,spaghettios-pasta,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
1,2024-11-15,439,coke,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
2,2024-11-15,439,coke,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
3,2024-11-15,548,mandms-candies,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
4,2024-11-15,547,mandarins,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
5,2024-11-15,696,sparkling-ice-wtr,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
6,2024-11-15,696,sparkling-ice-wtr,IMG_9764.txt,1,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
7,2024-11-15,491,,,0,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
8,2024-11-15,551,,,0,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162
9,2024-11-15,411,,,0,29.0,19.764706,13,4,-0.866667,273,197,0.460274,2024,11,15,4,320,4,72.08,72.28,65.26,80.58,0.1162


In [10]:
 def fill_freq(group, windows=None, include_current=False):
        """
        Fill the ``freq_<w>_feat`` columns of one item's rows with trailing
        purchase counts.

        For every row, ``freq_<w>_feat`` is the number of purchase rows
        (``didBuy_target == 1``) of this item dated within the ``w`` days up to
        the row's date (window start inclusive).

        By default only purchases dated strictly BEFORE the row's date are
        counted. The previous implementation appended the row's own purchase
        to the history before counting, so every frequency feature directly
        encoded ``didBuy_target`` of the same row (target leakage) and could
        never match what is known at prediction time.

        Parameters
        ----------
        group : pandas.DataFrame
            Rows of a single item with ``date`` and ``didBuy_target`` columns.
            The input is not modified.
        windows : iterable of int, optional
            Window lengths in days. Defaults to the notebook-level
            ``freq_windows`` list.
        include_current : bool, default False
            When True, purchases dated on the row's own date are counted as
            well (approximates the old behaviour; all same-date purchases are
            included).

        Returns
        -------
        pandas.DataFrame
            Copy of ``group`` sorted by ``date`` with a fresh 0..n-1 index and
            float ``freq_<w>_feat`` columns.
        """
        from bisect import bisect_left, bisect_right

        if windows is None:
            # Fall back to the window list defined at notebook level.
            windows = freq_windows
        windows = list(windows)

        group = group.sort_values("date").reset_index(drop=True)

        row_dates = list(group["date"])
        # Purchase dates inherit the ascending order of the sorted rows, which
        # is what makes the binary searches below valid.
        purchase_dates = [
            day for day, bought in zip(row_dates, group["didBuy_target"]) if bought == 1
        ]

        upper_bound = bisect_right if include_current else bisect_left

        for w in windows:
            span = pd.Timedelta(days=w)
            counts = []
            for cur_date in row_dates:
                hi = upper_bound(purchase_dates, cur_date)
                lo = bisect_left(purchase_dates, cur_date - span)
                # Stored as float to keep the dtype of the NaN-initialised columns.
                counts.append(float(hi - lo))
            group[f"freq_{w}_feat"] = counts

        return group
####################################################################

freq_windows = [7, 15, 30, 90, 365]
max_w = max(freq_windows)

# Pre-create one NaN-filled output column per window so fill_freq can write
# into them.
combined_df = combined_df.assign(**{f"freq_{w}_feat": np.nan for w in freq_windows})

# Compute the windowed purchase counts item by item.
combined_df = combined_df.groupby("itemId", group_keys=False).apply(fill_freq)
###########################

# Short-window over long-window ratios (project helper).
_freq7_over30, _freq30_over365 = TemporalFeatures.compute_freq_ratios(
    combined_df["freq_7_feat"],
    combined_df["freq_30_feat"],
    combined_df["freq_365_feat"],
)
combined_df["freq7_over30_feat"] = _freq7_over30
combined_df["freq30_over365_feat"] = _freq30_over365



  .apply(fill_freq)


In [11]:
combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

# Days since the item's previous purchase row, computed without a Python loop
# over every row (the frame has >100k rows):
#   1. keep the date only on purchase rows,
#   2. shift by one row within each item so a row never sees its own purchase,
#   3. forward-fill so every later row carries the latest earlier purchase date.
_by_item = combined_df["itemId"]
_purchase_dates = combined_df["date"].where(combined_df["didBuy_target"] == 1)
_previous_purchase = (
    _purchase_dates
        .groupby(_by_item).shift()
        .groupby(_by_item).ffill()
)

# Rows before an item's first purchase have no previous date and get 0.
# NOTE(review): 0 is also the value for "bought earlier the same day"; the
# original comment suggested 999 as an alternative sentinel.
combined_df["daysSinceLastPurchase_feat"] = (
    (combined_df["date"] - _previous_purchase).dt.days.fillna(0).astype(float)
)

# Track last purchase date per item (kept for cells that read this lookup)
last_purchase_date = (
    combined_df.loc[combined_df["didBuy_target"] == 1]
        .groupby("itemId")["date"]
        .max()
        .to_dict()
)


####################

# Builds avgDaysBetweenPurchases_feat: for each item, the running mean of the
# day gaps between its purchase rows, carried onto every row of that item.
# Rows must be ordered by item then date for diff()/expanding() to be
# chronological, and the index must be 0..n-1 so the pieces realign.
combined_df = combined_df.sort_values(["itemId", "date"]).reset_index(drop=True)

# Purchase-to-purchase gaps
# .where() blanks every column on non-purchase rows, so diff() only sees the
# dates of purchase rows; the result is the gap in whole days to the item's
# previous purchase row.
# NOTE(review): blanked rows also lose their itemId, so they presumably fall
# outside every group and get NaN here - confirm against pandas behaviour.
purchase_gap = (
    combined_df
        .where(combined_df["didBuy_target"] == 1)
        .groupby("itemId")["date"]
        .diff()
        .dt.days
)

# Expanding (lifetime-so-far) average per item
# Grouped by the ORIGINAL itemId column (not the blanked one); the group-key
# index level added by expanding() is dropped again so the result lines up
# with combined_df's row index.
avg_gap = (
    purchase_gap
        .groupby(combined_df["itemId"])
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
)

# Attach + forward-fill so ALL rows have a value
# Rows that still have no average after the per-item forward fill get 0.
combined_df["avgDaysBetweenPurchases_feat"] = (
    avg_gap
        .groupby(combined_df["itemId"])
        .ffill()
        .fillna(0)
)

#export_df_to_excel_table(combined_df, "combined_df.xlsx", "combined_df") 
##combined_df.to_csv("combined_df.csv")

In [12]:
# ============================================================
# ITEM-LEVEL HABIT FEATURES (TF-IDF ANALOG)
# ============================================================
import numpy as np
import pandas as pd

def build_habit_features(df, tau_days=120):
    """
    Summarise each item's purchase habit over the whole dataset.

    Returns one row per ``itemId`` with:

    * ``habitFrequency_feat`` - purchase rows divided by the number of
      distinct dates in ``df``;
    * ``habitSpan_feat`` - days between the item's first and last purchase,
      divided by the length of the dataset timeline in days (at least 1);
    * ``habitDecay_feat`` - ``exp(-d / tau_days)`` where ``d`` is the number
      of days between the item's last purchase and the latest date in ``df``.

    Items that were never bought get 0.0 for all three. The input frame is
    not modified.
    """
    df = df.assign(date=pd.to_datetime(df["date"]))

    earliest_date = df["date"].min()
    latest_date = df["date"].max()
    total_trips = df["date"].nunique()
    timeline_days = (latest_date - earliest_date).days or 1

    def summarize(item_id, item_rows):
        # Dates on which this item was actually purchased.
        purchase_dates = item_rows.loc[item_rows["didBuy_target"] == 1, "date"]

        if len(purchase_dates) == 0:
            return {
                "itemId": item_id,
                "habitFrequency_feat": 0.0,
                "habitSpan_feat": 0.0,
                "habitDecay_feat": 0.0,
            }

        first_buy = purchase_dates.min()
        last_buy = purchase_dates.max()
        days_since_last = (latest_date - last_buy).days

        return {
            "itemId": item_id,
            "habitFrequency_feat": len(purchase_dates) / total_trips,
            "habitSpan_feat": (last_buy - first_buy).days / timeline_days,
            "habitDecay_feat": np.exp(-days_since_last / tau_days),
        }

    return pd.DataFrame(
        [summarize(item_id, item_rows) for item_id, item_rows in df.groupby("itemId")]
    )
###############################################################################


def compute_due_score(df,itemId=None,use_sigmoid=True,normalize=False, weights=None):
    """
    Add a weighted "due" score to a copy of ``df``.

    The raw score is a linear combination of ``daysSinceLastPurchase_feat``,
    ``freq_30_feat`` and ``freq_90_feat`` and is stored in ``due_score_raw``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three feature columns above (and ``itemId`` when
        ``itemId`` is given). Not modified.
    itemId : optional
        When given, only rows of that item are scored and returned.
    use_sigmoid : bool, default True
        Squash the raw score with a logistic function into ``due_score_feat``.
    normalize : bool, default False
        Only used when ``use_sigmoid`` is False: z-score the raw score into
        ``due_score``. With both flags False, ``due_score`` is the raw score.
    weights : dict, optional
        Weight per feature, keyed by column name (``freq_30_feat``); the
        older un-suffixed keys (``freq_30``) are accepted too. Defaults to
        1.5 / 1.0 / 0.5. The previous defaults used the un-suffixed keys
        while the body looked up the suffixed ones, so calling the function
        without ``weights`` raised ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Copy of the (optionally filtered) input with the score columns added.
        Note the output column is ``due_score_feat`` in sigmoid mode and
        ``due_score`` otherwise.

    Raises
    ------
    KeyError
        If ``weights`` lacks an entry for one of the three features.
    """

    if weights is None:
        weights = {
            "daysSinceLastPurchase_feat": 1.5,
            "freq_30_feat": 1.0,
            "freq_90_feat": 0.5
        }

    def _weight_for(column):
        # Prefer the exact column name, then the legacy key without "_feat".
        if column in weights:
            return weights[column]
        legacy_key = column[: -len("_feat")]
        if legacy_key in weights:
            return weights[legacy_key]
        raise KeyError(column)

    # --------------------------------------------------------
    # Optional itemId filter
    # --------------------------------------------------------
    if itemId is not None:
        df = df[df["itemId"] == itemId].copy()
    else:
        df = df.copy()

    # --------------------------------------------------------
    # RAW linear score (pre-normalization)
    # --------------------------------------------------------
    df["due_score_raw"] = (
        _weight_for("daysSinceLastPurchase_feat") * df["daysSinceLastPurchase_feat"]
      + _weight_for("freq_30_feat")               * df["freq_30_feat"]
      + _weight_for("freq_90_feat")               * df["freq_90_feat"]
    )

    # --------------------------------------------------------
    # Final due_score
    # --------------------------------------------------------
    if use_sigmoid:
        df["due_score_feat"] = 1 / (1 + np.exp(-df["due_score_raw"]))

    elif normalize:
        mean = df["due_score_raw"].mean()
        std  = df["due_score_raw"].std()
        # NaN is truthy, so `std or 1.0` would let an undefined std through.
        if not np.isfinite(std) or std == 0:
            std = 1.0
        df["due_score"] = (df["due_score_raw"] - mean) / std

    else:
        df["due_score"] = df["due_score_raw"]

    return df
###############################################################################


# ============================================================
# MERGE HABIT FEATURES
# ============================================================
# One habit row per item, broadcast onto every (date, item) row.
habit_df = build_habit_features(combined_df)
combined_df = combined_df.merge(habit_df, on="itemId",how="left")

# Any item without a habit row falls back to 0.0 for all three features.
_habit_feature_names = ["habitFrequency_feat", "habitSpan_feat", "habitDecay_feat"]
combined_df[_habit_feature_names] = combined_df[_habit_feature_names].fillna(0.0)




In [13]:
def compute_due_ratio(df, cap=3.0):
    """
    Ratio of days since an item's last purchase to its average purchase gap.

    Undefined results (division by zero or missing inputs) are replaced by 0
    and the ratio is limited to the range ``[0, cap]``.
    """
    elapsed = df["daysSinceLastPurchase_feat"]
    typical_gap = df["avgDaysBetweenPurchases_feat"]

    ratio = elapsed / typical_gap
    # Anything that is not a finite number (inf, -inf, NaN) becomes 0.
    ratio = ratio.where(np.isfinite(ratio), 0)
    return ratio.clip(lower=0, upper=cap)
###############################################################################

# Store the capped "how overdue is this item" ratio as a model feature.
combined_df["item_due_ratio_feat"] = compute_due_ratio(combined_df)

# Earlier, disabled experiments kept for reference (they use column names
# without the "_feat" suffix and an `encoded_df` frame not built in this cell).
#combined_df["purchaseToTripRatio"] = combined_df["daysSinceLastPurchase"] / combined_df["avgDaysBetweenPurchases"]


# encoded_df["due_score"] = (
#     1.5 * encoded_df["daysSinceLastPurchase_norm"]
#   + 1.0 * encoded_df["freq_30_norm"]
#   + 0.5 * encoded_df["freq_90_norm"]
# )

#encoded_df["due_score"] = 1 / (1 + np.exp(-encoded_df["due_score"]))
# Print the column inventory of the finished feature table.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121240 entries, 0 to 121239
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   date                          121240 non-null  datetime64[ns]
 1   itemId                        121240 non-null  int64         
 2   item                          1436 non-null    object        
 3   source                        1436 non-null    object        
 4   didBuy_target                 121240 non-null  int32         
 5   daysSinceLastTrip_feat        121240 non-null  float64       
 6   avgDaysBetweenTrips_feat      121240 non-null  float64       
 7   daysUntilNextHoliday_feat     121240 non-null  int64         
 8   daysSinceLastHoliday_feat     121240 non-null  int64         
 9   holidayProximityIndex_feat    121240 non-null  float64       
 10  daysUntilSchoolStart_feat     121240 non-null  int64         
 11  daysUntilScho

In [None]:
# ============================================================
# NORMALIZE 
# ============================================================


# Column groups are selected from the "_feat" column names that combined_df
# actually carries. The previous version of this cell still used the older
# un-suffixed names ("item_due_ratio", "habitFrequency", "dow", "year", ...)
# and the "_5day_avg" suffix, which raise KeyError / select nothing on the
# current frame.
freq_cols = [c for c in combined_df.columns if c.startswith("freq")]
weather_cols = [c for c in combined_df.columns if c.endswith("_5day_avg_feat")]
holiday_cols = [c for c in combined_df.columns if "holiday" in c.lower()]
school_cols = [c for c in combined_df.columns if "school" in c.lower()]

daysSince_purchase_cols = [c for c in combined_df.columns if "days" in c.lower() and "purchase" in c.lower()]
daysSince_trip_cols     = [c for c in combined_df.columns if "days" in c.lower() and "trip" in c.lower()]

days_cols = daysSince_purchase_cols + daysSince_trip_cols

habit_cols = ["habitFrequency_feat", "habitSpan_feat", "habitDecay_feat"]

normalized_df = combined_df.copy()

# Each call z-scores the listed columns into "<col>_norm" and drops the originals.
normalized_df = normalizeAndDropCols(normalized_df, ["item_due_ratio_feat"])
normalized_df = normalizeAndDropCols(normalized_df, freq_cols)
normalized_df = normalizeAndDropCols(normalized_df, weather_cols)
normalized_df = normalizeAndDropCols(normalized_df, holiday_cols)
normalized_df = normalizeAndDropCols(normalized_df, school_cols)
normalized_df = normalizeAndDropCols(normalized_df, days_cols)
normalized_df = normalizeAndDropCols(normalized_df, habit_cols)

# ---------- CYCLICAL FEATURES ----------
# Day-of-week, month and day-of-year are mapped to sin/cos pairs with periods 7, 12 and 365.

normalized_df["dow_sin"], normalized_df["dow_cos"] = TemporalFeatures.encode_sin_cos( normalized_df["dow_cyc_feat"], 7.0)
normalized_df["month_sin"], normalized_df["month_cos"] = TemporalFeatures.encode_sin_cos(normalized_df["month_cyc_feat"], 12.0)
normalized_df["doy_sin"], normalized_df["doy_cos"] = TemporalFeatures.encode_sin_cos(normalized_df["doy_feat"], 365.0)

normalized_df = normalized_df.drop(columns=["dow_cyc_feat", "month_cyc_feat", "doy_feat"], errors="ignore")

# ---------- NON-CYCLIC TIME FEATURES ----------
# As before, day-of-month is z-scored here rather than sin/cos encoded.
nonCycCols = ["year_feat", "day_cyc_feat", "quarter_feat"]
normalized_df = normalizeAndDropCols(normalized_df, nonCycCols)

# ---------- DROP NON-MODEL COLS ----------
cols_to_drop = ["source", "item", "date"]
normalized_df = normalized_df.drop(columns=cols_to_drop, errors="ignore")

export_df_to_excel_table(normalized_df, "normalized_df.xlsx", "normalized_df") 

# TRAIN / BUILD MODEL

In [None]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

###############################################

def export_df(dataframes, dir):
    """
    Write every frame in ``dataframes`` to ``dir`` as both CSV and XLSX.

    ``dataframes`` maps a base file name to a DataFrame. The CSV includes the
    index; the workbook is produced by ``export_df_to_excel_table`` with the
    sheet named after the frame.
    """
    for name, frame in dataframes.items():
        csv_path = os.path.join(dir, f"{name}.csv")
        print(f"Writing CSV: {csv_path}")
        frame.to_csv(csv_path, index=True)

        xlsxPath = os.path.join(dir, f"{name}.xlsx")
        print(f"Writing XLSX: {xlsxPath}")
        export_df_to_excel_table(frame, xlsxPath, sheet_name=f"{name}")


###############################################

def save_experiment(model, history, dataframes, build_params, train_params, feature_cols, item_id_to_idx, base_dir):
    """
    Persist one training run under ``base_dir`` in a folder named after its
    hyper-parameters.

    The folder name joins, when present, the embedding size, hidden layer
    sizes, epoch count and output activation with ``__`` (or is
    ``exp_unlabeled`` when none is available). Written artefacts: the exported
    data frames (CSV + XLSX), the saved model, its weights, and JSON dumps of
    the training history, feature column list, item-id mapping, build
    parameters and training parameters.

    Parameters
    ----------
    model : Keras model exposing ``save`` and ``save_weights``.
    history : object whose ``history`` attribute is JSON-serialisable.
    dataframes : dict mapping a file stem to a DataFrame.
    build_params, train_params : dict
        Dumped as-is; also used to derive the folder name.
    feature_cols : list
        Dumped as-is.
    item_id_to_idx : dict
        Keys and values are converted to int before dumping.
    base_dir : str
        Parent directory for the experiment folder (created if missing).
    """
    name_parts = []

    if "embedding_dim" in build_params:
        name_parts.append(f"emb{build_params['embedding_dim']}")

    if "layers" in build_params:
        hl = "-".join(str(x) for x in build_params["layers"])
        name_parts.append(f"hl{hl}")

    if "epochs" in train_params:
        name_parts.append(f"ep{train_params['epochs']}")

    if "output_activation" in build_params:
        name_parts.append(f"outAct_{build_params['output_activation']}")

    exp_name = "__".join(name_parts) if name_parts else "exp_unlabeled"
    exp_dir = os.path.join(base_dir, exp_name)
    print("Saving Exp: ", exp_dir)

    os.makedirs(exp_dir, exist_ok=True)

    export_df(dataframes, exp_dir)

    # NOTE(review): newer Keras releases may require a file extension on the
    # model path - confirm against the installed version.
    model.save(os.path.join(exp_dir, "model"))
    model.save_weights(os.path.join(exp_dir, "weights.h5"))

    def _write_json(file_name, payload):
        # The context manager closes the handle even when json.dump raises.
        with open(os.path.join(exp_dir, file_name), "w") as handle:
            json.dump(payload, handle, indent=2)

    _write_json("history.json", history.history)
    _write_json("feature_cols.json", feature_cols)
    _write_json(
        "item_id_to_idx.json",
        {str(int(k)): int(v) for k, v in item_id_to_idx.items()},
    )
    _write_json("build_params.json", build_params)
    _write_json("train_params.json", train_params)

    print("Saved experiment →", exp_dir)

###############################################

def build_and_compile_model(featColsCount, itemCount, params):
    """
    Build and compile the two-input purchase model.

    The network takes a vector of ``featColsCount`` numeric features and a
    scalar int32 item index. The index goes through an embedding of size
    ``params["embedding_dim"]``, is flattened and concatenated with the
    numeric features, then passed through one Dense layer per entry of
    ``params["layers"]`` (activation ``params["activation"]``) and a single
    output unit with ``params["output_activation"]``.

    Optional ``params`` keys: ``optimizer`` ("adam" default, or "adamw"),
    ``learning_rate``, ``loss`` (default "mse") and ``metrics`` (default
    ``["mae"]``). When ``learning_rate`` is absent the optimizer's own default
    is used; previously ``None`` was forwarded to the optimizer constructor.

    NOTE(review): a second function with this name is defined in a later cell
    and replaces this one once that cell runs.

    Raises
    ------
    ValueError
        If ``params["optimizer"]`` is neither "adam" nor "adamw".
    """
    num_in = layers.Input(shape=(featColsCount,))
    item_in = layers.Input(shape=(), dtype="int32")

    print(f"featColsCount is : {featColsCount}")

    emb = layers.Embedding(
        input_dim=itemCount,
        output_dim=params["embedding_dim"]
    )(item_in)

    x = layers.Concatenate()([num_in, layers.Flatten()(emb)])

    for neuronCount in params["layers"]:
        x = layers.Dense(neuronCount, activation=params["activation"])(x)

    out = layers.Dense(1, activation=params["output_activation"])(x)

    model = models.Model([num_in, item_in], out)

    optimizer_name = params.get("optimizer", "adam")
    learning_rate = params.get("learning_rate")

    # Only pass learning_rate when one was configured.
    optimizer_kwargs = {} if learning_rate is None else {"learning_rate": learning_rate}

    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(**optimizer_kwargs)
    elif optimizer_name == "adamw":
        optimizer = tf.keras.optimizers.AdamW(**optimizer_kwargs)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    model.compile(
        optimizer=optimizer,
        loss=params.get("loss", "mse"),
        metrics=params.get("metrics", ["mae"])
    )

    return model

###############################################

def get_feature_cols(encoded_df):
    """
    Return the model input columns of ``encoded_df`` in their original order.

    A column qualifies when its name ends in ``_norm``, ``_sin`` or ``_cos``;
    a column called ``due_score`` is never included.
    """
    model_suffixes = ("_norm", "_sin", "_cos")
    return [
        name
        for name in encoded_df.columns
        if name != "due_score" and name.endswith(model_suffixes)
    ]

###############################################

def train_model(model, encoded_df, feature_cols, params):
    """
    Fit ``model`` on 80% of ``encoded_df`` and return the Keras history.

    Inputs are the ``feature_cols`` columns (float32) and ``itemIdx`` (int32);
    the target is the ``didBuy`` column (a ``due_score`` target was used in an
    earlier variant). The split uses ``random_state=42``; the 20% hold-out is
    created but not used here, and 10% of the training part serves as
    validation data. Batch size is fixed at 32.

    NOTE(review): a second ``train_model`` with a different signature is
    defined in a later cell and replaces this one once that cell runs.
    """
    numeric_inputs = encoded_df[feature_cols].to_numpy(np.float32)
    item_indices = encoded_df["itemIdx"].to_numpy(np.int32)
    targets = encoded_df["didBuy"]

    (
        numeric_fit, _numeric_holdout,
        index_fit, _index_holdout,
        target_fit, _target_holdout,
    ) = train_test_split(
        numeric_inputs, item_indices, targets, test_size=0.2, random_state=42
    )

    return model.fit(
        [numeric_fit, index_fit],
        target_fit,
        validation_split=.1,
        epochs=params["epochs"],
        batch_size=32,
        verbose=1
    )

###############################################

In [None]:
###############################################
def fit_normalization_params(combined_df):
    """
    Collect the parameters needed to normalise every ``*_feat`` column.

    Columns ending in ``_cyc_feat`` are described by their period (looked up
    through ``TemporalFeatures.get_period_for_column``); every other
    ``*_feat`` column is described by its mean and standard deviation. The
    returned dict lists the mean/std entries first, then the cyclical ones.
    """
    feature_names = [name for name in combined_df.columns if name.endswith("_feat")]

    def is_cyclical(name):
        return name.endswith("_cyc_feat")

    scale_params = {
        name: {"mean": combined_df[name].mean(), "std": combined_df[name].std()}
        for name in feature_names
        if not is_cyclical(name)
    }
    cycle_params = {
        name: {"period": TemporalFeatures.get_period_for_column(name)}
        for name in feature_names
        if is_cyclical(name)
    }

    return {**scale_params, **cycle_params}
###############################################


###############################################
def normalize_features(combined_df, norm_params):
    """
    Apply previously fitted normalisation parameters to a copy of ``combined_df``.

    For each column listed in ``norm_params``:

    * ``*_cyc_feat`` columns are replaced by ``<col>_sin_norm`` and
      ``<col>_cos_norm`` produced by ``TemporalFeatures.encode_sin_cos`` with
      the stored period;
    * all other columns are replaced by a z-scored column whose name has
      ``_feat`` swapped for ``_norm``; a stored standard deviation of 0
      yields a constant 0.0 column.

    The input frame is left untouched.
    """
    normalized_df = combined_df.copy()

    for col, cfg in norm_params.items():
        if col.endswith("_cyc_feat"):
            sin_col, cos_col = TemporalFeatures.encode_sin_cos(
                combined_df[col], cfg["period"]
            )
            replacements = {f"{col}_sin_norm": sin_col, f"{col}_cos_norm": cos_col}
        else:
            mean_val, std_val = cfg["mean"], cfg["std"]
            if std_val == 0:
                scaled = 0.0
            else:
                scaled = (combined_df[col] - mean_val) / std_val
            replacements = {col.replace("_feat", "_norm"): scaled}

        # Add the derived column(s) first, then retire the source column.
        for new_name, values in replacements.items():
            normalized_df[new_name] = values
        normalized_df = normalized_df.drop(columns=[col])

    return normalized_df
###############################################


###############################################
def build_and_compile_model(feat_cols_count, item_count, build_params):
    """
    Build and compile the two-input purchase model.

    Inputs are a vector of ``feat_cols_count`` numeric features and a scalar
    int32 item id. The id is embedded (``build_params["embedding_dim"]``),
    flattened and concatenated with the numeric features, followed by one
    Dense layer per entry of ``build_params["layers"]`` (activation
    ``build_params["activation"]``) and a single output unit using
    ``build_params["output_activation"]``.

    Optional ``build_params`` keys: ``optimizer`` ("adam" default, or
    "adamw"), ``learning_rate``, ``loss`` (default "mse") and ``metrics``
    (default ``["mae"]``). When ``learning_rate`` is absent the optimizer's
    own default is used; previously ``None`` was forwarded to the optimizer
    constructor.

    NOTE(review): this definition shadows the function of the same name from
    an earlier cell.

    Raises
    ------
    ValueError
        If ``build_params["optimizer"]`` is neither "adam" nor "adamw".
    """
    num_in = layers.Input(shape=(feat_cols_count,))
    item_in = layers.Input(shape=(), dtype="int32")

    emb = layers.Embedding(
        input_dim=item_count,
        output_dim=build_params["embedding_dim"]
    )(item_in)

    x = layers.Concatenate()([num_in, layers.Flatten()(emb)])

    for neuron_count in build_params["layers"]:
        x = layers.Dense(neuron_count, activation=build_params["activation"])(x)

    out = layers.Dense(1, activation=build_params["output_activation"])(x)

    model = models.Model(inputs=[num_in, item_in], outputs=out)

    optimizer_name = build_params.get("optimizer", "adam")
    learning_rate = build_params.get("learning_rate")

    # Only pass learning_rate when one was configured.
    optimizer_kwargs = {} if learning_rate is None else {"learning_rate": learning_rate}

    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(**optimizer_kwargs)
    elif optimizer_name == "adamw":
        optimizer = tf.keras.optimizers.AdamW(**optimizer_kwargs)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    model.compile(
        optimizer=optimizer,
        loss=build_params.get("loss", "mse"),
        metrics=build_params.get("metrics", ["mae"])
    )

    return model
###############################################


###############################################
def train_model(model, df, feature_cols, target_col, train_params):
    """
    Fit ``model`` on 80% of ``df`` and return the Keras history.

    Inputs are the ``feature_cols`` columns (float32) and ``itemId`` (int32);
    the target is ``target_col`` (float32). The split uses ``random_state=42``;
    the 20% hold-out is created but not used here, and 10% of the training
    part serves as validation data. ``train_params`` must provide ``epochs``
    and may provide ``batch_size`` (default 32).
    """
    arrays = (
        df[feature_cols].to_numpy(np.float32),
        df["itemId"].to_numpy(np.int32),
        df[target_col].to_numpy(np.float32),
    )

    (
        feat_fit, _feat_holdout,
        item_fit, _item_holdout,
        target_fit, _target_holdout,
    ) = train_test_split(*arrays, test_size=0.2, random_state=42)

    fit_options = {
        "validation_split": 0.1,
        "epochs": train_params["epochs"],
        "batch_size": train_params.get("batch_size", 32),
        "verbose": 1,
    }
    return model.fit([feat_fit, item_fit], target_fit, **fit_options)
###############################################


###############################################
def build_prediction_input_df(combined_df, prediction_date, norm_params):
    """Assemble the model inputs needed to score every item on one date.

    The most recent row of each ``itemId`` in ``combined_df`` is used as a
    template. Its date-dependent feature columns are overwritten with values
    computed for ``prediction_date``, the per-item frequency and habit
    features are refreshed through ``FeatureBuilders``, and the result is
    passed through ``normalize_features``.

    Args:
        combined_df: History frame with at least ``itemId`` and ``date``
            columns. This function itself does not modify it.
        prediction_date: Date to predict for. Either a date-like object
            exposing ``year``/``month``/``day``/``weekday()``/``timetuple()``
            (e.g. ``datetime`` or ``pd.Timestamp``) or a string that
            ``pd.to_datetime`` can parse, such as ``"12-19-2025"``.
        norm_params: Normalization parameters, forwarded unchanged to
            ``normalize_features``.

    Returns:
        dict with the keys
            ``prediction_df``: the frame returned by ``normalize_features``;
            ``x_features``: float32 array of its ``*_norm`` columns;
            ``x_item_idx``: int32 array of its ``itemId`` column;
            ``feature_cols``: the ``*_norm`` column names, in column order.
    """
    # Callers in this notebook pass dates as strings; parse them so the
    # attribute accesses below (.year, .weekday(), ...) work.
    if isinstance(prediction_date, str):
        prediction_date = pd.to_datetime(prediction_date)

    latest_rows = (
        combined_df.sort_values("date")
        .groupby("itemId")
        .tail(1)
        .copy()
    )

    latest_rows["date"] = prediction_date

    latest_rows["daysSinceLastTrip_feat"] = TemporalFeatures.daysSinceLastTrip(prediction_date)
    latest_rows["avgDaysBetweenTrips_feat"] = TemporalFeatures.avgDaysBetweenTrips(prediction_date)

    latest_rows["daysUntilNextHoliday_feat"] = HolidayFeatures.daysUntilNextHoliday(prediction_date)
    latest_rows["daysSinceLastHoliday_feat"] = HolidayFeatures.daysSinceLastHoliday(prediction_date)
    latest_rows["holidayProximityIndex_feat"] = HolidayFeatures.holidayProximityIndex(prediction_date)

    latest_rows["daysUntilSchoolStart_feat"] = SchoolFeatures.daysUntilSchoolStart(prediction_date)
    latest_rows["daysUntilSchoolEnd_feat"] = SchoolFeatures.daysUntilSchoolEnd(prediction_date)
    latest_rows["schoolSeasonIndex_feat"] = SchoolFeatures.schoolSeasonIndex(prediction_date)

    # NOTE(review): the *_cyc_feat columns receive raw calendar values here;
    # presumably normalize_features applies the cyclic encoding — confirm
    # against the training-time feature builder.
    latest_rows["year_feat"] = prediction_date.year
    latest_rows["month_cyc_feat"] = prediction_date.month
    latest_rows["day_cyc_feat"] = prediction_date.day
    latest_rows["dow_cyc_feat"] = prediction_date.weekday()
    latest_rows["doy_feat"] = prediction_date.timetuple().tm_yday
    latest_rows["quarter_feat"] = ((prediction_date.month - 1) // 3) + 1

    # Split the history once instead of re-filtering combined_df per item.
    # Each group keeps its rows in their original order.
    history_by_item = dict(iter(combined_df.groupby("itemId", sort=False)))

    for item_id in latest_rows["itemId"].values:
        hist = history_by_item[item_id]

        FeatureBuilders.compute_frequency_features(
            hist, latest_rows, item_id, prediction_date
        )

        FeatureBuilders.compute_habit_features(
            hist, latest_rows, item_id, prediction_date
        )

    # The target is unknown at prediction time, so it must not be carried over.
    latest_rows = latest_rows.drop(columns=["didBuy_target"], errors="ignore")

    normalized_pred_df = normalize_features(latest_rows, norm_params)

    feature_cols = [c for c in normalized_pred_df.columns if c.endswith("_norm")]

    x_features = normalized_pred_df[feature_cols].to_numpy(np.float32)
    x_item_idx = normalized_pred_df["itemId"].to_numpy(np.int32)

    return {
        "prediction_df": normalized_pred_df,
        "x_features": x_features,
        "x_item_idx": x_item_idx,
        "feature_cols": feature_cols
    }
###############################################


def RunExperiment(combined_df, modelBuildParams, modelTrainParams, baseDir):
    """Run one full experiment: normalize, train, predict and save.

    Steps, in order:
      1. Fit normalization parameters on ``combined_df`` and normalize it.
      2. Use every ``*_norm`` column as a feature and the single
         ``*_target`` column as the label.
      3. Build and train the model.
      4. Build prediction inputs for the latest ``date`` in the normalized
         frame and score them.
      5. Hand the model, history and frames to ``save_experiment``.

    Args:
        combined_df: Raw feature frame; passed to the normalization helpers
            and to ``build_prediction_input_df``.
        modelBuildParams: Forwarded to ``build_and_compile_model``.
        modelTrainParams: Forwarded to ``train_model``.
        baseDir: Forwarded to ``save_experiment`` as ``base_dir``.

    Returns:
        dict with keys ``model``, ``history``, ``normalized_df``,
        ``prediction_df`` and ``norm_params``.

    Raises:
        ValueError: If the normalized frame does not contain exactly one
            column ending in ``_target``.
    """
    norm_params = fit_normalization_params(combined_df)
    normalized_df = normalize_features(combined_df, norm_params)

    # Classify columns by suffix in a single pass.
    feature_cols = []
    target_cols = []
    for column in normalized_df.columns:
        if column.endswith("_norm"):
            feature_cols.append(column)
        if column.endswith("_target"):
            target_cols.append(column)

    if len(target_cols) != 1:
        raise ValueError("Exactly one target column is required")
    (target_col,) = target_cols

    # The embedding table is sized from the largest itemId present.
    embedding_rows = int(normalized_df["itemId"].max()) + 1
    model = build_and_compile_model(
        len(feature_cols), embedding_rows, modelBuildParams
    )

    history = train_model(
        model, normalized_df, feature_cols, target_col, modelTrainParams
    )

    latest_date = normalized_df["date"].max()
    pred_input = build_prediction_input_df(combined_df, latest_date, norm_params)

    scores = model.predict(
        [pred_input["x_features"], pred_input["x_item_idx"]]
    )
    prediction_df = pred_input["prediction_df"]
    prediction_df["prediction"] = scores

    saved_frames = [combined_df, normalized_df, prediction_df]
    save_experiment(
        model,
        history,
        saved_frames,
        modelBuildParams,
        modelTrainParams,
        item_id_to_idx=None,
        base_dir=baseDir
    )

    return dict(
        model=model,
        history=history,
        normalized_df=normalized_df,
        prediction_df=prediction_df,
        norm_params=norm_params,
    )
###############################################


def RunPredictionsOnly(combined_df, model_dir, prediction_date):
    """Score every item for ``prediction_date`` using a saved model.

    Reads ``model.keras`` and ``norm_params.json`` from ``model_dir``,
    builds the prediction inputs with ``build_prediction_input_df`` and
    returns a copy of the prediction frame with the model output added as
    a ``prediction`` column. No training is performed.

    Args:
        combined_df: History frame forwarded to ``build_prediction_input_df``.
        model_dir: Directory containing the two artifacts named above.
        prediction_date: Date forwarded to ``build_prediction_input_df``.

    Returns:
        The prediction frame, including the ``prediction`` column.
    """
    model_path = f"{model_dir}/model.keras"
    norm_params_path = f"{model_dir}/norm_params.json"

    model = tf.keras.models.load_model(model_path)

    with open(norm_params_path, "r") as params_file:
        norm_params = json.load(params_file)

    pred_input = build_prediction_input_df(
        combined_df=combined_df,
        prediction_date=prediction_date,
        norm_params=norm_params
    )

    model_inputs = [pred_input["x_features"], pred_input["x_item_idx"]]
    scores = model.predict(model_inputs)

    # Copy so the frame held inside pred_input is left without the new column.
    scored_df = pred_input["prediction_df"].copy()
    scored_df["prediction"] = scores
    return scored_df
###############################################



In [None]:
# Hyper-parameter sets trained by this cell. Each entry pairs the
# "buildParams" consumed by build_and_compile_model with the "trainParams"
# consumed by train_model.
modelParamsList = []

# Disabled variant (deeper network, 32-d embedding, ReLU output):
# modelParamsList.append({
#     "trainParams": {
#         "epochs": 20
#     },
#     "buildParams": {
#         "embedding_dim": 32,
#         "layers": [1024,768,512,256,128,64,32,16,8],
#         "activation": "relu",
#         "output_activation": "relu",
#         "optimizer": "adam",
#         "learning_rate": 0.01,
#         "loss": "mse",
#         "metrics": ["mae"]
#     }
# })

# The active runs are identical except for the output-layer activation, so
# they are generated from one template. The literals sit inside the
# generator so every run gets its own dict and list objects.
modelParamsList.extend(
    {
        "trainParams": {
            "epochs": 40
        },
        "buildParams": {
            "embedding_dim": 12,
            "layers": [1024, 512, 256, 128, 64, 32],
            "activation": "relu",
            "output_activation": outputActivation,
            "optimizer": "adam",
            "learning_rate": 0.01,
            "loss": "mse",
            "metrics": ["mae"]
        }
    }
    for outputActivation in ("sigmoid", "linear")
)

# One timestamp for the whole batch: every run receives the same base
# directory under exp/keras/.
ts = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
for modelParams in modelParamsList:
    print(f"{modelParams['buildParams']['layers']}")
    # RunExperiment replaces the legacy runExp(combined_df, encoded_df, ...)
    # entry point, which only survives as commented-out code; it normalizes
    # combined_df itself, so no pre-encoded frame is passed.
    RunExperiment(
        combined_df,
        modelParams["buildParams"],
        modelParams["trainParams"],
        f"exp/keras/{ts}"
    )


# baseline = {
#     "trainParams": {
#         "epochs": 30
#     },
#     "buildParams": {
#         "embedding_dim": 64,
#         "layers": [30],
#         "activation": "relu",
#         "output_activation": "sigmoid",
#         "optimizer": "adam",
#         "learning_rate": 0.0001,
#         "loss": "mse",
#         "metrics": ["mae"]
#     }
# }

# paramList = HiddenLayerParamSetBuilder.BuildHiddenLayerSizeSets(baseline, 30, 20, 4096)

# ts = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# for modelParams in paramList:
#     print(f"{modelParams['buildParams']['layers']}")
#     runExp(combined_df, encoded_df, modelParams["buildParams"], modelParams["trainParams"], f"exp/keras/{ts}")



In [None]:


# Score the same saved model once per date listed below.
# NOTE(review): the only definition of `runPredictionOnly` visible near this
# cell is in the commented-out legacy cells further down — confirm it is
# still defined before running, or port this cell to RunPredictionsOnly.
for predictDate in ("12-19-2025", "12-20-2025"):
    runPredictionOnly(
        "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
        combined_df = combined_df,
        encoded_df = encoded_df,
        predict_date = predictDate,
        baseDir = "pred"
    )

# runPredictionOnly(
#     "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
#     combined_df = combined_df,
#     encoded_df = encoded_df,
#     predict_date = "12/21/2025",
#     baseDir = "pred"
# )

# runPredictionOnly(
#     "exp\\keras\\nuerons_sizes\\emb32__hl100__ep40",
#     combined_df = combined_df,
#     encoded_df = encoded_df,
#     predict_date = "12/22/2025",
#     baseDir = "pred"
# )

# OLD: the cells below contain earlier, commented-out versions of this pipeline, kept for reference only.

In [None]:
# def build_prediction_df(encoded_df, combined_df, predict_date):

#     print("Building Pred DF")

#     if predict_date is None:
#         predict_date = pd.Timestamp.today().normalize()
#     else:
#         predict_date = pd.to_datetime(predict_date).normalize()

   

# ###############################################

# def run_predictions(model, encoded_df, combined_df, feature_cols, predict_date=None):

#     pred_df = build_prediction_df(encoded_df, combined_df, predict_date)
#     pred_df.info()
    
#     print("Running predicitons")
#     featureCols = pred_df[feature_cols].to_numpy(np.float32)
#     indexCol = pred_df["itemIdx"].to_numpy(np.int32)

#     scores = model.predict([featureCols, indexCol], verbose=0).ravel()
#     pred_df["due_intensity"] = scores

#     return pred_df.sort_values("due_intensity", ascending=False).reset_index(drop=True)    

# ###############################################

# def runExp(combined_df, encoded_df, buildParams, trainParams, baseDir, tripDate=None):
#     item_ids = sorted(encoded_df["itemId"].unique())
#     item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
#     encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")

#     print("Running Exp....")
#     feature_cols = get_feature_cols(encoded_df)

#     model = build_and_compile_model(len(feature_cols), len(item_ids), buildParams)
#     history = train_model(model, encoded_df, feature_cols, trainParams)

#     predictions_df = run_predictions( model, encoded_df, combined_df, feature_cols, predict_date=tripDate)

#     dataframes = {
#         "predictions": predictions_df,
#         "encoded_features": encoded_df,
#         "combined_df": combined_df
#     }

#     save_experiment(model, history, dataframes, buildParams, trainParams, feature_cols, item_id_to_idx, base_dir=baseDir)

# ###############################################

# def runPredictionOnly(modelDir, combined_df, encoded_df, predict_date, baseDir):

#     model = tf.keras.models.load_model(os.path.join(modelDir, "model"))
#     feature_cols = get_feature_cols(encoded_df)
#     item_map_path = os.path.join(modelDir, "item_id_to_idx.json")
#     with open(item_map_path, "r") as f:
#         item_id_to_idx = {int(k): int(v) for k, v in json.load(f).items()}
#     encoded_df = encoded_df.copy()
#     encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
    
#     predictions = run_predictions(model,encoded_df,combined_df,feature_cols,predict_date=predict_date)

#     #exp_dir = os.path.join(baseDir, f"predict_{predict_date}")
#     predict_date_str = pd.to_datetime(predict_date).strftime("%Y-%m-%d")
#     exp_dir = os.path.join(baseDir, f"predict_{predict_date_str}")
#     os.makedirs(exp_dir, exist_ok=True)

#     predictions.to_csv(os.path.join(exp_dir, "predictions.csv"), index=False)

#     print("Saved prediction →", exp_dir)

In [None]:
# import os
# import json
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from tensorflow.keras import layers, models
# from sklearn.model_selection import train_test_split



# def build_and_compile_model(featColsCount, itemCount, params):
#     num_in = layers.Input(shape=(featColsCount,))
#     item_in = layers.Input(shape=(), dtype="int32")

#     emb = layers.Embedding(
#         input_dim=itemCount,
#         output_dim=params["embedding_dim"]
#     )(item_in)

#     x = layers.Concatenate()([num_in, layers.Flatten()(emb)])

#     for units in params["hiddenLayers"]:
#         x = layers.Dense(units, activation="relu")(x)

#     out = layers.Dense(
#         1,
#         activation=params.get("output_activation", "sigmoid")
#     )(x)

#     model = models.Model([num_in, item_in], out)

#     optimizer = tf.keras.optimizers.Adam(
#         learning_rate=params.get("learning_rate", 0.001)
#     )

#     model.compile(
#         optimizer=optimizer,
#         loss=params.get("loss", "mse"),
#         metrics=params.get("metrics", ["mae"])
#     )

#     return model
# ##########################################################################################

# def get_feature_cols(encoded_df):
#     feature_cols = []
#     for c in encoded_df.columns:
#         if c == "due_score":
#             continue
#         if c.endswith("_norm") or c.endswith("_sin") or c.endswith("_cos"):
#             feature_cols.append(c)
#     return feature_cols
# ##########################################################################################

# def train_model(model, encoded_df, feature_cols, params):

#     featureCols = encoded_df[feature_cols].to_numpy(np.float32)
#     indexCol = encoded_df["itemIdx"].to_numpy(np.int32)
#     targetVar = encoded_df["due_score"].to_numpy(np.float32)

#     featureCols_tr, featureCols_te, indexCol_tr, indexCol_te, targetVar_tr, targetVar_te = train_test_split(
#         featureCols, indexCol, targetVar, test_size=0.2, random_state=42
#     )

#     history = model.fit(
#         [featureCols_tr, indexCol_tr],
#         targetVar_tr,
#         validation_split=params["validation_split"],
#         epochs=params["epochs"],
#         batch_size=params["batch_size"],
#         verbose=1
#     )

#     return history
# ##########################################################################################

# def build_prediction_df(encoded_df, combined_df, predict_date):

#     if predict_date is None:
#         predict_date = pd.Timestamp.today().normalize()
#     else:
#         predict_date = pd.to_datetime(predict_date).normalize()

#     last_trip_date_by_item = (combined_df.sort_values("date").groupby("itemId")["date"].last())

#     last_purchase_date_by_item = (combined_df[combined_df["didBuy"] == 1].sort_values("date").groupby("itemId")["date"].last() )

#     item_lookup = (combined_df[["itemId", "item"]].drop_duplicates().set_index("itemId")["item"].to_dict())

#     rows = []

#     for itemId, hist in encoded_df.groupby("itemId"):
#         last = hist.iloc[-1].copy()
#         row = last.to_dict()

#         row["itemId"] = itemId
#         row["item"] = item_lookup.get(itemId, "UNKNOWN")

#         last_trip_date = pd.to_datetime(last_trip_date_by_item.loc[itemId]).normalize()
#         row["daysSinceLastTrip_norm"] = (predict_date - last_trip_date).days

#         if itemId in last_purchase_date_by_item.index:
#             last_purchase_date = pd.to_datetime(last_purchase_date_by_item.loc[itemId]).normalize()
#             row["daysSinceLastPurchase_norm"] = (predict_date - last_purchase_date).days

#         row["daysUntilNextHoliday_norm"] = HolidayFeatures.daysUntilNextHoliday(predict_date)
#         row["daysSinceLastHoliday_norm"] = HolidayFeatures.daysSinceLastHoliday(predict_date)
#         row["holidayProximityIndex_norm"] = HolidayFeatures.holidayProximityIndex(predict_date)
#         row["daysUntilSchoolStart_norm"] = HolidayFeatures.daysUntilSchoolStart(predict_date)
#         row["daysUntilSchoolEnd_norm"] = HolidayFeatures.daysUntilSchoolEnd(predict_date)
#         row["schoolSeasonIndex_norm"] = HolidayFeatures.schoolSeasonIndex(predict_date)

#         row["year_norm"] = float(predict_date.year)
#         row["day_norm"] = float(predict_date.day)
#         row["quarter_norm"] = float(predict_date.quarter)

#         row["dow_sin"], row["dow_cos"] = TemporalFeatures.encode_sin_cos(predict_date.weekday(), 7)
#         row["month_sin"], row["month_cos"] = TemporalFeatures.encode_sin_cos(predict_date.month, 12)
#         row["doy_sin"], row["doy_cos"] = TemporalFeatures.encode_sin_cos(predict_date.dayofyear, 365)

#         rows.append(row)

#     return pd.DataFrame(rows)
# ##########################################################################################

# def run_predictions(model, encoded_df, combined_df, feature_cols, predict_date=None):

#     pred_df = build_prediction_df(encoded_df, combined_df, predict_date)

#     featureCols = pred_df[feature_cols].to_numpy(np.float32)
#     indexCol = pred_df["itemIdx"].to_numpy(np.int32)

#     scores = model.predict([featureCols, indexCol], verbose=0).ravel()
#     pred_df["due_intensity"] = scores

#     return (
#         pred_df[["itemId", "item", "due_intensity"]]
#         .sort_values("due_intensity", ascending=False)
#         .reset_index(drop=True)
#     )
# ##########################################################################################

# def runExp(feature_stats, combined_df, encoded_df, buildParams, trainParams, baseDir, tripDate=None):
#     #
#     item_ids = sorted(encoded_df["itemId"].unique())
#     item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
#     encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
#     itemCount = len(item_ids)
#     #
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]
#     featColsCount = len(numeric_cols)
#     #
#     model = build_and_compile_model(featColsCount, itemCount, buildParams)
#     #
#     history = train_model(model, encoded_df, trainParams)
#     #
#     predictions = run_predictions(
#         model,
#         encoded_df,
#         combined_df,
#         feature_stats,
#         predict_date=tripDate
#     )
#     #
#     dataframes = {
#         "predictions": predictions,
#         "encoded_features": encoded_df,
#         "combined_df": combined_df
#     }
#     save_experiment(model,history, dataframes, buildParams,trainParams,
#         numeric_cols,
#         item_id_to_idx,
#         base_dir=baseDir
#     )
# ####################################################################################################

In [None]:
# # ---------------- ENTRY POINT ----------------
# trainParams = {
#     "loss": "mse",
#     "optimizer": "adam",
#     "learning_rate": 0.0001,
#     "metrics": ["mae"],
#     "epochs": 40,
#     "batch_size": 32,
#     "validation_split": 0.1
# }

# buildParams_neurons_sigmoid = {
#     "embedding_dim": 32,
#     "hiddenLayers": [1],
#     "output_activation": "sigmoid"
# }

# paramSets = HiddenLayerParamSetBuilder.BuildHiddenLayerSizeSets(buildParams_neurons_sigmoid, 10, 5, 512)

# for eachBuildParams in paramSets:
#     print(f"{eachBuildParams['hiddenLayers']}")
#     runExp(combined_df, encoded_df, eachBuildParams, trainParams, "exp/keras/nuerons_sizes")


In [None]:
# def export_df(dataframes, dir):
#     for name, df in dataframes.items():
#         csv_path = os.path.join(dir, f"{name}.csv")
#         df.to_csv(csv_path, index=True)
# #
# def build_and_compile_model(featColsCount, itemCount, params):
#     num_in = layers.Input(shape=(featColsCount,))
#     item_in = layers.Input(shape=(), dtype="int32")

#     emb = layers.Embedding(
#         input_dim=itemCount,
#         output_dim=params["embedding_dim"]
#     )(item_in)

#     x = layers.Concatenate()([num_in, layers.Flatten()(emb)])

#     for units in params["hiddenLayers"]:
#         x = layers.Dense(units, activation="relu")(x)

#     out = layers.Dense(
#         1,
#         activation=params.get("output_activation", "sigmoid")
#     )(x)

#     model = models.Model([num_in, item_in], out)

#     optimizer_name = params.get("optimizer", "adam")
#     learning_rate = params.get("learning_rate", 0.001)

#     if optimizer_name == "adam":
#         optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     else:
#         optimizer = optimizer_name

#     model.compile(
#         optimizer=optimizer,
#         loss=params.get("loss", "mse"),
#         metrics=params.get("metrics", ["mae"])
#     )

#     return model
# ##########################################################################################

# def train_model(model, encoded_df, params):
   
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]

#     featureCols = encoded_df[numeric_cols].to_numpy(np.float32)
#     indexCol = encoded_df["itemIdx"].to_numpy(np.int32)
#     targetVar  = encoded_df["due_score"].to_numpy(np.float32)

#     featuresCols_train, featuresCols_test, indexCol_train, indexCol_test, targetVar_tr, targetVar_te = train_test_split(featureCols, indexCol, targetVar, test_size=0.2, random_state=42)

#     history = model.fit(
#         [featuresCols_train, indexCol_train],
#         targetVar_tr,
#         validation_split=0.1,
#         epochs=params["epochs"],
#         batch_size=32,
#         verbose=1
#     )

#     return history
# ##########################################################################################

# def build_prediction_df(encoded_df, combined_df, feature_stats, predict_date):

#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]

#     last_trip_date_by_item = (combined_df.sort_values("date").groupby("itemId")["date"].last())

#     last_purchase_date_by_item = (combined_df[combined_df["didBuy"] == 1].sort_values("date").groupby("itemId")["date"].last())

#     item_lookup = (combined_df[["itemId", "item"]].drop_duplicates().set_index("itemId")["item"].to_dict())

#     rows = []

#     for itemId, hist in encoded_df.groupby("itemId"):
#         last = hist.iloc[-1]

#         last_trip_date = pd.to_datetime(
#             last_trip_date_by_item.loc[itemId]
#         ).normalize()

#         last_purchase_date = pd.to_datetime(
#             last_purchase_date_by_item.loc[itemId]
#         ).normalize()

#         row = {
#             "itemId": itemId,
#             "item": item_lookup.get(itemId, "UNKNOWN"),
#             "itemIdx": int(last["itemIdx"])
#         }

#         # carry-forward observed features
#         for col in numeric_cols:
#             if (
#                 "temp_" in col or
#                 "feelsLike_" in col or
#                 "dew_" in col or
#                 "humidity_" in col or
#                 "precip_" in col or
#                 "freq_" in col or
#                 "habit" in col or
#                 "avgDaysBetweenPurchases" in col
#             ):
#                 row[col] = last[col]

#         raw_updates = {
#             "daysSinceLastTrip": (predict_date - last_trip_date).days,
#             "daysSinceLastPurchase": (predict_date - last_purchase_date).days,
#             "daysUntilNextHoliday": HolidayFeatures.daysUntilNextHoliday(predict_date),
#             "daysSinceLastHoliday": HolidayFeatures.daysSinceLastHoliday(predict_date),
#             "holidayProximityIndex": HolidayFeatures.holidayProximityIndex(predict_date),
#             "daysUntilSchoolStart": HolidayFeatures.daysUntilSchoolStart(predict_date),
#             "daysUntilSchoolEnd": HolidayFeatures.daysUntilSchoolEnd(predict_date),
#             "schoolSeasonIndex": HolidayFeatures.schoolSeasonIndex(predict_date),
#             "year": predict_date.year,
#             "day": predict_date.day,
#             "quarter": predict_date.quarter,
#             "dow_sin": np.sin(2 * np.pi * predict_date.weekday() / 7),
#             "dow_cos": np.cos(2 * np.pi * predict_date.weekday() / 7),
#             "month_sin": np.sin(2 * np.pi * predict_date.month / 12),
#             "month_cos": np.cos(2 * np.pi * predict_date.month / 12),
#             "doy_sin": np.sin(2 * np.pi * predict_date.dayofyear / 365),
#             "doy_cos": np.cos(2 * np.pi * predict_date.dayofyear / 365),
#         }

#         for raw, val in raw_updates.items():
#             norm_col = raw + "_norm"
#             if norm_col in numeric_cols and raw in feature_stats:
#                 stats = feature_stats[raw]
#                 row[norm_col] = (val - stats["mean"]) / stats["std"]

#         rows.append(row)

#     return pd.DataFrame(rows)
#     ######################################################################################
    
# def run_predictions(model, encoded_df, combined_df, feature_stats, predict_date=None):

#         if predict_date is None:
#             predict_date = pd.Timestamp.today().normalize()
#         else:
#             predict_date = pd.to_datetime(predict_date).normalize()
    
#         pred_df = build_prediction_df(
#             encoded_df, combined_df, feature_stats, predict_date
#         )
    
#         numeric_cols = [
#             c for c in pred_df.columns
#             if c.endswith("_norm")
#         ]
    
#         featureCols = pred_df[numeric_cols].to_numpy(np.float32)
#         indexCol = pred_df["itemIdx"].to_numpy(np.int32)
    
#         scores = model.predict([featureCols, indexCol], verbose=0).ravel()
#         pred_df["due_intensity"] = scores
    
#         return (
#             pred_df[["itemId", "item", "due_intensity"]]
#             .sort_values("due_intensity", ascending=False)
#             .reset_index(drop=True)
#         )
# #####################################################################################


# def BuildParamSets( baseline_params, property_name, start, step, stop):
#     """
#     Creates multiple fully independent parameter dictionaries by varying one property.
#     Each iteration produces a brand-new baseline object.
#     """
#     import copy
#     results = []

#     value = start
#     while value <= stop:
#         params_copy = copy.deepcopy(baseline_params)
#         params_copy[property_name] = value
#         results.append(params_copy)
#         value += step

#     return results
# ###############################################################################

# def runExp(feature_stats, combined_df, encoded_df, buildParams, trainParams, baseDir):
#     #
#     # item index
#     item_ids = sorted(encoded_df["itemId"].unique())
#     item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
#     encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
#     itemCount = len(item_ids)
#     #
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]
#     featColsCount = len(numeric_cols)
#     #
#     model = build_and_compile_model(featColsCount, itemCount, buildParams)
#     #
#     history = train_model(model, encoded_df, trainParams)
#     #
              
#     predictions = run_predictions(model, encoded_df, combined_df, feature_stats)
#     # 
#     dataframes = {
#         "predictions": predictions,
#         "encoded_features": encoded_df,
#         "combined_df": combined_df
#     }
#     save_experiment(model, history, dataframes, buildParams, trainParams, numeric_cols, item_id_to_idx, base_dir= baseDir)
# ################################################################################################################################

# import multiprocessing as mp
# def run_param_sets_multiprocess(buildParamsSets, trainParams, max_parallel, feature_stats, combined_df,encoded_df, baseDir ):
#     #
#     processes = []

#     for buildParams in buildParamsSets:
#         p = mp.Process(
#             target=runExp,
#             args=(feature_stats, combined_df, encoded_df, buildParams, trainParams, baseDir)
#         )
#         p.start()
#         processes.append(p)

#         # limit concurrency
#         if len(processes) >= max_parallel:
#             for proc in processes:
#                 proc.join()
#             processes = []

#     # wait for remaining
#     for proc in processes:
#         proc.join()
# ################################################################################################################################

In [None]:
# buildParams_embeddingsTest = {
#     "embedding_dim": 1,
#     "hiddenLayers": [512],
#     "output_activation": "sigmoid"
# }

# # buildParams_embeddingsTest_relu = {
# #     "embedding_dim": 1,
# #     "hiddenLayers": [1024],
# #     "output_activation": "relu"
# # }


# trainParams = {
#     "loss": "mse",
#     "optimizer": "adam",
#     "learning_rate": 0.0001,
#     "metrics": ["mae"],
#     "epochs": 40,
#     "batch_size": 32,
#     "validation_split": 0.1
# }

# # build sets
# paramSets = BuildParamSets(buildParams_embeddingsTest, "embedding_dim", 33, 2, 64)
# # run
# run_param_sets_multiprocess(paramSets, trainParams, 4, feature_stats, combined_df,encoded_df, "exp_mp")
# #paramSets_embeddingeTest_relu = BuildParamSets(buildParams_embeddingsTest_relu, "embedding_dim", 1, 2, 32)

    

In [None]:



# def run_predictions( model, encoded_df, combined_df, feature_stats, predict_date=None):
#     """
#     Build one prediction row per item using:
#     - latest encoded feature state (encoded_df)
#     - raw timeline + names (combined_df)
#     - recomputed calendar features at predict_date
#     """

#     if predict_date is None:
#         predict_date = pd.Timestamp.today().normalize()
#     else:
#         predict_date = pd.to_datetime(predict_date).normalize()

#     # --------------------------------------------------------
#     # Discover numeric features (single source: encoded_df)
#     # --------------------------------------------------------
#     numeric_cols = [
#         c for c in encoded_df.columns
#         if c.endswith("_norm") and c != "due_score"
#     ]

#     # --------------------------------------------------------
#     # Lookups from combined_df (single source of truth)
#     # --------------------------------------------------------
#     last_date_by_item = (
#         combined_df
#         .sort_values("date")
#         .groupby("itemId")["date"]
#         .last()
#     )

#     item_lookup = (
#         combined_df[["itemId", "item"]]
#         .drop_duplicates()
#         .set_index("itemId")["item"]
#         .to_dict()
#     )

#     rows = []

#     for itemId, hist in encoded_df.groupby("itemId"):
#         last = hist.iloc[-1]
#         last_date = pd.to_datetime(last_date_by_item.loc[itemId]).normalize()

#         row = {
#             "itemId": itemId,
#             "item": item_lookup.get(itemId, "UNKNOWN"),
#             "itemIdx": int(last["itemIdx"])
#         }

#         # ----------------------------------------------------
#         # Copy model-stable numeric features (already normalized)
#         # ----------------------------------------------------
#         for col in numeric_cols:
#             row[col] = last[col]

#         # ----------------------------------------------------
#         # Recompute DATE-SENSITIVE features
#         # ----------------------------------------------------
#         raw_updates = {
#             "daysSinceLastPurchase": (predict_date - last_date).days,
#             "daysUntilNextHoliday": HolidayFeatures.daysUntilNextHoliday(predict_date),
#             "daysSinceLastHoliday": HolidayFeatures.daysSinceLastHoliday(predict_date),
#             "holidayProximityIndex": HolidayFeatures.holidayProximityIndex(predict_date),
#             "daysUntilSchoolStart": HolidayFeatures.daysUntilSchoolStart(predict_date),
#             "daysUntilSchoolEnd": HolidayFeatures.daysUntilSchoolEnd(predict_date),
#             "schoolSeasonIndex": HolidayFeatures.schoolSeasonIndex(predict_date),
#             "year": predict_date.year,
#             "day": predict_date.day,
#             "quarter": predict_date.quarter
#         }

  
#         # ----------------------------------------------------
#         # Normalize recomputed features
#         # ----------------------------------------------------
#         for raw, val in raw_updates.items():
#             norm_col = raw + "_norm"
#             if norm_col in numeric_cols and raw in feature_stats:
#                 stats = feature_stats[raw]
#                 row[norm_col] = (val - stats["mean"]) / stats["std"]

#         rows.append(row)

#     pred_df = pd.DataFrame(rows)

#     Xn = pred_df[numeric_cols].to_numpy(np.float32)
#     Xi = pred_df["itemIdx"].to_numpy(np.int32)

#     scores = model.predict([Xn, Xi], verbose=0).ravel()

#     pred_df["due_intensity"] = scores

#     return (
#         pred_df[["itemId", "item", "due_intensity"]]
#         .sort_values("due_intensity", ascending=False)
#         .reset_index(drop=True)
#      )
# ###############################################################################



# tf.keras.backend.clear_session()


# # ------------------------------------------------------------
# # ENSURE itemIdx
# # ------------------------------------------------------------
# item_ids = sorted(encoded_df["itemId"].unique())
# item_id_to_idx = {iid: i for i, iid in enumerate(item_ids)}
# encoded_df["itemIdx"] = encoded_df["itemId"].map(item_id_to_idx).astype("int32")
# NUM_ITEMS = len(item_ids)

# # ------------------------------------------------------------
# # FEATURES / TARGET
# # ------------------------------------------------------------
# numeric_cols = [
#     c for c in encoded_df.columns
#     if c.endswith("_norm") and c != "due_score"
# ]

# Xn = encoded_df[numeric_cols].to_numpy(np.float32)
# Xi = encoded_df["itemIdx"].to_numpy(np.int32)
# y  = encoded_df["due_score"].to_numpy(np.float32)

# # ------------------------------------------------------------
# # SPLIT
# # ------------------------------------------------------------
# Xn_tr, Xn_te, Xi_tr, Xi_te, y_tr, y_te = train_test_split(
#     Xn, Xi, y, test_size=0.2, random_state=42
# )

# # ------------------------------------------------------------
# # MODEL
# # ------------------------------------------------------------
# num_in = layers.Input(shape=(Xn_tr.shape[1],))
# itm_in = layers.Input(shape=(), dtype="int32")

# emb = layers.Embedding(NUM_ITEMS, 64)(itm_in)
# emb = layers.Flatten()(emb)

# x = layers.Concatenate()([num_in, emb])
# x = layers.Dense(4096, activation="relu")(x)
# #x = layers.Dense(2048, activation="relu")(x)
# out = layers.Dense(1, activation="sigmoid")(x)

# model = models.Model([num_in, itm_in], out)
# model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss="mse", metrics=["mae"])

# history = model.fit(
#     [Xn_tr, Xi_tr],
#     y_tr,
#     validation_split=0.1,
#     epochs=10,
#     batch_size=32,
#     verbose=1
# )

# # ------------------------------------------------------------
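# # FEATURE STATS (ONLY recomputed features)
# # NOTE(review): mean/std below are taken from the *_norm columns, but
# # inference applies them to RAW recomputed values: (val - mean) / std.
# # If the *_norm columns are already standardized (presumably mean ~0,
# # std ~1 -- confirm), this leaves inference values effectively unscaled;
# # the stats should then come from the raw columns, or from the scaler
# # that produced *_norm. Also, the birthday entries in RECOMPUTED have no
# # counterpart in the inference-side raw_updates dict -- verify.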
# # ------------------------------------------------------------
# feature_stats = {}
# RECOMPUTED = [
#     "daysSinceLastPurchase",
#     "daysUntilNextHoliday",
#     "daysSinceLastHoliday",
#     "holidayProximityIndex",
#     "daysUntilSchoolStart",
#     "daysUntilSchoolEnd",
#     "schoolSeasonIndex",
#     "year", "day", "quarter",
#     "daysUntilBirthday_steve", "daysSinceBirthday_steve",
#     "daysUntilBirthday_maggie", "daysSinceBirthday_maggie",
#     "daysUntilBirthday_mil", "daysSinceBirthday_mil",
#     "daysUntilBirthday_angie", "daysSinceBirthday_angie",
# ]

# for raw in RECOMPUTED:
#     col = raw + "_norm"
#     if col in encoded_df.columns:
#         std = encoded_df[col].std()
#         feature_stats[raw] = {
#             "mean": encoded_df[col].mean(),
#             "std": std if std != 0 else 1.0
#         }

