In [18]:

from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from datetime import datetime

from winn_dixie_recpt_parser import WinnDixieRecptParser 

pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:.6f}")

print(os.getcwd())


C:\Users\steve\OneDrive - NOLA Business IT\source\repos\grocery-ml


In [19]:
def show_grouped(grouped, rows=10):
    # collect only the daysSinceLastPurchase_* columns
    feature_cols = [c for c in grouped.columns if c.startswith("daysSinceLastPurchase_")]

    for i in range(min(rows, len(grouped))):
        print("Row:", i)
        print("Date:", grouped.iloc[i]["date"])
        print("Time:", grouped.iloc[i]["time"])
        print("Items:", grouped.iloc[i]["item"])
        print("------ daysSinceLastPurchase ------")

        for col in feature_cols:
            print(f"{col}: {grouped.iloc[i][col]}")

        print("-----------------------------------")


def show_encoded(encoded_df, rows=10):
    # Identify columns
    days_cols = [c for c in encoded_df.columns if c.startswith("daysSinceLastPurchase_")]
    weather_cols = [c for c in encoded_df.columns if c.endswith("_5day_avg")]
    item_cols = [
        c for c in encoded_df.columns 
        if c not in days_cols 
        and c not in weather_cols
        and c not in ["date", "time"]
    ]

    for i in range(min(rows, len(encoded_df))):
        print("Row:", i)
        print("Date:", encoded_df.iloc[i]["date"])
        print("Time:", encoded_df.iloc[i]["time"])

        # Show the items purchased (reverse one-hot)
        purchased_items = []
        row_vals = encoded_df.iloc[i]

        for item in item_cols:
            if row_vals[item] == 1:
                purchased_items.append(item)

        print("Items:", purchased_items)

        print("------ daysSinceLastPurchase ------")
        for col in days_cols:
            print(f"{col}: {encoded_df.iloc[i][col]}")

        print("------ weather (rolling windows) ------")
        for col in weather_cols:
            print(f"{col}: {encoded_df.iloc[i][col]}")

        print("-----------------------------------")


def remove_duplicate_receipt_files(df):
    """
    Remove whole source files that contain an identical receipt
    to another file with the same date+time.
    Minimal console output. Resets index at end.
    """

    df["__signature"] = (
        df["date"].astype(str) + "|" +
        df["time"].astype(str) + "|" +
        df["item"].astype(str) + "|" +
        df["qty"].astype(str) + "|" +
        df["youPay"].astype(str) + "|" +
        df["reg"].astype(str) + "|" +
        df["reportedItemsSold"].astype(str) + "|" +
        df["cashier"].astype(str) + "|" +
        df["manager"].astype(str)
    )

    keep_sources = set()

    for (dt_date, dt_time), group in df.groupby(["date", "time"]):

        # Build signature per source
        source_signatures = {}
        for source, rows in group.groupby("source"):
            sig = tuple(sorted(rows["__signature"].tolist()))
            source_signatures[source] = sig

        # signature → list of sources
        signature_groups = {}
        for src, sig in source_signatures.items():
            signature_groups.setdefault(sig, []).append(src)

        # Handle duplicates
        for sig, sources in signature_groups.items():
            if len(sources) == 1:
                keep_sources.add(sources[0])
                continue

            sorted_sources = sorted(sources)
            kept = sorted_sources[0]
            removed = sorted_sources[1:]

            # Minimal output
            print(f"DUP: {dt_date} {dt_time} → keep {kept} ← drop {', '.join(removed)}")

            keep_sources.add(kept)

    # Filter and clean
    result = df[df["source"].isin(keep_sources)].copy()
    result.drop(columns=["__signature"], inplace=True)

    # ✔ Reset index here
    result.reset_index(drop=True, inplace=True)

    return result


# Weather

In [20]:
weatherCols=["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv("VisualCrossing-70062 2000-01-01 to 2025-12-14.csv", usecols=weatherCols);
df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime")
df_weather = df_weather.sort_index()
#df_weather.head(10)
df_weather.info()
#print(df_weather.isna().sum())
#print(df_weather[df_weather.isna().any(axis=1)])
## 
df_weather["temp_5day_avg"] = df_weather["temp"].rolling(5, min_periods=1).mean()
df_weather["feelsLike_5day_avg"] = df_weather["feelslike"].rolling(5, min_periods=1).mean()
df_weather["dew_5day_avg"] = df_weather["dew"].rolling(5, min_periods=1).mean()
df_weather["humidity_5day_avg"] = df_weather["humidity"].rolling(5, min_periods=1).mean()
df_weather["precip_5day_avg"] = df_weather["precip"].rolling(5, min_periods=1).mean()
df_weather = df_weather.drop(columns=["temp", "humidity", "feelslike", "dew", "precip"])
df_weather.head(5)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9480 entries, 2000-01-01 to 2025-12-14
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   temp       9480 non-null   float64
 1   feelslike  9480 non-null   float64
 2   dew        9480 non-null   float64
 3   humidity   9480 non-null   float64
 4   precip     9480 non-null   float64
dtypes: float64(5)
memory usage: 444.4 KB


Unnamed: 0_level_0,temp_5day_avg,feelsLike_5day_avg,dew_5day_avg,humidity_5day_avg,precip_5day_avg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-01,57.8,57.8,54.5,89.7,0.0
2000-01-02,63.5,63.5,60.05,89.2,0.0
2000-01-03,66.633333,66.633333,63.033333,88.7,0.0
2000-01-04,63.15,62.7,57.15,81.875,0.135
2000-01-05,59.32,58.06,51.76,77.28,0.108


# DF.TRIPS

In [21]:

rows = []

recptParser  = WinnDixieRecptParser();

for p in Path("rcpts/StevePhone2/pdf/text").glob("*.txt"):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    for r in result["items"]:
        rows.append({
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            "manager": result["manager"],
            "cashier": result["cashier"],
            "item": r["item"],
            "qty": r["qty"],
            "reg": r["reg"],
            "youPay": r["youPay"],
            "reportedItemsSold": result["reported"],
            #"rowsMatchReported": result["validation"]["rowsMatchReported"],
            "qtyMatchReported": result["validation"]["qtyMatchReported"],
        })

df = pd.DataFrame(rows)
df["date"] = pd.to_datetime(df["date"])
df["time"] = df["time"].astype(str)
df = remove_duplicate_receipt_files(df)
df = df.sort_values(by=["date", "time"]).reset_index(drop=True)
df.head(5)


DUP: 2025-02-19 00:00:00 7:20 PM → keep IMG_9734.txt ← drop IMG_9735.txt
DUP: 2025-04-08 00:00:00 11:50 AM → keep IMG_9723.txt ← drop IMG_9724.txt
DUP: 2025-05-15 00:00:00 8:19 PM → keep IMG_9713.txt ← drop IMG_9714.txt
DUP: 2025-08-02 00:00:00 10:29 PM → keep IMG_9693.txt ← drop IMG_9694.txt
DUP: 2025-10-07 00:00:00 6:06 PM → keep IMG_0017.txt ← drop IMG_9669.txt
DUP: 2025-10-14 00:00:00 6:08 PM → keep IMG_0014.txt ← drop IMG_9666.txt
DUP: 2025-10-17 00:00:00 9:23 PM → keep IMG_0012.txt ← drop IMG_9663.txt


Unnamed: 0,source,date,time,manager,cashier,item,qty,reg,youPay,reportedItemsSold,qtyMatchReported
0,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,spaghettios-pasta,2,3.18,2.5,8,True
1,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,mars-chocolate,1,6.49,6.49,8,True
2,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,mandms-candies,1,1.79,1.79,8,True
3,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,mandarins,1,6.99,5.99,8,True
4,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,sparkling-ice-wtr,1,1.29,1.0,8,True


In [22]:
def purchases_per_window(df, item, window_days):
    """
    Returns how many times `item` is purchased per `window_days`.
    """
    item_df = df[df["item"] == item]

    if len(item_df) == 0:
        return 0.0

    total_days = (df["date"].max() - df["date"].min()).days + 1
    total_purchases = len(item_df)

    if total_days == 0:
        return 0.0

    return (total_purchases / total_days) * float(window_days)
#################################################################

# Choose windows you want
#windows = [14, 30, 365]

# Make sure "item" is a column with item names
unique_items = df["item"].unique()

# Initialize empty columns
#df[f"freq_14_day"] = 0.0

# Fill them
for item in unique_items:
        freq = purchases_per_window(df, item, 14)
        df.loc[df["item"] == item, f"freq__14_days"] = freq
        #
        freq = purchases_per_window(df, item, 30)
        df.loc[df["item"] == item, f"freq_30_days"] = freq
        #
        freq = purchases_per_window(df, item, 90)
        df.loc[df["item"] == item, f"freq_90_days"] = freq
        #
        freq = purchases_per_window(df, item, 365)
        df.loc[df["item"] == item, f"freq_365_days"] = freq


df.head(5)

Unnamed: 0,source,date,time,manager,cashier,item,qty,reg,youPay,reportedItemsSold,qtyMatchReported,freq__14_days,freq_30_days,freq_90_days,freq_365_days
0,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,spaghettios-pasta,2,3.18,2.5,8,True,0.445623,0.954907,2.864721,11.618037
1,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,mars-chocolate,1,6.49,6.49,8,True,0.037135,0.079576,0.238727,0.96817
2,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,mandms-candies,1,1.79,1.79,8,True,0.074271,0.159151,0.477454,1.93634
3,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,mandarins,1,6.99,5.99,8,True,0.037135,0.079576,0.238727,0.96817
4,IMG_9764.txt,2024-11-15,8:13 PM,,STEPHEN,sparkling-ice-wtr,1,1.29,1.0,8,True,1.151194,2.466844,7.400531,30.013263


Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "C:\Users\steve\.conda\envs\intercard-ml\Lib\asyncio\events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\steve\.conda\envs\intercard-ml\Lib\asyncio\selector_events.py", line 132, in _read_from_self
    data = self._ssock.recv(4096)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "C:\Users\steve\.conda\envs\intercard-ml\Lib\asyncio\events.py", line 89, in _run
    self._context.run(self._callback, *self._args)
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\steve\.conda\envs\intercard-ml\Lib\asyncio\selec

In [None]:
# -------------  Merge weather and trips  ------------

df_merged = df.merge(df_weather, left_on="date",  right_index=True, how="left")

df_merged.to_csv("df_merged.csv", index=False, encoding="utf-8")
df_merged.head(5)


In [6]:
# Transpose / group
grouped = df_merged.groupby(["date", "time"]).agg({
    "item": list,
    "temp_5day_avg": "first",
    "feelsLike_5day_avg": "first",
    "dew_5day_avg": "first",
    "humidity_5day_avg": "first",
    "precip_5day_avg": "first"
}).reset_index()

In [7]:
# ------------------- NEW daysSinceLastTrip -------------------
days_since_trip = []
prev_date = None

for idx in range(len(grouped)):
    current_date = grouped.iloc[idx]["date"]
    if prev_date is None:
        days_since_trip.append(0)
    else:
        days_since_trip.append((current_date - prev_date).days)
    prev_date = current_date

grouped["daysSinceLastTrip"] = days_since_trip
# -------------------------------------------------------------

In [8]:
# unique items
unique_items = set()
for item_list in grouped["item"]:
    for item in item_list:
        unique_items.add(item)
unique_items = sorted(unique_items)

# ---- Build ALL daysSinceLastPurchase_* columns in one pass ----
days_cols_dict = {}

for item in unique_items:
    last_date_seen = None
    values = []

    for idx in range(len(grouped)):
        current_date = grouped.iloc[idx]["date"]
        items_this_trip = grouped.iloc[idx]["item"]

        if item in items_this_trip:
            last_date_seen = current_date

        if last_date_seen is None:
            values.append(0)
        else:
            values.append((current_date - last_date_seen).days)

    days_cols_dict["daysSinceLastPurchase_" + item] = values

# Add all columns in one concat (fixes fragmentation warning)
grouped = pd.concat([grouped, pd.DataFrame(days_cols_dict)], axis=1)

# build index mapping
item_to_index = {}
for i in range(len(unique_items)):
    item_to_index[unique_items[i]] = i


grouped.to_csv("grouped.csv", index=False)


In [9]:
# ============================================================
# 1. BUILD ONE-HOT ITEMS + MERGE WITH GROUPED DATA
# ============================================================

num_items = len(unique_items)
vectors = []

for item_list in grouped["item"]:
    vector = np.zeros(num_items, dtype=np.int32)
    for name in item_list:
        index = item_to_index[name]
        vector[index] = 1
    vectors.append(vector)

encoded_items_df = pd.DataFrame(vectors, columns=unique_items)

# columns from grouped
days_cols = [c for c in grouped.columns if c.startswith("daysSinceLastPurchase_")]
weather_cols = [c for c in grouped.columns if "_5day" in c]
freq_cols = [c for c in encoded_df.columns if c.startswith("freq_")]

encoded_df = pd.concat(
    [
        grouped[["date", "time", "daysSinceLastTrip"]],
        grouped[days_cols],
        grouped[weather_cols],
        grouped[freq_cols],
        encoded_items_df
    ],
    axis=1
)


show_encoded(encoded_df)

# ============================================================
# 2. SCALE WEATHER COLUMNS + daysSinceLastPurchase_* COLUMNS
# ============================================================

purchase_age_cols = [c for c in encoded_df.columns if c.startswith("daysSinceLastPurchase_")]
weather_cols = [c for c in encoded_df.columns if c.endswith("_5day_avg")]

numeric_extra_cols = purchase_age_cols + weather_cols

for col in numeric_extra_cols:
    mean = encoded_df[col].mean()
    std = encoded_df[col].std()
    if std == 0:
        std = 1.0
    encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std

# Drop raw unscaled versions
encoded_df = encoded_df.drop(columns=numeric_extra_cols)



# ============================================================
# 3. BUILD DATETIME FEATURES
# ============================================================

encoded_df["dateTime"] = pd.to_datetime(
    encoded_df["date"].astype(str) + " " + encoded_df["time"].astype(str),
    errors="coerce"
)

encoded_df = encoded_df.drop(columns=["date", "time"])

dt = encoded_df["dateTime"]

encoded_df["year"]    = dt.dt.year
encoded_df["month"]   = dt.dt.month
encoded_df["day"]     = dt.dt.day
encoded_df["hour"]    = dt.dt.hour
encoded_df["minute"]  = dt.dt.minute
encoded_df["dow"]     = dt.dt.dayofweek
encoded_df["doy"]     = dt.dt.dayofyear
encoded_df["quarter"] = dt.dt.quarter



# ============================================================
# 4. BUILD CYCLICAL FEATURES
# ============================================================

encoded_df["hour_sin"] = np.sin(2 * np.pi * encoded_df["hour"] / 24.0)
encoded_df["hour_cos"] = np.cos(2 * np.pi * encoded_df["hour"] / 24.0)

encoded_df["minute_sin"] = np.sin(2 * np.pi * encoded_df["minute"] / 60.0)
encoded_df["minute_cos"] = np.cos(2 * np.pi * encoded_df["minute"] / 60.0)

encoded_df["dow_sin"] =   np.sin(2 * np.pi * encoded_df["dow"] / 7.0)
encoded_df["dow_cos"] =   np.cos(2 * np.pi * encoded_df["dow"] / 7.0)

encoded_df["month_sin"] = np.sin(2 * np.pi * encoded_df["month"] / 12.0)
encoded_df["month_cos"] = np.cos(2 * np.pi * encoded_df["month"] / 12.0)

encoded_df["doy_sin"] =   np.sin(2 * np.pi * encoded_df["doy"] / 365.0)
encoded_df["doy_cos"] =   np.cos(2 * np.pi * encoded_df["doy"] / 365.0)

encoded_df = encoded_df.drop(columns=["month", "hour", "minute", "dow", "doy"])



# ============================================================
# 5. SCALE REMAINING NON-CYCLIC TIME FEATURES
# ============================================================

noncyc_cols = ["year", "day", "quarter", "daysSinceLastTrip"]

for col in noncyc_cols:
    mean = encoded_df[col].mean()
    std = encoded_df[col].std()
    if std == 0:
        std = 1.0
    encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std

encoded_df = encoded_df.drop(columns=["year", "day", "quarter", "daysSinceLastTrip"])



# ============================================================
# 6. DROP DATETIME + SAVE
# ============================================================

encoded_df = encoded_df.drop(columns=["dateTime"])
encoded_df.to_csv("encoded.csv", index=False)


Row: 0
Date: 2024-11-15 00:00:00
Time: 8:13 PM
Items: ['coca-cola', 'mandarins', 'mandms-candies', 'mars-chocolate', 'spaghettios-pasta', 'sparkling-ice-wtr']
------ daysSinceLastPurchase ------
daysSinceLastPurchase_-dr-pepper-12pk: 0
daysSinceLastPurchase_-hugbi-pies: 0
daysSinceLastPurchase_-sno-balls-to-go: 0
daysSinceLastPurchase_-sparkling-ice-wtr: 0
daysSinceLastPurchase_-yellow-corn: 0
daysSinceLastPurchase_12cookiecake: 0
daysSinceLastPurchase_15ct-oatmeal-raisi: 0
daysSinceLastPurchase_18ct-variety-mix: 0
daysSinceLastPurchase_4-star-roast-beef: 0
daysSinceLastPurchase_6ct-buttercream-cu: 0
daysSinceLastPurchase_aandh-frsh-escp: 0
daysSinceLastPurchase_aandh-scent-boosters: 0
daysSinceLastPurchase_alm-petit-four: 0
daysSinceLastPurchase_andes-snap-bar: 0
daysSinceLastPurchase_annie-chuns-bowl: 0
daysSinceLastPurchase_apple-jack-donut: 0
daysSinceLastPurchase_arizona-tea: 0
daysSinceLastPurchase_arm-and-hammer-dtgnt: 0
daysSinceLastPurchase_armour-crunchers: 0
daysSinceLastPur

  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_norm"] = (encoded_df[col] - mean) / std
  encoded_df[col + "_nor

# TRAIN !!!

In [10]:
# TIME FEATURES
time_feature_cols = [
    "hour_sin", "hour_cos",
    "minute_sin", "minute_cos",
    "dow_sin", "dow_cos",
    "month_sin", "month_cos",
    "doy_sin", "doy_cos",
    "year_norm", "day_norm", "quarter_norm",
    "daysSinceLastTrip_norm"
]

# AUTO-DETECT WEATHER + PURCHASE AGE NORMALIZED FEATURES
weather_norm_cols = [c for c in encoded_df.columns if c.endswith("_5day_avg_norm")]
purchase_norm_cols = [
    c for c in encoded_df.columns 
    if c.startswith("daysSinceLastPurchase_") and c.endswith("_norm")
]

# FINAL INPUT FEATURE LIST
input_feature_cols = time_feature_cols + weather_norm_cols + purchase_norm_cols

# Define item columns (must be 0/1 binary purchase values)
item_cols = [
    c for c in encoded_df.columns
    if c not in input_feature_cols
    and c not in ["date", "time", "dateTime"]
    and encoded_df[c].dropna().isin([0,1]).all()
]

# keep items >=5 purchases
frequent_items = [col for col in item_cols if encoded_df[col].sum() >= 5]

# Y MUST COME FROM frequent_items
y = encoded_df[frequent_items].to_numpy(dtype=np.float32)

# X MUST COME FROM THE INPUT FEATURE COLS
X = encoded_df[input_feature_cols].to_numpy(dtype=np.float32)


In [11]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# DEFINE THE MODEL
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    #layers.Dense(128, activation="relu"),
    #layers.Dense(128, activation="relu"),
    layers.Dense(output_dim, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(threshold=0.5, name="bin_acc"),
        tf.keras.metrics.AUC(curve="ROC", name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
    ]
)

# CLASS WEIGHTS FOR IMBALANCE
class_weights = {}
for i in range(len(frequent_items)):      # <-- FIXED
    positives = y_train[:, i].sum()
    negatives = len(y_train) - positives

    w = negatives / (positives + 1e-6)

    if w > 50:
        w = 50.0

    class_weights[i] = w

history = model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=32,
    validation_split=0.1,
    class_weight=class_weights,
    verbose=1
)


Epoch 1/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 166ms/step - auc: 0.4965 - bin_acc: 0.5439 - loss: 8.0910 - precision: 0.0743 - recall: 0.4341 - val_auc: 0.5571 - val_bin_acc: 0.6895 - val_loss: 0.6040 - val_precision: 0.1395 - val_recall: 0.3636
Epoch 2/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - auc: 0.5329 - bin_acc: 0.7491 - loss: 6.1770 - precision: 0.0850 - recall: 0.2341 - val_auc: 0.5645 - val_bin_acc: 0.8105 - val_loss: 0.5053 - val_precision: 0.1795 - val_recall: 0.2121
Epoch 3/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - auc: 0.5608 - bin_acc: 0.8410 - loss: 4.9269 - precision: 0.1036 - recall: 0.1415 - val_auc: 0.5718 - val_bin_acc: 0.8399 - val_loss: 0.4451 - val_precision: 0.1667 - val_recall: 0.1212
Epoch 4/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - auc: 0.5793 - bin_acc: 0.8801 - loss: 3.9176 - precision: 0.1274 - recall: 0.0976 - val_auc: 0.57

In [12]:
# Get predictions for the test set
y_pred = model.predict(X_test)

# Apply threshold
threshold = 0.5
y_pred_bin = (y_pred >= threshold).astype(int)


#pd.DataFrame(y_pred_bin, columns=frequent_items).head(100)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


In [13]:
import numpy as np
import pandas as pd

def predict_next_trip(model, encoded_df, input_feature_cols, frequent_items):
    """
    Build a single input row for prediction using the most recent trip in encoded_df.
    Returns a sorted DataFrame of predicted probabilities.
    """

    # ------------------------------
    # 1. Get the most recent row (latest trip)
    # ------------------------------
    last = encoded_df.iloc[-1]

    # ------------------------------
    # 2. Build a new row using last-known feature values
    # ------------------------------
    x = {}

    for col in input_feature_cols:
        if col in encoded_df.columns:
            x[col] = last[col]
        else:
            # safety: unknown column
            x[col] = 0.0

    # Convert to model input shape
    X_input = np.array([x[col] for col in input_feature_cols], dtype=np.float32)
    X_input = X_input.reshape(1, -1)

    # ------------------------------
    # 3. Predict probabilities
    # ------------------------------
    y_pred = model.predict(X_input)[0]   # shape: (num_items,)

    # ------------------------------
    # 4. Build labeled output table
    # ------------------------------
    result = pd.DataFrame({
        "item": frequent_items,
        "probability": y_pred
    })

    # Sort highest-probability first
    result = result.sort_values(by="probability", ascending=False)

    return result


In [14]:
pred = predict_next_trip(
    model=model,
    encoded_df=encoded_df,
    input_feature_cols=input_feature_cols,
    frequent_items=frequent_items
)

print(pred.head(20))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
                  item  probability
21    lunchables-snack     1.000000
20  lotus-cookie-buttr     1.000000
15          kandl-milk     0.942211
29      smuckers-jelly     0.003075
23      nutella-spread     0.000795
0     annie-chuns-bowl     0.000641
25  peter-pan-pnt-bttr     0.000050
18        kraft-cheese     0.000005
2          bunny-bread     0.000002
6        cocacola-soda     0.000001
16     kleinpeter-milk     0.000000
24  pearl-millng-syrup     0.000000
5       coca-cola-cola     0.000000
31     sno-balls-to-go     0.000000
10  dewafelbakkr-pncks     0.000000
28          seg-turkey     0.000000
7   coffee-mate-creamr     0.000000
13          hugbi-pies     0.000000
3          chisesi-ham     0.000000
4       chobani-yogurt     0.000000
