In [None]:

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np

from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import os
from pandas.tseries.holiday import USFederalHolidayCalendar

from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

from datetime import datetime

from winn_dixie_recpt_parser import WinnDixieRecptParser 

# Notebook-wide pandas display settings: no truncation of width, column
# content or column count, and floats rendered with six decimals.
for _unlimited_option in ("display.width", "display.max_colwidth", "display.max_columns"):
    pd.set_option(_unlimited_option, None)
pd.set_option("display.float_format", "{:.6f}".format)

# Show the working directory, since the data paths below are relative to it.
print(os.getcwd())


In [None]:
def show_grouped(grouped, rows=10):
    """Print the first `rows` rows of `grouped`, one field per line.

    For each row this prints its position, the "date", "time" and "item"
    values, and then every column whose name starts with
    "daysSinceLastPurchase_".
    """
    recency_cols = [name for name in grouped.columns if name.startswith("daysSinceLastPurchase_")]
    n_shown = min(rows, len(grouped))

    for pos in range(n_shown):
        trip = grouped.iloc[pos]  # look the row up once and reuse it below

        print("Row:", pos)
        for label, key in (("Date:", "date"), ("Time:", "time"), ("Items:", "item")):
            print(label, trip[key])

        print("------ daysSinceLastPurchase ------")
        for name in recency_cols:
            print(f"{name}: {trip[name]}")

        print("-----------------------------------")


def show_encoded(encoded_df, rows=10, item_cols=None):
    """Print the first `rows` rows of the encoded trip table in readable form.

    Parameters
    ----------
    encoded_df : DataFrame with "date" and "time" columns, one-hot item
        columns, and feature columns (daysSinceLastPurchase_*, freq_*,
        *_5day_avg[_norm], daysSinceLastTrip).
    rows : maximum number of rows to print.
    item_cols : optional explicit list of one-hot item columns. When omitted,
        the item columns are inferred as "everything that is not a known
        feature column".

    The inference excludes freq_* and daysSinceLastTrip* columns so that a
    feature that happens to equal 1 is not reported as a purchased item, and
    weather columns are recognised with or without the "_norm" suffix.
    """
    all_cols = list(encoded_df.columns)
    days_cols = [c for c in all_cols if c.startswith("daysSinceLastPurchase_")]
    weather_cols = [c for c in all_cols if c.endswith(("_5day_avg", "_5day_avg_norm"))]

    if item_cols is None:
        known_features = set(days_cols) | set(weather_cols) | {"date", "time"}
        item_cols = [
            c for c in all_cols
            if c not in known_features
            and not c.startswith(("freq_", "daysSinceLastTrip"))
        ]

    for i in range(min(rows, len(encoded_df))):
        row_vals = encoded_df.iloc[i]

        print("Row:", i)
        print("Date:", row_vals["date"])
        print("Time:", row_vals["time"])

        # Reverse the one-hot encoding: an item was bought when its flag is 1.
        purchased_items = [item for item in item_cols if row_vals[item] == 1]
        print("Items:", purchased_items)

        print("------ daysSinceLastPurchase ------")
        for col in days_cols:
            print(f"{col}: {row_vals[col]}")

        print("------ weather (rolling windows) ------")
        for col in weather_cols:
            print(f"{col}: {row_vals[col]}")

        print("-----------------------------------")


def remove_duplicate_receipt_files(df):
    """Drop whole source files whose receipt duplicates another file's receipt.

    Two source files are duplicates when, for the same (date, time), they
    contain exactly the same multiset of line items (compared on date, time,
    item, qty, youPay, reg, reportedItemsSold, cashier and manager). Of each
    duplicate set the alphabetically first source is kept and one "DUP:" line
    is printed for the rest.

    Parameters
    ----------
    df : DataFrame with a "source" column plus the columns listed above.

    Returns
    -------
    A new DataFrame with the rows of the kept sources and a fresh 0..n-1
    index. The input frame is not modified (no helper column is left on it).

    NOTE(review): groupby drops rows whose date or time is missing, so a
    source whose rows all lack date/time is never marked as kept — confirm
    that is acceptable for the parser's output.
    """
    signature_fields = [
        "date", "time", "item", "qty", "youPay",
        "reg", "reportedItemsSold", "cashier", "manager",
    ]

    # One string per row; built on the side so the caller's frame is untouched.
    signature = df[signature_fields[0]].astype(str)
    for field in signature_fields[1:]:
        signature = signature + "|" + df[field].astype(str)

    work = df[["source", "date", "time"]].assign(**{"__signature": signature})

    keep_sources = set()

    for (dt_date, dt_time), group in work.groupby(["date", "time"]):
        # receipt signature (sorted tuple of row signatures) -> sources having it
        sources_by_signature = {}
        for source, rows in group.groupby("source"):
            receipt_sig = tuple(sorted(rows["__signature"].tolist()))
            sources_by_signature.setdefault(receipt_sig, []).append(source)

        for sources in sources_by_signature.values():
            kept, *removed = sorted(sources)
            if removed:
                # Minimal output
                print(f"DUP: {dt_date} {dt_time} → keep {kept} ← drop {', '.join(removed)}")
            keep_sources.add(kept)

    return df[df["source"].isin(keep_sources)].reset_index(drop=True)



def purchases_per_window(df, item, window_days):
    """Average number of rows for `item` per `window_days`-day window.

    The rate is (rows whose "item" equals `item`) divided by the number of
    calendar days spanned by the whole frame's "date" column (inclusive),
    scaled to `window_days`. Returns 0.0 when the item never appears.
    """
    purchase_count = int((df["item"] == item).sum())
    if purchase_count == 0:
        return 0.0

    all_dates = df["date"]
    span_days = (all_dates.max() - all_dates.min()).days + 1
    if span_days == 0:
        return 0.0

    return (purchase_count / span_days) * float(window_days)
    #################################################################

In [None]:
def isNearHoliday(d, xDays):
    """Return True when a US federal holiday falls 1..xDays days after `d`.

    `d` is anything pd.to_datetime accepts; its time of day is ignored.
    The holiday date itself does not count as "near" (see isOnHoliday).

    Only holidays strictly after `d` are considered, and the nearest of those
    is compared with `xDays`. Taking the minimum over all holidays instead
    would pick a holiday far in the past and always pass the check.
    """
    d = pd.to_datetime(d).normalize()
    holidays = USFederalHolidayCalendar().holidays()
    days_ahead = (holidays - d).days
    upcoming = days_ahead[days_ahead >= 1]
    return bool(len(upcoming) > 0 and upcoming.min() <= xDays)
#########################################################

def isOnHoliday(d):
    """Return True when the calendar day of `d` is in USFederalHolidayCalendar.

    `d` is anything pd.to_datetime accepts; its time of day is ignored.
    """
    calendar_day = pd.to_datetime(d).normalize()
    holiday_dates = USFederalHolidayCalendar().holidays()
    return bool(holiday_dates.isin([calendar_day]).any())
#########################################################

def isPostHoliday(d, xDays):
    """Return True when a US federal holiday fell 1..xDays days before `d`.

    `d` is anything pd.to_datetime accepts; its time of day is ignored.
    The holiday date itself does not count (see isOnHoliday).

    Only holidays strictly before `d` are considered, and the most recent of
    those is compared with `xDays`. Taking the minimum over all holidays
    instead would pick a holiday far in the future and always pass the check.
    """
    d = pd.to_datetime(d).normalize()
    holidays = USFederalHolidayCalendar().holidays()
    days_since = (d - holidays).days
    past = days_since[days_since >= 1]
    return bool(len(past) > 0 and past.min() <= xDays)
#########################################################

def isBeforeBirthday(d, xDays, bday):
    """Return True when the next occurrence of `bday` is 1..xDays days after `d`.

    Parameters
    ----------
    d : date-like; its time of day is ignored.
    xDays : size of the look-ahead window in days.
    bday : date-like birth date; only its month and day are used.

    The next birthday may fall in the following calendar year, so late
    December dates correctly see an early-January birthday. A Feb 29 birthday
    is treated as Feb 28 in non-leap years. The birthday itself returns False
    (see isOnBirthdate).
    """
    d = pd.to_datetime(d).normalize()
    bday = pd.to_datetime(bday)

    def _birthday_in(year):
        # Anniversary of `bday` in `year`, tolerating Feb 29 in non-leap years.
        try:
            return pd.Timestamp(year=year, month=bday.month, day=bday.day)
        except ValueError:
            return pd.Timestamp(year=year, month=2, day=28)

    next_bday = _birthday_in(d.year)
    if next_bday <= d:
        # This year's birthday is today or already past: look at next year's.
        next_bday = _birthday_in(d.year + 1)

    days_before = (next_bday - d).days
    return 1 <= days_before <= xDays
#########################################################

def isOnBirthdate(d, bday):
    """Return True when `d` has the same month and day as `bday`.

    Both arguments are anything pd.to_datetime accepts; the years are ignored.
    """
    day = pd.to_datetime(d)
    birth = pd.to_datetime(bday)
    return (day.month, day.day) == (birth.month, birth.day)
#########################################################

def weatherDeviation(actualTemp, avgTemp):
    """Signed difference between the actual and the average temperature.

    Negative means colder than average, positive means hotter than average.
    """
    deviation = actualTemp - avgTemp
    return deviation
#########################################################

def isColderThanAvg(actualTemp, avgTemp, threshold):
    """True when the actual temperature is at least `threshold` below average."""
    degrees_below = avgTemp - actualTemp
    return degrees_below >= threshold
#########################################################

def isHotterThanAvg(actualTemp, avgTemp, threshold):
    """True when the actual temperature is at least `threshold` above average."""
    degrees_above = actualTemp - avgTemp
    return degrees_above >= threshold
#########################################################

def isSchoolSeason(d):
    """Return True when `d` falls in the school season (Aug 15 through May 31).

    The season wraps around the new year: Aug 15 -> Dec 31 and Jan 1 -> May 31.
    `d` is anything pd.to_datetime accepts. Only its month and day are
    compared, so a timestamp with a time of day on May 31 still counts as
    inside the season.
    """
    d = pd.to_datetime(d)
    month_day = (d.month, d.day)
    return month_day >= (8, 15) or month_day <= (5, 31)
#########################################################

# Build df_trips (one row per receipt line item)

In [None]:

# Parse every receipt text file into one row per line item.
RECEIPT_TEXT_DIR = Path("winndixie rcpts/StevePhone2/pdf/text")

recptParser = WinnDixieRecptParser()

rows = []

# sorted(): directory listing order is filesystem-dependent; a fixed order
# keeps row order (and everything derived from it) the same between runs.
for p in sorted(RECEIPT_TEXT_DIR.glob("*.txt")):
    result = recptParser.parse(p.read_text(encoding="utf-8", errors="ignore"))
    for r in result["items"]:
        rows.append({
            "source": p.name,
            "date": result["date"],
            "time": result["time"],
            "manager": result["manager"],
            "cashier": result["cashier"],
            "item": r["item"],
            "qty": r["qty"],
            "reg": r["reg"],
            "youPay": r["youPay"],
            "reportedItemsSold": result["reported"],
            #"rowsMatchReported": result["validation"]["rowsMatchReported"],
            "qtyMatchReported": result["validation"]["qtyMatchReported"],
        })

# Fail with a clear message instead of a KeyError on the missing "date" column.
if not rows:
    raise FileNotFoundError(
        f"No receipt line items parsed from {RECEIPT_TEXT_DIR} (cwd: {os.getcwd()})"
    )

df_trips = pd.DataFrame(rows)

df_trips["date"] = pd.to_datetime(df_trips["date"])
df_trips["time"] = df_trips["time"].astype(str)
df_trips = remove_duplicate_receipt_files(df_trips)
df_trips = df_trips.sort_values(by=["date", "time"]).reset_index(drop=True)
df_trips.head(5)


In [None]:
### Frequency
# For every distinct item, attach its purchase rate per 14/30/90/365 days and
# its days-since-last-purchase value to the df_trips rows holding that item.
# Rows for other items are left as NaN in these per-item columns.
# NOTE(review): compute_days_since is not defined in the visible part of this
# notebook — confirm it is defined (or imported) before this cell runs.

unique_items = df_trips["item"].unique()

for item in unique_items:
    item_rows = df_trips["item"] == item

    # One item-specific column per window length.
    for window_days in (14, 30, 90, 365):
        df_trips.loc[item_rows, f"freq_{window_days}_days_{item}"] = purchases_per_window(
            df_trips, item, window_days
        )

    days = compute_days_since(df_trips, item)
    df_trips.loc[item_rows, f"daysSinceLastPurchase_{item}"] = days

df_trips.head(10)

### GROUP

In [None]:
# Collapse df_trips (one row per line item) into `grouped` (one row per trip).
trip_keys = ["date", "time"]

# 1. Item-level feature columns: freq_* first, then daysSinceLastPurchase_*.
freq_cols = [name for name in df_trips.columns if name.startswith("freq_")]
days_cols = [name for name in df_trips.columns if name.startswith("daysSinceLastPurchase_")]
item_level_cols = freq_cols + days_cols

# 2. One row per trip, with the purchased items collected into a list.
grouped = df_trips.groupby(trip_keys, as_index=False).agg(item=("item", list))

# 3. Per-trip value of every item-level column. Each such column holds a value
#    only on the rows of its own item and NaN elsewhere; max() skips NaN, so it
#    returns that value (or NaN when the item is not part of the trip).
item_level_agg = (
    df_trips.groupby(trip_keys)[item_level_cols]
    .max()
    .reset_index(drop=True)
)

# 4. Both group-bys sort by the same keys, so the rows line up by position.
grouped[item_level_cols] = item_level_agg



In [None]:
# -------------------  daysSinceLastTrip -------------------
# Whole days between each trip and the one before it; the first trip gets 0.
trip_dates = grouped["date"].tolist()

days_since_trip = [0] + [
    (current_date - prev_date).days
    for prev_date, current_date in zip(trip_dates, trip_dates[1:])
]
days_since_trip = days_since_trip[: len(trip_dates)]  # empty table -> empty list

grouped["daysSinceLastTrip"] = days_since_trip


In [None]:
# Calendar features per trip: holiday proximity, birthdays, school season.
# NOTE(review): daysUntilNextHoliday, daysSinceLastHoliday, holidayProximityIndex,
# daysUntilBirthday, daysSinceBirthday, daysUntilSchoolStart, daysUntilSchoolEnd
# and schoolSeasonIndex are not defined in the visible part of this notebook —
# confirm they are defined before this cell runs.
grouped["daysUntilNextHoliday"] = grouped["date"].apply(daysUntilNextHoliday)
grouped["daysSinceLastHoliday"] = grouped["date"].apply(daysSinceLastHoliday)
grouped["holidayProximityIndex"] = grouped["date"].apply(holidayProximityIndex)

#grouped["tempDeviation"] = df.apply(lambda row: tempDeviation(row["actualTemp"], row["avgTemp"]), axis=1)
#grouped["humidityDeviation"] = df.apply(lambda row: humidityDeviation(row["actualHumidity"], row["avgHumidity"]), axis=1)
#grouped["precipDeviation"] = df.apply(lambda row: precipDeviation(row["actualPrecip"], row["avgPrecip"]),axis=1)

# Birth dates are written month-day-year.
angieBday = "08-11-1981"
steveBday = "03-05-1980"
maggieBday = "03-03-2016"

birthdays = {"steve": steveBday, "angie": angieBday, "maggie": maggieBday}

# bday=bday binds the current value; a plain closure would see only the last one.
for person, bday in birthdays.items():
    grouped[f"daysUntilBirthday_{person}"] = grouped["date"].apply(
        lambda d, bday=bday: daysUntilBirthday(d, bday)
    )
for person, bday in birthdays.items():
    grouped[f"daysSinceBirthday_{person}"] = grouped["date"].apply(
        lambda d, bday=bday: daysSinceBirthday(d, bday)
    )

grouped["daysUntilSchoolStart"] = grouped["date"].apply(daysUntilSchoolStart)
grouped["daysUntilSchoolEnd"]   = grouped["date"].apply(daysUntilSchoolEnd)
grouped["schoolSeasonIndex"]    = grouped["date"].apply(schoolSeasonIndex)


grouped.to_csv("grouped.csv", index=False)

# The columns above were added to `grouped`; there is no notebook-level `df`,
# so the column lists are read from `grouped`.
holidayCols = ["daysUntilNextHoliday", "daysSinceLastHoliday", "holidayProximityIndex"]
bdayCols = [c for c in grouped.columns if c.startswith("daysUntilBirthday_") or c.startswith("daysSinceBirthday_")]
schoolCols = [c for c in grouped.columns if "School" in c]



In [None]:
# Daily weather -> trailing 5-day averages, joined onto the trip tables by date.
weatherCols = ["datetime", "temp", "humidity", "feelslike", "dew", "precip"]
df_weather = pd.read_csv(
    "datasets/VisualCrossing-70062 2000-01-01 to 2025-12-14.csv",
    usecols=weatherCols,
)
df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
df_weather = df_weather.set_index("datetime").sort_index()
df_weather.info()

# raw column -> name of its rolling mean over 5 rows (current row and the
# four before it; fewer at the start of the series because min_periods=1).
weather_avg_names = {
    "temp": "temp_5day_avg",
    "feelslike": "feelsLike_5day_avg",
    "dew": "dew_5day_avg",
    "humidity": "humidity_5day_avg",
    "precip": "precip_5day_avg",
}
for raw_col, avg_col in weather_avg_names.items():
    df_weather[avg_col] = df_weather[raw_col].rolling(5, min_periods=1).mean()
df_weather = df_weather.drop(columns=list(weather_avg_names))

df_merged = df_trips.merge(df_weather, left_on="date", right_index=True, how="left")
df_merged.to_csv("df_merged.csv", index=False, encoding="utf-8")

# The encoding cells further down read their "_5day" columns from `grouped`,
# so the averages must be attached to the trip-level table as well; otherwise
# no weather feature ever reaches the model. Existing weather columns are
# dropped first so re-running this cell does not create _x/_y duplicates.
# validate= raises if the weather file has more than one row per date, which
# would otherwise silently duplicate trips.
# NOTE(review): trips dated outside the weather file's range get NaN here.
grouped = (
    grouped.drop(columns=list(weather_avg_names.values()), errors="ignore")
    .merge(df_weather, left_on="date", right_index=True, how="left", validate="many_to_one")
    .reset_index(drop=True)
)

df_merged.head(5)


## ENCODED_DF

### One hot items

In [None]:
# ============================================================
# 1. BUILD ONE-HOT ITEMS + MERGE WITH GROUPED DATA
# ============================================================

num_items = len(unique_items)

# Position of each item in `unique_items`. The one-hot matrix is later labelled
# with columns=unique_items, so the vector index must follow that same order.
item_to_index = {name: position for position, name in enumerate(unique_items)}

# One 0/1 vector per trip: 1 where the item appears in the trip's item list.
vectors = []

for item_list in grouped["item"]:
    vector = np.zeros(num_items, dtype=np.int32)
    for name in item_list:
        vector[item_to_index[name]] = 1
    vectors.append(vector)


In [None]:
# Feature columns taken from the trip-level table.
days_cols = [name for name in grouped.columns if name.startswith("daysSinceLastPurchase_")]
weather_cols = [name for name in grouped.columns if "_5day" in name]
freq_cols = [name for name in df_trips.columns if name.startswith("freq_")]

# One-hot purchase matrix, one column per item, in unique_items order.
encoded_items_df = pd.DataFrame(vectors, columns=unique_items)

# Trip keys and features on the left, one-hot item flags on the right;
# rows are aligned on the shared index.
trip_feature_cols = ["date", "time", "daysSinceLastTrip"] + days_cols + weather_cols + freq_cols
encoded_df = pd.concat([grouped[trip_feature_cols], encoded_items_df], axis=1)

def normalizeAndDropCols(df, cols):
    """Return a copy of `df` where each column in `cols` is replaced by its z-score.

    For every `col` a new column `col + "_norm"` = (value - mean) / std is
    appended (in the order of `cols`) and the original column is removed.

    Parameters
    ----------
    df : source DataFrame; it is not modified.
    cols : names of the numeric columns to normalise.

    Returns
    -------
    A new DataFrame: the remaining columns followed by the `*_norm` columns.

    A standard deviation of 0 (constant column) or NaN (single row, or no
    valid values) is replaced by 1.0 so the result is 0 instead of inf/NaN.
    The new columns are attached in one concat instead of one insert each,
    which avoids fragmenting the frame when there are many columns.
    """
    cols = list(cols)
    norm_names = [col + "_norm" for col in cols]

    normalized = []
    for col, norm_name in zip(cols, norm_names):
        mean = df[col].mean()
        std = df[col].std()

        if pd.isna(std) or std == 0:
            std = 1.0

        normalized.append(((df[col] - mean) / std).rename(norm_name))

    # Also drop a pre-existing *_norm column so it is replaced, not duplicated.
    stale = [name for name in norm_names if name in df.columns and name not in cols]
    base = df.drop(columns=cols + stale)

    return pd.concat([base] + normalized, axis=1)
    #####################################################


# Z-score the three feature families in turn: days-since-last-purchase,
# purchase frequency, then the 5-day weather averages.
feature_selectors = (
    lambda name: name.startswith("daysSinceLastPurchase_"),
    lambda name: name.startswith("freq_"),
    lambda name: name.endswith("_5day_avg"),
)

for is_selected in feature_selectors:
    cols = [c for c in encoded_df.columns if is_selected(c)]
    encoded_df = normalizeAndDropCols(encoded_df, cols)


show_encoded(encoded_df)

### BUILD DATETIME FEATURES + Normalize 

In [None]:
# ============================================================
# 3. BUILD DATETIME FEATURES
# ============================================================

# Unparseable date/time pairs become NaT (errors="coerce") rather than raising.
encoded_df["dateTime"] = pd.to_datetime(
    encoded_df["date"].astype(str) + " " + encoded_df["time"].astype(str),
    errors="coerce",
)

encoded_df = encoded_df.drop(columns=["date", "time"])

dt = encoded_df["dateTime"]

# Non-cyclic calendar parts (z-scored in step 5).
encoded_df["year"]    = dt.dt.year
encoded_df["day"]     = dt.dt.day
encoded_df["quarter"] = dt.dt.quarter


# ============================================================
# 4. SINE COSINE on CYCLICAL FEATURES
# ============================================================

# name -> (values, period); each becomes a <name>_sin / <name>_cos pair.
cyclical_parts = {
    "hour":   (dt.dt.hour, 24.0),
    "minute": (dt.dt.minute, 60.0),
    "dow":    (dt.dt.dayofweek, 7.0),
    "month":  (dt.dt.month, 12.0),
    "doy":    (dt.dt.dayofyear, 365.0),
}

for part_name, (part_values, period) in cyclical_parts.items():
    angle = 2 * np.pi * part_values / period
    encoded_df[f"{part_name}_sin"] = np.sin(angle)
    encoded_df[f"{part_name}_cos"] = np.cos(angle)

# ============================================================
# 5. SCALE REMAINING NON-CYCLIC TIME FEATURES
# ============================================================

# Exact names, not prefixes: a prefix such as "day" would also match
# "daysSinceLastTrip" and the already-normalised "daysSinceLastPurchase_*_norm"
# columns, normalising them twice and renaming them to "*_norm_norm".
cols = ["year", "day", "quarter", "daysSinceLastTrip"]
encoded_df = normalizeAndDropCols(encoded_df, cols)


# ============================================================
# 6. DROP DATETIME + SAVE
# ============================================================

encoded_df = encoded_df.drop(columns=["dateTime"])
encoded_df.to_csv("encoded.csv", index=False)

In [None]:
# TIME FEATURES
time_feature_cols = [
    "hour_sin", "hour_cos",
    "minute_sin", "minute_cos",
    "dow_sin", "dow_cos",
    "month_sin", "month_cos",
    "doy_sin", "doy_cos",
    "year_norm", "day_norm", "quarter_norm",
    "daysSinceLastTrip_norm"
]

weather_norm_cols = [c for c in encoded_df.columns if c.endswith("_5day_avg_norm")]
# The raw freq_* columns were replaced by freq_*_norm during normalisation, so
# the normalised names are the ones to select (excluding them selects nothing).
freq_norm_cols = [c for c in encoded_df.columns if c.startswith("freq_") and c.endswith("_norm")]
purchase_norm_cols = [c for c in encoded_df.columns if c.startswith("daysSinceLastPurchase_") and c.endswith("_norm")]

# FINAL INPUT FEATURE LIST
input_feature_cols = time_feature_cols + weather_norm_cols + purchase_norm_cols + freq_norm_cols

# Item columns: everything that is not an input feature and holds only 0/1.
item_cols = [
    c for c in encoded_df.columns
    if c not in input_feature_cols
    and c not in ["date", "time", "dateTime"]
    and encoded_df[c].dropna().isin([0, 1]).all()
]

# Keep only items bought on at least 5 trips.
frequent_items = [col for col in item_cols if encoded_df[col].sum() >= 5]

# Targets: one 0/1 column per frequent item.
y = encoded_df[frequent_items].to_numpy(dtype=np.float32)

# Inputs: the selected feature columns, in input_feature_cols order.
# NOTE(review): the per-item freq_/daysSinceLastPurchase_ columns are filled
# only on the rows of their own item, so trips without that item carry NaN —
# confirm X contains no NaN (or impute) before training.
X = encoded_df[input_feature_cols].to_numpy(dtype=np.float32)


# Train the model

In [None]:
# Split into train and test sets (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

# PER-LABEL WEIGHTS FOR IMBALANCE
# weight = negatives / positives for each item, capped at 50.
positives = y_train.sum(axis=0)
negatives = len(y_train) - positives
pos_weights = np.minimum(negatives / (positives + 1e-6), 50.0).astype(np.float32)

# Kept as a dict {label index: weight} for inspection and for any later cell
# that reads it.
class_weights = {i: float(w) for i, w in enumerate(pos_weights)}

pos_weight_tensor = tf.constant(pos_weights, dtype=tf.float32)


def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy whose positive term is scaled per output label.

    Keras' `class_weight` argument picks one weight per sample from the
    argmax of its target row, which is meaningless for multi-label targets.
    Weighting inside the loss applies each item's weight to that item's own
    output instead.
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    eps = tf.keras.backend.epsilon()
    y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)  # avoid log(0)
    per_label = -(
        pos_weight_tensor * y_true * tf.math.log(y_pred)
        + (1.0 - y_true) * tf.math.log(1.0 - y_pred)
    )
    return tf.reduce_mean(per_label, axis=-1)


# DEFINE THE MODEL
# Two hidden layers; one sigmoid output per frequent item (multi-label).
model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    #layers.Dense(128, activation="relu"),
    #layers.Dense(128, activation="relu"),
    layers.Dense(output_dim, activation="sigmoid")
])

# NOTE: reloading a saved model needs this loss passed via custom_objects.
model.compile(
    optimizer="adam",
    loss=weighted_binary_crossentropy,
    metrics=[
        tf.keras.metrics.BinaryAccuracy(threshold=0.5, name="bin_acc"),
        tf.keras.metrics.AUC(curve="ROC", name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
    ]
)

history = model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)


In [None]:
# Predicted purchase probability for every test trip and item.
y_pred = model.predict(X_test)

# Turn probabilities into 0/1 decisions: 1 where the probability reaches the threshold.
threshold = 0.5
y_pred_bin = np.where(y_pred >= threshold, 1, 0)


#pd.DataFrame(y_pred_bin, columns=frequent_items).head(100)

In [None]:

def predict_next_trip(model, encoded_df, input_feature_cols, frequent_items):
    """Score every frequent item using the feature values of the last row.

    Parameters
    ----------
    model : object with a `predict` method that takes a (1, n_features)
        float32 array and returns an array whose first row holds one
        probability per item.
    encoded_df : encoded trip table; only its last row is read.
    input_feature_cols : feature names, in the order the model expects.
        A name missing from `encoded_df` is fed to the model as 0.0.
    frequent_items : item labels, in the order of the model's outputs.

    Returns
    -------
    DataFrame with columns "item" and "probability", highest probability first.
    """
    latest_trip = encoded_df.iloc[-1]
    known_cols = set(encoded_df.columns)

    # Last-known value for each feature; unknown columns fall back to 0.0.
    feature_values = [
        latest_trip[name] if name in known_cols else 0.0
        for name in input_feature_cols
    ]
    X_input = np.asarray(feature_values, dtype=np.float32).reshape(1, -1)

    # First (and only) row of the prediction: one probability per item.
    probabilities = model.predict(X_input)[0]

    ranking = pd.DataFrame({"item": frequent_items, "probability": probabilities})
    return ranking.sort_values(by="probability", ascending=False)


In [None]:
# Rank the frequent items for the next trip and print the 20 most likely.
pred = predict_next_trip(model, encoded_df, input_feature_cols, frequent_items)

print(pred.head(20))