In [9]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm.notebook import tqdm
from river import reco, metrics
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

# -----------------------------
# 1. Load Data
# -----------------------------
data = pd.read_parquet("final_df.parquet")
clients_df = pd.read_csv("clients_dataset.csv")
stocks_df = pd.read_csv("stocks_dataset.csv")

# -----------------------------
# 2. Preprocess Data (Time-based)
# -----------------------------
data = data.sort_values("TransactionDate").reset_index(drop=True)

for col in ['Category', 'FamilyLevel1', 'FamilyLevel2', 'Brand', 'StoreCountry']:
    if col not in stocks_df.columns:
        stocks_df[col] = "Unknown"
if 'StoreID' not in stocks_df.columns:
    stocks_df['StoreID'] = "0"

data['ClientGender'].fillna('Unknown', inplace=True)
data['DaysSinceLastTransaction'].replace(0, 900, inplace=True)
data['AverageFrequencySoFar'].replace(900, 0, inplace=True)
data['Quarter'] = data['TransactionDate'].dt.quarter
data = data.loc[:, ~data.columns.str.contains('Rolling90Pct')]

# -----------------------------
# 2a. Undersampling top 1000 sold products
# -----------------------------
print("Computing top 1000 products by total quantity sold...")
top_1000_products = (
    data.groupby('ProductID')['Quantity_sold']
    .sum()
    .sort_values(ascending=False)
    .head(1000)
    .index
)
top_1000_products = set(top_1000_products)

# Filter both data and stocks_df to these top 1000 products
data = data[data['ProductID'].isin(top_1000_products)].copy()
stocks_df = stocks_df[stocks_df['ProductID'].isin(top_1000_products)].copy()

print(f"After top-1000 undersampling:\n  - data has {len(data)} rows\n  - stocks_df has {len(stocks_df)} rows")

# -----------------------------
# 3. Create a ProductID->Universe map
# -----------------------------
product_universe_map = data.groupby("ProductID")["Universe"].last().to_dict()

# -----------------------------
# 4. Utility Functions
# -----------------------------
def generate_negative_samples(df, n_neg=10):
    """
    Generate negative samples for implicit feedback data.
    Sceglie fino a n_neg prodotti, tra quelli apparsi in df, che l'utente non ha acquistato.
    """
    all_products = np.array(df['ProductID'].unique())
    neg_samples = []
    grouped = df.groupby("ClientID")["ProductID"].apply(set).to_dict()
    for client_id, pos_products in grouped.items():
        available_neg = np.setdiff1d(all_products, list(pos_products))
        if len(available_neg) < n_neg:
            sampled_neg = np.random.choice(available_neg, n_neg, replace=True)
        else:
            sampled_neg = np.random.choice(available_neg, n_neg, replace=False)
        base_info = df[df["ClientID"] == client_id].iloc[-1].to_dict()
        for neg_product in sampled_neg:
            row_dict = {**base_info, "ProductID": neg_product, "Label": 0}
            neg_samples.append(row_dict)
    return pd.DataFrame(neg_samples)

def save_model_river(river_model, filename):
    with open(filename, "wb") as f:
        pickle.dump(river_model, f)

def load_model_river(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

# For evaluation: generate top-K recommendations for a given client
def generate_top_k_for_client(river_model, client_id, top_k=5):
    scores = []
    for pid in stocks_df["ProductID"]:
        sc = river_model.predict_one((client_id, pid))
        scores.append((pid, sc))
    scores.sort(key=lambda x: x[1], reverse=True)
    return [pid for pid, sc in scores[:top_k]]

def generate_recommendations_day(client_id, day, n_recommendations, river_model):
    # In this implementation, we ignore "day" for scoring and simply use the model to score (client, item) pairs.
    return generate_top_k_for_client(river_model, client_id, top_k=n_recommendations)

# -----------------------------
# 5. Training & Evaluation using River
# -----------------------------
start_day = data['TransactionDate'].min().normalize()
warmup_end = start_day + pd.Timedelta(days=30)

warmup_data = data[data['TransactionDate'] < warmup_end]
if warmup_data.empty:
    raise ValueError("No data available for warm-up.")

# Prepare warm-up positives and negatives
warmup_pos = warmup_data.copy()
warmup_pos["Label"] = 1
warmup_neg = generate_negative_samples(warmup_data, n_neg=10)
warmup_full = pd.concat([warmup_pos, warmup_neg], ignore_index=True)
warmup_full = warmup_full.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Initialize the River model using BiasedMF (matrix factorization with biases)
model = reco.BiasedMF(
    n_factors=16,
    learn_rate=0.01,
    use_bias=True
)

# Warm-up training
for _, row in tqdm(warmup_full.iterrows(), total=len(warmup_full), desc="Warmup Training"):
    user_id = row["ClientID"]
    item_id = row["ProductID"]
    rating = row["Label"]
    model = model.learn_one((user_id, item_id), rating)

save_model_river(model, "model_initial.pkl")
current_model_file = "model_initial.pkl"
print("Initial model trained (warm-up).")

evaluation_results = []
all_days = pd.date_range(
    start=warmup_end.normalize(),
    end=data['TransactionDate'].max().normalize(),
    freq='W'
)

for day in all_days:
    window_start = day - pd.Timedelta(days=89)
    if window_start < warmup_end:
        window_start = warmup_end
    
    train_subset = data[
        (data['TransactionDate'] >= window_start) &
        (data['TransactionDate'] <= day)
    ]
    if train_subset.empty:
        continue
    
    # Load current model
    model = load_model_river(current_model_file)
    
    day_pos = train_subset.copy()
    day_pos["Label"] = 1
    day_neg = generate_negative_samples(train_subset, n_neg=10)
    day_full = pd.concat([day_pos, day_neg], ignore_index=True)
    day_full = day_full.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    # Update model on this window
    for _, row in tqdm(day_full.iterrows(), total=len(day_full), desc=f"Training until {day.date()}"):
        user_id = row["ClientID"]
        item_id = row["ProductID"]
        rating = row["Label"]
        model = model.learn_one((user_id, item_id), rating)
    
    updated_model_file = f"model_{day.strftime('%Y%m%d')}.pkl"
    save_model_river(model, updated_model_file)
    print(f"Model updated (window up to {day.date()}) and saved as {updated_model_file}")
    current_model_file = updated_model_file
    
    # Evaluate on today's transactions
    day_rows = data[data['TransactionDate'].dt.normalize() == day]
    if day_rows.empty:
        continue

    client_actual = day_rows.groupby("ClientID")["ProductID"].apply(set).to_dict()
    total_clients = len(client_actual)
    
    def evaluate_client(client_id):
        recommended = generate_recommendations_day(client_id, day, n_recommendations=5, river_model=model)
        return 1 if set(recommended).intersection(client_actual[client_id]) else 0

    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        results = list(executor.map(evaluate_client, client_actual.keys()))
    correct_count = sum(results)
    
    day_accuracy = correct_count / total_clients if total_clients else np.nan
    evaluation_results.append({
        "day": day,
        "total_clients": total_clients,
        "correct": correct_count,
        "accuracy": day_accuracy
    })
    print(f"Day {day.date()} -> Accuracy: {day_accuracy:.4f}")

eval_df = pd.DataFrame(evaluation_results)
print("\nDaily evaluation results:")
print(eval_df)
overall_acc = eval_df["correct"].sum() / eval_df["total_clients"].sum()
print("Overall accuracy:", overall_acc)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ClientGender'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['DaysSinceLastTransaction'].replace(0, 900, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

Computing top 1000 products by total quantity sold...
After top-1000 undersampling:
  - data has 484625 rows
  - stocks_df has 3667 rows


TypeError: __init__() got an unexpected keyword argument 'learn_rate'