In [5]:
# %%
import pandas as pd
import numpy as np
import os
import subprocess

# -----------------------------
# 1. Load Data
# -----------------------------
# Carica il dataset delle transazioni derivato
data = pd.read_parquet("final_df.parquet")

# Carica la tabella dei clienti (anche per quelli che non hanno mai acquistato)
clients_df = pd.read_csv("clients_dataset.csv")

# Carica i dati degli stocks (prodotti disponibili)
stocks_df = pd.read_csv("stocks_dataset.csv")
# Se mancano alcune colonne, le riempiamo con valori di default
for col in ['Category', 'FamilyLevel1', 'FamilyLevel2', 'Brand']:
    if col not in stocks_df.columns:
        stocks_df[col] = "Unknown"
if 'StoreID' not in stocks_df.columns:
    stocks_df['StoreID'] = "0"


In [6]:

# -----------------------------
# 2. Preprocess Data (Time-based)
# -----------------------------
# Ordina le transazioni in ordine temporale per evitare data leak
data = data.sort_values("TransactionDate").reset_index(drop=True)

# Riempie i missing e rielabora alcune features
data['ClientGender'].fillna('Unknown', inplace=True)
data['DaysSinceLastTransaction'].replace(0, 900, inplace=True)
data['AverageFrequencySoFar'].replace(900, 0, inplace=True)
data['Quarter'] = data['TransactionDate'].dt.quarter


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ClientGender'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['DaysSinceLastTransaction'].replace(0, 900, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [7]:

# -----------------------------
# 3. Funzioni ausiliarie
# -----------------------------
def generate_negative_samples(df, n_neg=5):
    """
    Genera negative samples per i dati passati (df)
    in modo time-based (i dati passati sono già ordinati per tempo).
    """
    all_products = np.array(df['ProductID'].unique())
    neg_samples = []
    # Raggruppa per ClientID usando i dati del giorno
    grouped = df.groupby("ClientID")["ProductID"].apply(set).to_dict()
    for client_id, pos_products in grouped.items():
        available_neg_products = np.setdiff1d(all_products, list(pos_products))
        if len(available_neg_products) < n_neg:
            sampled_neg = np.random.choice(available_neg_products, n_neg, replace=True)
        else:
            sampled_neg = np.random.choice(available_neg_products, n_neg, replace=False)
        # Per il cliente, usa il record più vecchio (essendo ordinati per tempo)
        base_info = df[df["ClientID"] == client_id].iloc[0].to_dict()
        for neg_product in sampled_neg:
            neg_samples.append({**base_info, "ProductID": neg_product, "Label": 0})
    return pd.DataFrame(neg_samples)

def convert_to_vw(row):
    """Converte una riga in formato VW."""
    return (f"{row['Label']} |Client ClientID:{row['ClientID']} Age:{row['Age']:.2f} "
            f"Gender:{row['ClientGender']} Segment:{row['ClientSegment']} Country:{row['ClientCountry']} "
            f"|Product ProductID:{row['ProductID']} Category:{row['Category']} "
            f"FamilyLevel1:{row['FamilyLevel1']} FamilyLevel2:{row['FamilyLevel2']} "
            f"Brand:{row['Brand']} "
            f"|Store StoreID:{row['StoreID']} Country:{row['StoreCountry']} "
            f"|Interaction Quarter:{row['Quarter']} Weekday:{row['Weekday']} "
            f"PercentageMaleProductsSoFar:{row['PercentageMaleProductsSoFar']:.2f}")

def generate_vw_file(df, file_path):
    """Ordina per tempo, converte in formato VW e salva su file."""
    df = df.sort_values("TransactionDate")
    df["vw_format"] = np.vectorize(convert_to_vw)(df.to_dict(orient="records"))
    df["vw_format"].to_csv(file_path, index=False, header=False)

def generate_recommendations_day(client_id, day, n_recommendations, model_file):
    """
    Genera raccomandazioni top-N per un cliente in un dato giorno usando stocks_df.
    Utilizza il modello salvato in 'model_file'.
    """
    day = pd.Timestamp(day).normalize()
    # Recupera il record del cliente:
    if client_id in data["ClientID"].unique():
        client_data = data[data["ClientID"] == client_id].iloc[0]
    else:
        client_profile = clients_df[clients_df["ClientID"] == client_id]
        if client_profile.empty:
            raise ValueError(f"ClientID {client_id} non trovato.")
        client_profile = client_profile.iloc[0].to_dict()
        client_profile.update({
            "Age": client_profile.get("Age", 30) if pd.notnull(client_profile.get("Age")) else 30,
            "ClientGender": client_profile.get("ClientGender", "Unknown"),
            "ClientSegment": client_profile.get("ClientSegment", "UNKNOWN"),
            "ClientCountry": client_profile.get("ClientCountry", "Unknown"),
            "Label": 1
        })
        client_data = pd.Series(client_profile)
    # Limita i candidati ai prodotti disponibili nel paese del cliente, se possibile
    candidate_stocks = stocks_df[stocks_df["StoreCountry"] == client_data["ClientCountry"]]
    if candidate_stocks.empty:
        candidate_stocks = stocks_df.copy()
    client_features = client_data.to_dict()
    client_features["Weekday"] = day.weekday()
    client_features["Quarter"] = day.quarter
    client_features["PercentageMaleProductsSoFar"] = 0.0
    test_instances = []
    for _, prod in candidate_stocks.iterrows():
        instance = client_features.copy()
        instance["ProductID"] = prod["ProductID"]
        instance["Category"] = prod["Category"]
        instance["FamilyLevel1"] = prod["FamilyLevel1"]
        instance["FamilyLevel2"] = prod["FamilyLevel2"]
        instance["Brand"] = prod["Brand"]
        instance["StoreID"] = prod["StoreID"]
        instance["StoreCountry"] = prod["StoreCountry"]
        test_instances.append(instance)
    vw_inputs = [convert_to_vw(pd.Series(inst)) for inst in test_instances]
    temp_test_file = "temp_test.txt"
    with open(temp_test_file, "w") as f:
        f.write("\n".join(vw_inputs))
    predict_command = ["vw", "-t", "-i", model_file, temp_test_file, "-p", "temp_predictions.txt"]
    subprocess.run(predict_command)
    with open("temp_predictions.txt", "r") as f:
        scores = [float(line.strip()) for line in f]
    candidate_products = candidate_stocks["ProductID"].tolist()
    recommendations = sorted(zip(candidate_products, scores), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return recommendations


In [None]:
# -----------------------------
# 4. Inizializzazione e Addestramento Sequenziale
# -----------------------------
# Imposta una finestra di warm-up (es. i primi 10 giorni)
start_day = data['TransactionDate'].min().normalize()
warmup_end = start_day + pd.Timedelta(days=10)
warmup_data = data[data['TransactionDate'] < warmup_end]
if warmup_data.empty:
    raise ValueError("Non ci sono dati per la fase di warm-up.")

# Per il warm-up, genera sia i positivi che i negativi (time-sorted)
warmup_positives = warmup_data.assign(Label=1)
warmup_negatives = generate_negative_samples(warmup_data)
warmup_full = pd.concat([warmup_positives, warmup_negatives], ignore_index=True)
warmup_file = "warmup_data.txt"
generate_vw_file(warmup_full, warmup_file)

# Addestra il modello iniziale con il warm-up (usando più passaggi se desiderato)
initial_model_file = "model_initial.vw"
train_command = [
    "vw", warmup_file,
    "--loss_function", "hinge",
    "--lrqfa", "Client,Product,Store:8",
    "-l", "0.01",
    "--passes", "5",
    "-b", "24",
    "--early_terminate", "3",
    "-f", initial_model_file
]
print("Addestramento modello iniziale (warm-up)...")
subprocess.run(train_command)
current_model = initial_model_file


Addestramento modello iniziale (warm-up)...


FileNotFoundError: [WinError 2] Impossibile trovare il file specificato

In [None]:

# -----------------------------
# 5. Simulazione Online e Valutazione Giornaliera
# -----------------------------
evaluation_results = []
all_days = pd.date_range(start=warmup_end.normalize(), end=data['TransactionDate'].max().normalize(), freq='D')

for day in all_days:
    # Prendi i dati di training del giorno corrente (solo dati fino a quel giorno)
    day_data = data[data['TransactionDate'].dt.normalize() == day]
    if day_data.empty:
        continue
    day_data = day_data.sort_values("TransactionDate")
    day_positives = day_data.assign(Label=1)
    day_negatives = generate_negative_samples(day_data)
    day_full = pd.concat([day_positives, day_negatives], ignore_index=True)
    day_full = day_full.sort_values("TransactionDate")
    day_train_file = f"train_{day.strftime('%Y%m%d')}.txt"
    generate_vw_file(day_full, day_train_file)
    
    # Aggiorna il modello con i dati del giorno corrente usando VW in modalità online
    updated_model_file = f"model_{day.strftime('%Y%m%d')}.vw"
    update_command = [
        "vw", day_train_file,
        "--loss_function", "hinge",
        "--lrqfa", "Client,Product,Store:8",
        "-l", "0.01",
        "--passes", "1",
        "-b", "24",
        "--early_terminate", "3",
        "--initial_regressor", current_model,
        "-f", updated_model_file
    ]
    print(f"Aggiornamento modello per il giorno {day.date()}...")
    subprocess.run(update_command)
    current_model = updated_model_file
    
    # Valutazione: per ogni cliente che ha effettuato transazioni nel giorno corrente,
    # genera le raccomandazioni e verifica se almeno un prodotto acquistato compare tra i top-5.
    day_trans = day_data.copy()
    client_actual = day_trans.groupby("ClientID")["ProductID"].apply(set).to_dict()
    total_clients = len(client_actual)
    correct_count = 0
    for client_id, bought_products in client_actual.items():
        recs = generate_recommendations_day(client_id, day, n_recommendations=5, model_file=current_model)
        rec_product_ids = [prod for prod, score in recs]
        if set(rec_product_ids).intersection(bought_products):
            correct_count += 1
    day_accuracy = correct_count / total_clients if total_clients > 0 else np.nan
    evaluation_results.append({"day": day, "total_clients": total_clients, "correct": correct_count, "accuracy": day_accuracy})
    print(f"Giorno {day.date()} -> Accuracy: {day_accuracy:.2f}")

eval_df = pd.DataFrame(evaluation_results)
print("Risultati valutazione giornaliera:")
print(eval_df)
overall_accuracy = eval_df["correct"].sum() / eval_df["total_clients"].sum()
print("Accuratezza complessiva:", overall_accuracy)
