In [3]:
import os

import numpy as np
import pandas as pd
import xlearn as xl
from tqdm.notebook import tqdm

XLearnLibraryNotFound: Cannot find xlearn Library in the candidate path

In [None]:
# Load the transaction history (parquet) and the client / stock tables (CSV)
data = pd.read_parquet("final_df.parquet")
clients_df = pd.read_csv("clients_dataset.csv")
stocks_df = pd.read_csv("stocks_dataset.csv")

# Sort by date so that "last row per client" means "most recent transaction"
data = data.sort_values("TransactionDate").reset_index(drop=True)

# Basic fill/clean.
# Results are assigned back to the column: calling fillna/replace with
# inplace=True on a column selection (data[col].method(..., inplace=True))
# emits a FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is active.
data['ClientGender'] = data['ClientGender'].fillna('Unknown')
# NOTE(review): 0 -> 900 and 900 -> 0 look like sentinel swaps for
# "no previous transaction"; confirm against the feature-engineering step.
data['DaysSinceLastTransaction'] = data['DaysSinceLastTransaction'].replace(0, 900)
data['AverageFrequencySoFar'] = data['AverageFrequencySoFar'].replace(900, 0)
# Requires TransactionDate to be a datetime column (the .dt accessor)
data['Quarter'] = data['TransactionDate'].dt.quarter

# Example: restricting to top 1000 products by total quantity sold
top_1000_products = (
    data.groupby('ProductID')['Quantity_sold']
    .sum()
    .sort_values(ascending=False)
    .head(1000)
    .index
)
# .copy() so later column assignments act on independent frames, not views
data = data[data['ProductID'].isin(top_1000_products)].copy()
stocks_df = stocks_df[stocks_df['ProductID'].isin(top_1000_products)].copy()

In [None]:
def generate_negative_samples(df, n_neg=10, random_state=None):
    """
    Generate negative samples (Label=0) for implicit feedback data
    by randomly picking products the client didn't buy.

    Parameters
    ----------
    df : pandas.DataFrame
        Positive interactions; must contain "ClientID" and "ProductID".
    n_neg : int, default 10
        Number of negative rows to draw per client.
    random_state : int or None, default None
        Seed for a dedicated NumPy Generator. When None, NumPy's global
        legacy RNG is used, so callers relying on ``np.random.seed`` keep
        getting the same draws as before.

    Returns
    -------
    pandas.DataFrame
        One row per sampled (client, product) pair. Every column other than
        "ProductID" and "Label" is copied from the client's last row in `df`.
        If no negatives can be drawn, an empty frame with df's columns plus
        "Label" is returned.

    Notes
    -----
    * Clients who bought every product in `df` have no candidates and are
      skipped (sampling from an empty pool would raise ValueError).
    * Clients with fewer than `n_neg` candidates are sampled with
      replacement, so their negatives contain duplicates.
    * NOTE(review): because the other columns come from the client's last
      purchase, any product-specific columns in `df` still describe that
      purchased product rather than the sampled one.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    all_products = np.asarray(df['ProductID'].unique())

    neg_samples = []
    # A single groupby pass yields each client's rows once, instead of
    # re-filtering the whole frame for every client.
    for _, client_rows in df.groupby("ClientID"):
        available_neg = np.setdiff1d(all_products, client_rows["ProductID"].unique())
        if available_neg.size == 0:
            continue  # client bought everything: nothing to sample
        sampled_neg = rng.choice(
            available_neg, n_neg, replace=available_neg.size < n_neg
        )
        # Take the client's last row to fill in the other feature columns
        base_info = client_rows.iloc[-1].to_dict()
        neg_samples.extend(
            {**base_info, "ProductID": neg_product, "Label": 0}
            for neg_product in sampled_neg
        )

    if not neg_samples:
        # Keep the schema so downstream pd.concat / column access still works
        columns = list(df.columns)
        if "Label" not in columns:
            columns.append("Label")
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(neg_samples)

In [None]:
from collections import defaultdict

# Step 4.1: Declare the FFM fields
# Features are grouped into "Client", "Product", "Store" and "Interaction".
FIELDS = ["Client", "Product", "Store", "Interaction"]  # as an example

# Per-field lookup table: field -> { original_value -> feature_id }.
# Each field gets its own (initially empty) dict.
feature_map = dict((name, {}) for name in FIELDS)

# Next feature ID to hand out; IDs start at 1 and are shared across all fields.
last_feature_id = 1

def get_feature_id(field, val):
    """
    Look up the integer ID for (field, val), allocating a new one on first use.

    IDs come from the module-level counter `last_feature_id`, which is
    incremented on every allocation, so an ID is never reused across fields.
    Raises KeyError if `field` is not a key of `feature_map`.
    """
    global last_feature_id
    known_values = feature_map[field]
    try:
        return known_values[val]
    except KeyError:
        pass  # first time this value is seen in this field
    assigned = last_feature_id
    known_values[val] = assigned
    last_feature_id = assigned + 1
    return assigned

# Step 4.2: Convert a single row to LibFFM format
def _numeric_or_default(row, column, default):
    """Return row[column] as float, or `default` if the column is absent or its value is missing (None/NaN/NA)."""
    raw = row.get(column, default)
    if pd.isna(raw):
        return float(default)
    return float(raw)


def row_to_ffm(row):
    """
    Serialize one sample to a LibFFM line: "<label> field:featId:val ...".

    Parameters
    ----------
    row : dict or pandas.Series
        Must provide "Label", "ClientID", "ClientGender", "ProductID",
        "Brand" and "StoreID". The numeric columns "Age",
        "product_avg_price_order" and "DaysSinceLastTransaction" are optional:
        when a column is absent or its value is missing (None/NaN/NA), the
        defaults 30, 0.0 and 900 are written instead, so "nan" never reaches
        the output file.

    Returns
    -------
    str
        The label followed by space-separated "field:feature_id:value"
        triples. Field indices are 0=Client, 1=Product, 2=Store,
        3=Interaction. Categorical features carry value 1; numeric features
        carry their float value.

    Raises
    ------
    KeyError
        If a required column is missing from `row`.

    Notes
    -----
    Feature IDs are obtained from get_feature_id, which allocates new IDs on
    first sight, so calling this function mutates the shared feature map.
    """
    label = int(row["Label"])  # 1 or 0

    ffm_parts = [str(label)]

    # ----- Client Field (field=0) -----
    # ClientID is a categorical feature: one ID per distinct client, value 1
    client_id = get_feature_id("Client", f"ClientID_{row['ClientID']}")
    ffm_parts.append(f"0:{client_id}:1")

    # Age is a numeric feature: a single feature ID whose value is the age
    age_val = _numeric_or_default(row, "Age", 30)
    age_feat = get_feature_id("Client", "AGE")
    ffm_parts.append(f"0:{age_feat}:{age_val}")

    gender_id = get_feature_id("Client", f"Gender_{row['ClientGender']}")
    ffm_parts.append(f"0:{gender_id}:1")

    # ----- Product Field (field=1) -----
    product_id = get_feature_id("Product", f"ProductID_{row['ProductID']}")
    ffm_parts.append(f"1:{product_id}:1")

    brand_id = get_feature_id("Product", f"Brand_{row['Brand']}")
    ffm_parts.append(f"1:{brand_id}:1")

    # numeric product_avg_price_order
    avgp_feat = get_feature_id("Product", "product_avg_price_order")
    avgp_val = _numeric_or_default(row, "product_avg_price_order", 0.0)
    ffm_parts.append(f"1:{avgp_feat}:{avgp_val}")

    # ----- Store Field (field=2) -----
    store_id = get_feature_id("Store", f"StoreID_{row['StoreID']}")
    ffm_parts.append(f"2:{store_id}:1")

    # ----- Interaction Field (field=3) -----
    days_id = get_feature_id("Interaction", "DaysSinceLastTransaction")
    days_val = _numeric_or_default(row, "DaysSinceLastTransaction", 900)
    ffm_parts.append(f"3:{days_id}:{days_val}")

    # Other numeric "Interaction" columns (cumulative spend, frequency, ...)
    # would be appended here following the same pattern.

    return " ".join(ffm_parts)

def df_to_libffm(df, output_file):
    """
    Write `df` to `output_file` in LibFFM format, one line per row.

    Rows are visited in frame order via iterrows() and serialized by
    row_to_ffm(...); any existing file at `output_file` is overwritten.
    """
    with open(output_file, "w") as sink:
        sink.writelines(
            row_to_ffm(record) + "\n" for _, record in df.iterrows()
        )


In [None]:
# The warm-up period is the first 30 days of the transaction history
start_day = data['TransactionDate'].min().normalize()
warmup_end = start_day + pd.Timedelta(days=30)

# 5.1 Warm-up Data
warmup_data = data[data['TransactionDate'] < warmup_end]
warmup_positives = warmup_data.assign(Label=1)
# generate_negative_samples draws from NumPy's global RNG; seed it so the
# warm-up training file is reproducible (the shuffle below is already seeded).
np.random.seed(42)
warmup_negatives = generate_negative_samples(warmup_data, n_neg=10)
warmup_full = pd.concat([warmup_positives, warmup_negatives], ignore_index=True)
warmup_full = warmup_full.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle

# Convert to LibFFM format
warmup_file = "warmup_data.ffm"
df_to_libffm(warmup_full, warmup_file)

# 5.2 Define and Train
model = xl.create_ffm()  # field-aware factorization machine
model.setTrain(warmup_file)

param = {
    "task": "binary",     # logistic
    "lr": 0.01,           # learning rate
    "lambda": 0.00002,    # reg
    "metric": "auc",      # for example
    "epoch": 5,           # how many passes
    "nthread": 8          # multi-threading (xLearn's key is "nthread", not "thread")
}

# Train FFM; the fitted model is written to model_out.bin
model.fit(param, "model_out.bin")


In [None]:
# Weekly checkpoints from the end of the warm-up period to the last transaction
all_days = pd.date_range(
    start=warmup_end.normalize(),
    end=data['TransactionDate'].max().normalize(),
    freq='W'  # or daily, as you like
)

# Path of the most recent model; each iteration warm-starts from it
current_model = "model_out.bin"

results = []

# generate_negative_samples draws from NumPy's global RNG; seed it once so the
# whole sequence of weekly training files is reproducible.
np.random.seed(42)

for day in all_days:
    day_tag = day.strftime('%Y%m%d')

    # Trailing 30-day window, clipped so it never reaches back into warm-up data
    window_start = max(day - pd.Timedelta(days=30), warmup_end)

    train_subset = data[
        (data['TransactionDate'] >= window_start) &
        (data['TransactionDate'] <= day)
    ]
    if train_subset.empty:
        continue

    # Generate positives + negatives
    pos = train_subset.assign(Label=1)
    neg = generate_negative_samples(train_subset, n_neg=10)
    window_full = pd.concat([pos, neg], ignore_index=True)
    window_full = window_full.sample(frac=1, random_state=42).reset_index(drop=True)

    # Convert to FFM format
    train_file = f"train_{day_tag}.ffm"
    df_to_libffm(window_full, train_file)

    # Create a new xLearn FFM object for the update
    model_update = xl.create_ffm()
    model_update.setTrain(train_file)

    # Warm-start from the previous model. setPreModel is the xLearn mechanism
    # for this; the param dict has no "init_model" key, so it is not passed there.
    model_update.setPreModel(current_model)

    update_param = {
        "task": "binary",
        "lr": 0.01,
        "lambda": 0.00002,
        "metric": "auc",
        "epoch": 5,
        "nthread": 8,  # xLearn's key is "nthread", not "thread"
    }

    # Continue training. Fit to a temp file and publish it under the dated name
    # only after training succeeded; os.replace also overwrites a stale file
    # left by a previous run, which os.rename refuses to do on Windows.
    model_update.fit(update_param, "temp_model.bin")
    current_model = f"model_{day_tag}.bin"
    os.replace("temp_model.bin", current_model)

    # (Pseudo) Evaluate on the same day or next day
    # (in practice, you'd convert test set to .ffm, then predict)
    # TODO: replace the placeholder accuracy with a real evaluation.
    day_result = {"day": day, "accuracy": 0.0}  # placeholder
    results.append(day_result)

# Eventually combine results
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
def generate_recommendations_xlearn(client_id, day, n_top=5, model_path="model_out.bin"):
    """
    Placeholder for top-N recommendation with a trained xLearn model.

    Not implemented yet: the function currently does nothing and returns None.

    Intended steps:
    1. Construct a DataFrame of (client, product) pairs on the given day,
       e.g. candidate products from 'stocks_df', with client features taken
       from 'clients_df' or the client's last known transaction in 'data'.
       Each row gets a dummy label of 0, since only the fields matter for scoring.
    2. Convert to FFM format (similar to "df_to_libffm", but for inference).
    3. Use the xLearn model to predict.
    4. Sort by predicted score, return top N.
    """
    return None