# LightFM Demo v3 — Seasonality-aware, cold-user ready
• Registers ALL customers and ALL (category × period) items
• Completes missing months in seasonality
• Uses weighted interaction tuples (no `weights=` argument)
• `target_month` control & Top-K CSV export for ALL users


In [None]:
# !pip install lightfm  # uncomment if needed

In [None]:
import numpy as np, pandas as pd
from pathlib import Path
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score

# Input tables: all four CSVs are read from the DATA directory.
DATA = Path('data')
inter = pd.read_csv(DATA / 'interactions_lightfm.csv')
users = pd.read_csv(DATA / 'user_features.csv')
items = pd.read_csv(DATA / 'item_features.csv')
season = pd.read_csv(DATA / 'seasonality_features.csv')

# Normalize IDs so the same customer compares equal across both tables
inter['customer_id'] = inter['customer_id'].astype(str).str.strip()
users['customer_id'] = users['customer_id'].astype(str).str.strip()

# Contextualized items: every (category, period) pair is its own LightFM item
inter['item_ctx'] = inter['merchant_category'] + '__' + inter['period']

# Categories in item-table order, followed by any category that appears only in
# the interactions. Registering the union keeps those interactions mappable
# (the feature builder already has fallbacks for categories missing from the
# item table), and de-duplicating avoids emitting the same item's features twice.
CATEGORIES = pd.unique(
    pd.concat([items['merchant_category'], inter['merchant_category']]).dropna()
).tolist()
PERIODS = [f'month-{i}' for i in range(12, 0, -1)]  # 'month-12' down to 'month-1'

# Register ALL users (feature table + interactions) and ALL items (cat x period)
all_user_ids = sorted(pd.unique(pd.concat([users['customer_id'], inter['customer_id']])))
all_item_ids = [f"{c}__{p}" for c in CATEGORIES for p in PERIODS]

dataset = Dataset()
dataset.fit(users=all_user_ids, items=all_item_ids)

# User features: one (customer_id, [tokens]) pair per row of the user table.
users_feat = users.copy()
# Quantile-bin casa_log into at most 5 bins; duplicate bin edges are dropped.
users_feat['casa_bin'] = pd.qcut(users_feat['casa_log'], q=5, duplicates='drop').astype(str)
# Zip over the columns instead of iterrows(): same tuples, no per-row Series.
user_features_tokens = [
    (cid, [f"age_band:{age}", f"years_band:{years}", f"casa_bin:{casa}"])
    for cid, age, years, casa in zip(
        users_feat['customer_id'],
        users_feat['age_band'],
        users_feat['years_band'],
        users_feat['casa_bin'],
    )
]

# Item features (base + seasonality): per-category lookups and the seasonality
# table keyed by period, used when building the item feature tokens.
price_map = dict(zip(items['merchant_category'], items['price_band']))
avg_ticket = dict(zip(items['merchant_category'], items['avg_ticket_idr']))
season = season.set_index('period')

def _proximity_bucket(months_to_event):
    """Bucket a months-until-event count into 1..3, or None if it is missing.

    With integer counts, 0-4 months maps to 3, 5-8 months to 2 and anything
    larger to 1.
    """
    if pd.isna(months_to_event):
        return None
    return int(min(3, max(1, 1 + (12 - int(months_to_event)) // 4)))


def season_tokens(period):
    """Return the seasonality feature tokens for one period label.

    Args:
        period: Period label, looked up in the index of the module-level
            ``season`` table.

    Returns:
        A list of string tokens, always starting with ``period:<period>``.
        When ``season`` has a row for the period, ``top_merchant:``,
        ``eid_bucket:`` and ``ny_bucket:`` tokens follow in that order.
        A period with no row yields only the period token (instead of raising
        ``KeyError``), and a bucket token is omitted when its months-to value
        is missing.
    """
    tokens = [f"period:{period}"]
    if period not in season.index:
        # No seasonality row for this month: keep the item usable with the
        # period token alone rather than aborting the whole feature build.
        return tokens

    row = season.loc[period]
    tokens.append(f"top_merchant:{row['top_merchant']}")
    for token_name, column in (('eid_bucket', 'months_to_eid'),
                               ('ny_bucket', 'months_to_newyear')):
        bucket = _proximity_bucket(row[column])
        if bucket is not None:
            tokens.append(f"{token_name}:{bucket}")
    return tokens

# Item feature tokens: category-level tokens (category, price band, order of
# magnitude of the average ticket) plus the seasonality tokens of the period.
# Both parts are computed once per category / once per period, not per pair.
_period_tokens = {p: season_tokens(p) for p in PERIODS}
item_features_tokens = []
for c in CATEGORIES:
    # Categories without a row in the item table fall back to "Medium" / 100000.
    category_tokens = [
        f'cat:{c}',
        f'price:{price_map.get(c, "Medium")}',
        f'avg_ticket_bin:{int(np.log10(max(1, int(avg_ticket.get(c, 100000)))))}',
    ]
    for p in PERIODS:
        item_features_tokens.append((f"{c}__{p}", category_tokens + _period_tokens[p]))

# Register every feature token, then build the feature matrices
all_user_feats = sorted({f for _, fs in user_features_tokens for f in fs})
all_item_feats = sorted({f for _, fs in item_features_tokens for f in fs})
dataset.fit_partial(user_features=all_user_feats, item_features=all_item_feats)

u_features = dataset.build_user_features(user_features_tokens, normalize=False)
i_features = dataset.build_item_features(item_features_tokens, normalize=False)

# Interactions split: train on month-12..month-2, validate on month-1
train_df = inter[inter['period'] != 'month-1'].copy()
valid_df = inter[inter['period'] == 'month-1'].copy()

# (user, item, weight) triples; build_interactions returns an interaction
# matrix together with a matching weight matrix.
train_tuples = list(zip(train_df['customer_id'], train_df['item_ctx'], train_df['weight_ui'].astype(float)))
valid_tuples = list(zip(valid_df['customer_id'], valid_df['item_ctx'], valid_df['weight_ui'].astype(float)))

(interactions_train, _wt_train) = dataset.build_interactions(train_tuples)
(interactions_valid, _wt_valid) = dataset.build_interactions(valid_tuples)

model = LightFM(loss='warp', no_components=64, learning_rate=0.05, random_state=42)
# Pass the weight matrix as sample_weight: without it every interaction is
# trained with weight 1.0 and the weight_ui column has no effect on the model.
# NOTE(review): assumes weight_ui is non-negative and reasonably scaled - confirm.
model.fit(
    interactions_train,
    user_features=u_features,
    item_features=i_features,
    sample_weight=_wt_train,
    epochs=20,
    num_threads=4,
)

val_auc = auc_score(model, interactions_valid, user_features=u_features, item_features=i_features, num_threads=4).mean()
print('Validation AUC (month-1):', float(val_auc))

# External id / feature name -> internal index lookups used at inference time
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [None]:
# Inference controls
# target_month: month number (1..12) whose items are scored
# k: number of recommendations kept per user
target_month = 1
k = 5

# Candidate items for the chosen month: one per category, kept only when the
# contextualized id is present in the item id mapping.
current_items = [
    item_ctx
    for item_ctx in (f"{c}__month-{target_month}" for c in CATEGORIES)
    if item_ctx in item_id_map
]

import numpy as np

def topk_for_user(uid):
    """Score the current month's candidate items for one user and keep the best k.

    Args:
        uid: External customer id, looked up in ``user_id_map``.

    Returns:
        A list of ``(item_ctx, score)`` pairs ordered by descending score and
        truncated to ``k`` entries; an empty list when ``uid`` is not in
        ``user_id_map``.
    """
    user_index = user_id_map.get(uid)
    if user_index is None:
        return []

    # Internal indices of the candidate items, in the order of current_items
    candidate_indices = np.array(
        [item_id_map[item_ctx] for item_ctx in current_items],
        dtype=np.int32,
    )
    scores = model.predict(
        user_index,
        candidate_indices,
        user_features=u_features,
        item_features=i_features,
    )
    # Negate so that argsort's ascending order puts the highest score first
    best_positions = np.argsort(-scores)[:k]
    return [(current_items[pos], float(scores[pos])) for pos in best_positions]

# Export Top-K for all users.
# Customers come from the user feature table first (file order kept), followed
# by customers that appear only in the interactions; duplicates are removed so
# each customer is exported once.
all_users = pd.unique(pd.concat([users['customer_id'], inter['customer_id']])).tolist()
out_columns = ['customer_id', 'merchant_category', 'period_scored', 'rank', 'score']
period_scored = f'month-{target_month}'

rows = []
for uid in all_users:
    for rank, (item_ctx, score) in enumerate(topk_for_user(uid), start=1):
        # Split from the right: the 'month-N' suffix never contains '__', while
        # a category name might.
        cat, _period = item_ctx.rsplit('__', 1)
        rows.append({
            'customer_id': uid,
            'merchant_category': cat,
            'period_scored': period_scored,
            'rank': rank,
            'score': score,
        })

# Explicit columns keep the CSV header intact even when there are no rows
out_df = pd.DataFrame(rows, columns=out_columns)
print('TopK rows:', len(out_df))

out_path = DATA / f'topk_month_{target_month}.csv'
out_df.to_csv(out_path, index=False)
print('Saved:', str(out_path))

# Last expression of the cell: preview of the first 20 exported rows
out_df.head(20)