# LightFM Demo v4 — Robust mapping (auto categories + period normalization)


In [None]:
# !pip install lightfm  # uncomment if needed

In [None]:
import numpy as np, pandas as pd, re
from pathlib import Path
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score

# Directory that holds every CSV read by this notebook.
DATA = Path('data')

# Load the four input tables in a fixed order: interactions, user features,
# item features, and seasonality features.
inter, users, items, season = (
    pd.read_csv(DATA / csv_name)
    for csv_name in (
        'interactions_lightfm.csv',
        'user_features.csv',
        'item_features.csv',
        'seasonality_features.csv',
    )
)

# Period labels listed from month-12 down to month-1.
FULL_PERIODS = ['month-' + str(12 - offset) for offset in range(12)]

def norm_period(v: str) -> str:
    """Normalize a period label to the canonical ``month-<k>`` form.

    The value is stringified, stripped, lower-cased, and underscores are
    replaced by hyphens. If the result ends with ``month`` followed by an
    optional hyphen or space and a run of digits, the number is clamped to
    the range 1..12 and returned as ``month-<k>``. Any other value is
    returned in its cleaned-up form without further changes.
    """
    cleaned = str(v).strip().lower().replace('_', '-')
    found = re.search(r'(?:month[- ]?)(\d+)$', cleaned)
    if found is None:
        return cleaned
    # Clamp out-of-range month numbers into 1..12.
    month_no = min(12, max(1, int(found.group(1))))
    return f'month-{month_no}'

# Normalize IDs so customer keys compare equal across the two tables.
inter['customer_id'] = inter['customer_id'].astype(str).str.strip()
users['customer_id'] = users['customer_id'].astype(str).str.strip()

# Normalize period forms
inter['period'] = inter['period'].map(norm_period)
season['period'] = season['period'].map(norm_period)

# Complete season months.
# Different raw spellings can collapse onto the same normalized period; keep a
# single row per period so that a later `season.loc[period]` lookup yields one
# row rather than a DataFrame.
season = season.drop_duplicates(subset='period', keep='first')
full = pd.DataFrame({'period': FULL_PERIODS})
season = full.merge(season, on='period', how='left')
for col in ['top_merchant', 'avg_ticket_size_top_merchant', 'promo_intensity', 'months_to_eid', 'months_to_newyear']:
    if col in season.columns:
        # Fill months missing from the CSV from their neighbours in FULL_PERIODS order.
        season[col] = season[col].ffill().bfill()

# Auto categories from data (missing categories are excluded)
cats_inter = inter['merchant_category'].dropna().astype(str).str.strip().unique().tolist()
cats_items = items['merchant_category'].dropna().astype(str).str.strip().unique().tolist()
CATEGORIES = sorted(set(cats_inter) | set(cats_items))
PERIODS = FULL_PERIODS

# Contextualized items: "<category>__<period>"
inter['item_ctx'] = inter['merchant_category'].astype(str).str.strip() + '__' + inter['period']

# Build full mapping universes
all_user_ids = sorted(pd.unique(pd.concat([users['customer_id'], inter['customer_id']])))
all_item_ids = [f"{c}__{p}" for c in CATEGORIES for p in PERIODS]

# Drop interactions that cannot be mapped to a known item id: rows with a
# missing category (stringified to 'nan' above) or a period outside
# FULL_PERIODS. Dataset.build_interactions raises on ids it was not fitted with.
known_item_ids = set(all_item_ids)
mappable = inter['merchant_category'].notna() & inter['item_ctx'].isin(known_item_ids)
n_unmappable = int((~mappable).sum())
if n_unmappable:
    print(f'Dropped {n_unmappable} interaction rows with a missing category or unrecognized period')
    inter = inter[mappable].copy()

dataset = Dataset()
dataset.fit(users=all_user_ids, items=all_item_ids)

# User features
users_feat = users.copy()
if 'casa_log' not in users_feat.columns and 'casa_balance' in users_feat.columns:
    users_feat['casa_log'] = np.log1p(users_feat['casa_balance'])
if 'casa_log' in users_feat.columns:
    # Quantile-bin the log balance into up to 5 bins; the interval label is the token.
    users_feat['casa_bin'] = pd.qcut(users_feat['casa_log'], q=5, duplicates='drop').astype(str)
else:
    # Neither casa_log nor casa_balance is available: use the same 'NA'
    # placeholder as the other optional user columns instead of failing.
    users_feat['casa_bin'] = 'NA'
user_features_tokens = [
    (
        r['customer_id'],
        [
            f"age_band:{r.get('age_band','NA')}",
            f"years_band:{r.get('years_band','NA')}",
            f"casa_bin:{r['casa_bin']}",
        ],
    )
    for _, r in users_feat.iterrows()
]

# Item features
# Keys are stripped exactly like CATEGORIES so that lookups by category name
# hit even when the CSV values carry surrounding whitespace.
item_cat_keys = items['merchant_category'].astype(str).str.strip()
price_map = dict(zip(item_cat_keys, items['price_band'].astype(str))) if 'price_band' in items.columns else {}
# Missing ticket values are left out so consumers fall back to their default
# instead of trying to convert NaN to int.
avg_ticket = (
    {cat: ticket for cat, ticket in zip(item_cat_keys, items['avg_ticket_idr']) if pd.notna(ticket)}
    if 'avg_ticket_idr' in items.columns
    else {}
)
season = season.set_index('period')

def season_tokens(period):
    """Return the seasonality feature tokens for one period.

    Looks ``period`` up in the module-level ``season`` table and emits a
    ``period:`` token, plus a ``top_merchant:`` token and 1..3 bucket tokens
    for ``months_to_eid`` / ``months_to_newyear`` when those fields are
    present in the looked-up row.
    """
    season_row = season.loc[period]

    def _bucket(months):
        # 1 + (12 - months) // 4, clamped into the range 1..3.
        raw_bucket = 1 + (12 - int(months)) // 4
        return int(min(3, max(1, raw_bucket)))

    tokens = [f"period:{period}"]
    if 'top_merchant' in season_row:
        tokens.append(f"top_merchant:{season_row['top_merchant']}")
    if 'months_to_eid' in season_row:
        tokens.append(f"eid_bucket:{_bucket(season_row['months_to_eid'])}")
    if 'months_to_newyear' in season_row:
        tokens.append(f"ny_bucket:{_bucket(season_row['months_to_newyear'])}")
    return tokens

def _avg_ticket_bin(category, default=100000):
    """Return int(log10(ticket)) for a category's average ticket size.

    Falls back to ``default`` when the category has no entry in ``avg_ticket``
    or its value cannot be converted to an int (NaN, inf, non-numeric).
    """
    try:
        ticket = int(avg_ticket.get(category, default))
    except (TypeError, ValueError, OverflowError):
        ticket = default
    # max(1, ...) keeps log10 defined for zero or negative tickets.
    return int(np.log10(max(1, ticket)))


# Seasonality tokens depend only on the period, so compute them once per
# period instead of once per (category, period) pair.
period_tokens = {p: season_tokens(p) for p in PERIODS}

item_features_tokens = []
for c in CATEGORIES:
    # Category-level tokens are shared by all of the category's period items.
    category_tokens = [f'cat:{c}', f'price:{price_map.get(c, "Medium")}', f'avg_ticket_bin:{_avg_ticket_bin(c)}']
    for p in PERIODS:
        item_features_tokens.append((f"{c}__{p}", category_tokens + period_tokens[p]))

# Register every distinct feature name before building the feature matrices.
all_user_feats = sorted({f for _, fs in user_features_tokens for f in fs})
all_item_feats = sorted({f for _, fs in item_features_tokens for f in fs})
dataset.fit_partial(user_features=all_user_feats, item_features=all_item_feats)

u_features = dataset.build_user_features(user_features_tokens, normalize=False)
i_features = dataset.build_item_features(item_features_tokens, normalize=False)

# Split interactions: month-1 is held out for validation, all other periods train.
train_df = inter[inter['period'] != 'month-1'].copy()
valid_df = inter[inter['period'] == 'month-1'].copy()

train_tuples = list(zip(train_df['customer_id'], train_df['item_ctx'], train_df['weight_ui'].astype(float)))
valid_tuples = list(zip(valid_df['customer_id'], valid_df['item_ctx'], valid_df['weight_ui'].astype(float)))

# build_interactions also returns a weights matrix; it is kept under a private
# name and not passed to fit() below.
(interactions_train, _wt_train) = dataset.build_interactions(train_tuples)
(interactions_valid, _wt_valid) = dataset.build_interactions(valid_tuples)

model = LightFM(loss='warp', no_components=64, learning_rate=0.05, random_state=42)
model.fit(interactions_train, user_features=u_features, item_features=i_features, epochs=20, num_threads=4)

val_auc = auc_score(model, interactions_valid, user_features=u_features, item_features=i_features, num_threads=4).mean()
print('Validation AUC (month-1):', float(val_auc))

# External id / feature name -> internal index mappings, used at inference time.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [None]:
# Inference with target month
k = 5  # number of recommendations kept per user
target_month = 2  # change to the month you want (1..12)

# Category names are the part of each mapped item id before the first '__'.
CATEGORIES = sorted({item_key.partition('__')[0] for item_key in item_id_map})

# Candidate items for the target month, restricted to ids the dataset knows.
current_items = [
    candidate
    for candidate in (f"{category}__month-{target_month}" for category in CATEGORIES)
    if candidate in item_id_map
]

import numpy as np

def topk_for_user(uid):
    """Return up to ``k`` ``(item_id, score)`` pairs for one user.

    Scores every item in ``current_items`` with the fitted model and returns
    the best ``k`` in descending score order. A ``uid`` that is absent from
    ``user_id_map`` yields an empty list.
    """
    try:
        user_index = user_id_map[uid]
    except KeyError:
        return []
    candidate_idx = np.array([item_id_map[name] for name in current_items], dtype=np.int32)
    scores = model.predict(user_index, candidate_idx, user_features=u_features, item_features=i_features)
    # Negating the scores makes argsort return the highest score first.
    best_positions = np.argsort(-scores)[:k]
    ranked = []
    for pos in best_positions:
        ranked.append((current_items[pos], float(scores[pos])))
    return ranked

# Score every customer listed in the user feature table.
all_users = users['customer_id'].astype(str).tolist()


def _iter_topk_rows():
    """Yield one output record per (customer, ranked recommendation)."""
    period_label = f'month-{target_month}'
    for customer in all_users:
        ranked = topk_for_user(customer)
        for position, (ctx_item, ctx_score) in enumerate(ranked, start=1):
            # Item ids have the form "<category>__<period>"; only the category is exported.
            category, _period = ctx_item.split('__', 1)
            yield {
                'customer_id': customer,
                'merchant_category': category,
                'period_scored': period_label,
                'rank': position,
                'score': ctx_score,
            }


rows = list(_iter_topk_rows())

out_df = pd.DataFrame(rows)
print('TopK rows:', len(out_df))
from pathlib import Path
# Write the recommendations next to the input CSVs, one file per target month.
out_path = Path('data') / f'topk_month_{target_month}.csv'
out_df.to_csv(out_path, index=False)
print('Saved:', str(out_path))

# Notebook preview of the first rows.
out_df.head(20)