In [2]:
# ---------- STEP 0: IMPORTS (run this first) ----------
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import pickle, joblib

print("âœ” Libraries loaded successfully!")
print("pandas:", pd.__version__, "| numpy:", np.__version__)




âœ” Libraries loaded successfully!
pandas: 2.3.3 | numpy: 1.26.4


In [13]:
# -------- QUICK RELOAD: Steps 1â€“5 outputs --------

import pandas as pd
import joblib
import pickle

print("Reloading saved objects...")

# 1) Load transactions FAST
transactions = pd.read_csv(
    "data/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={"article_id": str}
)
transactions["article_id"] = transactions["article_id"].astype(int)
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])

# 2) Load articles & customers (optional for Step 6+)
articles = pd.read_csv("data/h-and-m-personalized-fashion-recommendations/articles.csv")
customers = pd.read_csv("data/h-and-m-personalized-fashion-recommendations/customers.csv")

print("CSV data loaded.")

# 3) Load user summary (Step 2 output)
user_summary = pd.read_parquet("models/user_summary.parquet")
print("user_summary loaded")

# 4) Load ALS model + encoders (Step 4)
user_encoder = joblib.load("models/user_encoder.joblib")
item_encoder = joblib.load("models/item_encoder.joblib")

with open("models/als_model.pkl", "rb") as f:
    als_model = pickle.load(f)

print("ALS model + encoders loaded")

# 5) Load co-purchase model (Step 5)
co_purchase = joblib.load("models/co_purchase.joblib")
print("Co-purchase model loaded")

print("\nâœ” Reload complete â€” You can continue from Step 6!")


Reloading saved objects...
CSV data loaded.
user_summary loaded
ALS model + encoders loaded
Co-purchase model loaded

âœ” Reload complete â€” You can continue from Step 6!


In [None]:
# ---------- STEP 1: Load Transactions, Articles, Customers ----------

from pathlib import Path
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# File paths (update only if your folder structure is different)
TXN_PATH = "data/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
ART_PATH = "data/h-and-m-personalized-fashion-recommendations/articles.csv"
CUST_PATH = "data/h-and-m-personalized-fashion-recommendations/customers.csv"

print("Loading transactions...")
transactions = pd.read_csv(TXN_PATH, low_memory=False)
print("âœ” transactions loaded:", transactions.shape)

print("Loading articles...")
articles = pd.read_csv(ART_PATH, low_memory=False)
print("âœ” articles loaded:", articles.shape)

print("Loading customers...")
customers = pd.read_csv(CUST_PATH, low_memory=False)
print("âœ” customers loaded:", customers.shape)

# Convert t_dat to datetime
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
print("\nâœ” t_dat converted to datetime.")

# Print useful info
print("\n--- Data Summary ---")
print("Transactions:", transactions.shape)
print("Articles:", articles.shape)
print("Customers:", customers.shape)

print("\nDate range:", transactions['t_dat'].min(), "â†’", transactions['t_dat'].max())

# Approx memory
def mem(df):
    return df.memory_usage(deep=True).sum() / 1024**2

print("\nMemory used:")
print("transactions:", f"{mem(transactions):.2f} MB")
print("articles:", f"{mem(articles):.2f} MB")
print("customers:", f"{mem(customers):.2f} MB")

# Save clean parquet (super fast for reloading later)
clean_dir = DATA_DIR / "clean"
clean_dir.mkdir(exist_ok=True)

transactions.to_parquet(clean_dir / "transactions.parquet", index=False)
articles.to_parquet(clean_dir / "articles.parquet", index=False)
customers.to_parquet(clean_dir / "customers.parquet", index=False)

print(f"\nâœ” Clean parquet files saved to: {clean_dir.resolve()}")
print("\nSTEP 1 DONE âœ”")




In [None]:
# ---------- STEP 2: Popularity + User Summary ----------

print("\nSTEP 2: Building Popularity & User Summary Features...")

# --------------------------------------------------
# 1) Popularity (frequency + recency-weighted)
# --------------------------------------------------

print("Computing popularity scores...")

# Days since purchase (for recency weighting)
max_date = transactions['t_dat'].max()
transactions['recency_days'] = (max_date - transactions['t_dat']).dt.days

# Frequency
pop_freq = transactions.groupby('article_id').size()

# Recency weight = 1 / (1 + recency_days)
transactions['recency_w'] = 1 / (1 + transactions['recency_days'])
pop_recency = transactions.groupby('article_id')['recency_w'].sum()

# Combine to popularity score
popularity = (
    pop_freq.rank(method="dense", ascending=False) * 0.5 +
    pop_recency.rank(method="dense", ascending=False) * 0.5
)

top12_popular = popularity.sort_values().head(12).index.tolist()

print("âœ” Popularity features computed.")
print("Top 12 popular items:", top12_popular)

# --------------------------------------------------
# 2) Build User Summary Features
# --------------------------------------------------

print("\nComputing USER SUMMARY features...")

user_summary = transactions.groupby("customer_id").agg(
    total_txn=('article_id', 'count'),
    avg_price=('price', 'mean'),
    last_purchase=('t_dat', 'max')
)

# recency of user
user_summary['recency_days'] = (max_date - user_summary['last_purchase']).dt.days

print("\nExtracting last 5 items per user (SAFE VERSION)...")

# Sort newest â†’ oldest
tx_sorted = transactions.sort_values(["customer_id", "t_dat"], ascending=[True, False])

# Take top 5 rows per customer
tx_top5 = tx_sorted.groupby("customer_id").head(5)

# Now group safely
last_items = tx_top5.groupby("customer_id")["article_id"].apply(list)

# Assign into user_summary
user_summary["last_5_items"] = last_items

print("âœ” Last 5 items extracted safely.")
print("User summary shape:", user_summary.shape)

print("\nSTEP 2 DONE âœ”")



In [12]:
# ---------- STEP 3: POPULARITY RECOMMENDER + BASELINE MAP@12 ----------

print("\nSTEP 3: Popularity MAP@12 Baseline")

# ----------------------------------------------------------
# 1) Create train/test split by date
# ----------------------------------------------------------

max_date = transactions['t_dat'].max()
test_date = max_date - pd.Timedelta(days=7)   # last 7 days as test

train_tx = transactions[transactions['t_dat'] < test_date]
test_tx  = transactions[transactions['t_dat'] >= test_date]

print("Train date max:", train_tx['t_dat'].max())
print("Test date min:",  test_tx['t_dat'].min())

# ----------------------------------------------------------
# 2) Build ground truth for MAP
# ----------------------------------------------------------

gt = (
    test_tx.groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

print("Ground truth users:", len(gt))

# ----------------------------------------------------------
# 3) Popularity top-12 from **training** only
# ----------------------------------------------------------

pop_train = train_tx.groupby('article_id').size().sort_values(ascending=False)
top12_train = pop_train.head(12).index.tolist()

print("Top 12 popular items (train):", top12_train)

# ----------------------------------------------------------
# 4) MAP@12 implementation
# ----------------------------------------------------------

def apk(actual, predicted, k=12):
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    hits = 0
    for i, p in enumerate(predicted):
        if p in actual:
            hits += 1
            score += hits / (i + 1)
    return score / k

def mapk(ground_truth, predictions, k=12):
    return np.mean([apk(ground_truth[u], predictions[u], k) for u in ground_truth])

# ----------------------------------------------------------
# 5) Predict using popularity recommender
# ----------------------------------------------------------

preds = {uid: top12_train for uid in gt.keys()}

baseline_map12 = mapk(gt, preds, k=12)

print(f"\nâœ” Baseline POPULARITY MAP@12 = {baseline_map12:.5f}")

print("\nSTEP 3 DONE âœ”")



STEP 3: Popularity MAP@12 Baseline
Train date max: 2020-09-14 00:00:00
Test date min: 2020-09-15 00:00:00
Ground truth users: 75481
Top 12 popular items (train): [706016001, 706016002, 372860001, 610776002, 759871002, 464297007, 372860002, 610776001, 399223001, 720125001, 706016003, 156231001]

âœ” Baseline POPULARITY MAP@12 = 0.00078

STEP 3 DONE âœ”


In [5]:
# ---------- STEP 4: ALS MODEL (FINAL FIXED + ROBUST VERSION) ----------
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import pickle, joblib, time, os

print("\nSTEP 4: ALS Collaborative Filtering (FINAL FIXED VERSION)")

# -------------------------------------------------------------------
# 0) Safety Checks
# -------------------------------------------------------------------
assert 'transactions' in globals(), "ERROR: Run STEP 1 first."
assert 'user_summary' in globals(), "ERROR: Run STEP 2 first."

# -------------------------------------------------------------------
# 1) Build LabelEncoders for Users & Items
# -------------------------------------------------------------------
print("\nBuilding LabelEncoders...")
t0 = time.time()

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_encoder.fit(transactions["customer_id"])
item_encoder.fit(transactions["article_id"])

n_users = len(user_encoder.classes_)
n_items = len(item_encoder.classes_)

print(f"Users: {n_users:,}, Items: {n_items:,}  (built in {time.time()-t0:.1f}s)")

# -------------------------------------------------------------------
# 2) Build USER Ã— ITEM CSR Matrix (CRITICAL FIX)
# -------------------------------------------------------------------
print("\nBuilding USERâ€“ITEM matrix (correct orientation for ALS)...")

user_idx = user_encoder.transform(transactions["customer_id"])
item_idx = item_encoder.transform(transactions["article_id"])

data = np.ones(len(user_idx), dtype=np.float32)

# âœ”âœ” Correct orientation: rows = users, columns = items
user_item_matrix = csr_matrix(
    (data, (user_idx, item_idx)),
    shape=(n_users, n_items)
)

print("USERâ€“ITEM matrix shape:", user_item_matrix.shape)

# -------------------------------------------------------------------
# 3) Train ALS model (implicit requires USER Ã— ITEM)
# -------------------------------------------------------------------
print("\nTraining ALS model (15 iterations)...")
als_model = AlternatingLeastSquares(
    factors=64,
    regularization=0.1,
    iterations=15,
    use_gpu=False
)

t0 = time.time()
als_model.fit(user_item_matrix)
print(f"âœ” ALS training complete in {time.time()-t0:.1f}s")

# Save factors
user_factors = als_model.user_factors      # (n_users, factors)
item_factors = als_model.item_factors      # (n_items, factors)

# -------------------------------------------------------------------
# 4) SAFE RECOMMENDER FUNCTION â€” NO implicit.recommend()
# -------------------------------------------------------------------
print("\nPreparing safe factor-based recommender...")

def recommend_for_encoded_user(enc_uid, N=12):
    """Return top-N item indices for encoded user."""
    u_vec = user_factors[enc_uid]   # (factors,)
    scores = item_factors.dot(u_vec)   # (n_items,)

    # mask purchased items
    purchased = user_item_matrix[enc_uid].toarray().ravel()
    already = np.where(purchased > 0)[0]
    scores[already] = -np.inf

    # top-N
    top_idx = np.argpartition(-scores, N)[:N]
    top_idx = top_idx[np.argsort(-scores[top_idx])]
    return top_idx

# -------------------------------------------------------------------
# 5) Select sample users for testing
# -------------------------------------------------------------------
print("\nSelecting users with >0 transactions...")

valid_users = user_summary[user_summary["total_txn"] > 0].index.values
valid_users_enc = user_encoder.transform(valid_users)

sample_users_enc = np.random.choice(valid_users_enc, size=5, replace=False)
sample_users_orig = user_encoder.inverse_transform(sample_users_enc)

print("Sample users:", sample_users_orig)

# -------------------------------------------------------------------
# 6) Test ALS Recommendations (SAFE)
# -------------------------------------------------------------------
print("\nTesting ALS recommendations (safe method):")

for enc_uid, orig_uid in zip(sample_users_enc, sample_users_orig):
    top_items_enc = recommend_for_encoded_user(enc_uid, N=12)
    rec_articles = item_encoder.inverse_transform(top_items_enc)

    print(f"User {orig_uid[:10]} â†’ {list(rec_articles)}")

# -------------------------------------------------------------------
# 7) Save Model + Encoders
# -------------------------------------------------------------------
print("\nSaving model and encoders...")

os.makedirs("models", exist_ok=True)
joblib.dump(user_encoder, "models/user_encoder.joblib")
joblib.dump(item_encoder, "models/item_encoder.joblib")

with open("models/als_model.pkl", "wb") as f:
    pickle.dump(als_model, f)

print("\nSTEP 4 DONE âœ” (ALS model + encoders saved)")



STEP 4: ALS Collaborative Filtering (FINAL FIXED VERSION)

Building LabelEncoders...
Users: 1,362,281, Items: 104,547  (built in 10.8s)

Building USERâ€“ITEM matrix (correct orientation for ALS)...
USERâ€“ITEM matrix shape: (1362281, 104547)

Training ALS model (15 iterations)...


  check_blas_config()
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15/15 [01:29<00:00,  5.97s/it]


âœ” ALS training complete in 91.3s

Preparing safe factor-based recommender...

Selecting users with >0 transactions...
Sample users: ['6dea99c7816e2c2fae5f974d025b0af044c3b14f1f9aff0512f23bfae770057e'
 '21cc4c681bed1269fe65b57fb8e024cd64a6054c74a436aff806edb89b897dc5'
 '9ea1305c846b8256a4109e8e05d882de7c01448e95d7a6c44b10e64c9b27928f'
 '47f489c3dfaf7d700afdb7698af3ce4505f180a90c97097566229c8d3f68bd37'
 '2c6b276b22684ff1a0e5eabd1bfe6cf71fbfb0418c9742d6639d68fa71a6636d']

Testing ALS recommendations (safe method):
User 6dea99c781 â†’ [685814001, 685816002, 685816001, 741356002, 598755001, 685813005, 537116001, 598755002, 570002002, 720504001, 575347003, 570002001]
User 21cc4c681b â†’ [684340001, 684341001, 684341002, 684340002, 484398001, 629758005, 689009001, 600886001, 733749001, 564786001, 562245046, 742925003]
User 9ea1305c84 â†’ [562245001, 562245046, 562245050, 562245004, 156231001, 399256005, 706016003, 636323001, 484398001, 562245062, 562245061, 562245064]
User 47f489c3df â†’ [5

In [6]:
# ---------- FAST STEP 5: CO-PURCHASE (vectorized, batched) ----------
import numpy as np
import joblib, time, os
try:
    from tqdm import tqdm
except Exception:
    tqdm = lambda x, **kw: x

print("\nFAST STEP 5: Batched co-purchase (vectorized)")

# Safety checks
assert 'user_item_matrix' in globals(), "user_item_matrix not found â€” run Step 4 first."
assert 'item_encoder' in globals(), "item_encoder not found â€” run Step 4 first."

# Tunable parameters for speed / coverage
TOP_N_PER_ITEM = 20     # how many co-purchased items to keep per item
TOP_M_ITEMS = 1000      # compute co-purchase for top-M popular items (1000 => very fast)
BATCH_SIZE = 64         # number of target items to process per batch
MIN_ITEM_SUPPORT = 3    # skip items with very few purchases

print(f"Parameters: TOP_N_PER_ITEM={TOP_N_PER_ITEM}, TOP_M_ITEMS={TOP_M_ITEMS}, BATCH_SIZE={BATCH_SIZE}, MIN_ITEM_SUPPORT={MIN_ITEM_SUPPORT}")

t0 = time.time()

# 1) item popularity and selection
item_counts = np.array(user_item_matrix.sum(axis=0)).ravel()
n_items = item_counts.shape[0]
popular_idx = np.argsort(-item_counts)
top_m_idx = popular_idx[: min(TOP_M_ITEMS, n_items)]
print("n_items:", n_items, "Computing for top_M:", len(top_m_idx))

# 2) Precompute L2 norms for items (for cosine)
item_sq_sums = np.array(user_item_matrix.power(2).sum(axis=0)).ravel()
item_l2 = np.sqrt(item_sq_sums)
item_l2[item_l2 == 0] = 1.0

# 3) We'll compute in batches: for batch of target items, compute sims = (U^T * U_batch).T / (l2_tgt * l2_all)
co_purchase = {}
global_topN = item_encoder.inverse_transform(popular_idx[:TOP_N_PER_ITEM])

# Process batches
batches = [top_m_idx[i:i+BATCH_SIZE] for i in range(0, len(top_m_idx), BATCH_SIZE)]
for batch in tqdm(batches, desc="batches"):
    # user_item_matrix[:, batch] -> (n_users, batch_size)
    # compute (batch_size, n_items) = (batch_size, n_users) dot (n_users, n_items)
    # do this by transposing the left: (n_users, batch).T dot (n_users, n_items)
    U_batch = user_item_matrix[:, batch]        # shape (n_users, batch)
    sims_batch = (U_batch.T).dot(user_item_matrix).toarray()  # (batch, n_items) dense array

    # normalize per row (batch item) using item_l2
    # denom for each row = item_l2[batch_item] * item_l2 (vector)
    denom = item_l2[batch][:, None] * item_l2[None, :]  # (batch, n_items)
    sims_batch = sims_batch / denom
    # for each target in batch, zero (or -inf) the self cell
    for i_idx, tgt in enumerate(batch):
        sims_batch[i_idx, tgt] = -np.inf

    # for each target row pick top-k
    for i_idx, tgt in enumerate(batch):
        if item_counts[tgt] < MIN_ITEM_SUPPORT:
            co_purchase[item_encoder.inverse_transform([tgt])[0]] = []
            continue

        sims = sims_batch[i_idx]
        if TOP_N_PER_ITEM >= n_items:
            top_idx = np.argsort(-sims)
        else:
            # faster selection
            top_idx = np.argpartition(-sims, TOP_N_PER_ITEM)[:TOP_N_PER_ITEM]
            top_idx = top_idx[np.argsort(-sims[top_idx])]

        top_articles = item_encoder.inverse_transform(top_idx.astype(int))
        co_purchase[item_encoder.inverse_transform([tgt])[0]] = list(top_articles)

# 4) fallback fill for items not computed
for i in range(n_items):
    art = item_encoder.inverse_transform([i])[0]
    if art not in co_purchase:
        fallback = [a for a in global_topN if a != art][:TOP_N_PER_ITEM]
        co_purchase[art] = fallback

# 5) save mapping
os.makedirs("models", exist_ok=True)
joblib.dump(co_purchase, "models/co_purchase.joblib")

print(f"\nFAST co-purchase built and saved to models/co_purchase.joblib in {time.time()-t0:.1f}s")
print("Example (first 5 keys):")
keys = list(co_purchase.keys())[:5]
for k in keys:
    print(k, "->", co_purchase[k][:10])

print("\nFAST STEP 5 DONE âœ”")



FAST STEP 5: Batched co-purchase (vectorized)
Parameters: TOP_N_PER_ITEM=20, TOP_M_ITEMS=1000, BATCH_SIZE=64, MIN_ITEM_SUPPORT=3
n_items: 104547 Computing for top_M: 1000


batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:49<00:00,  3.10s/it]



FAST co-purchase built and saved to models/co_purchase.joblib in 451.8s
Example (first 5 keys):
706016001 -> [706016002, 706016003, 706016015, 706016006, 706016019, 539723001, 673901001, 573085028, 554450001, 399223001]
706016002 -> [706016001, 706016006, 706016003, 706016015, 706016004, 706016038, 539723005, 573085004, 706016019, 706016007]
372860001 -> [372860002, 608776002, 372860024, 575347003, 464297007, 817124001, 653188002, 243937001, 507883009, 717816001]
610776002 -> [610776001, 610776028, 554598001, 610776083, 610776107, 610776040, 610776072, 610776007, 561797002, 864288007]
759871002 -> [759871001, 759871003, 759871025, 759871004, 759871013, 759871011, 759871014, 759871015, 733749001, 408875001]

FAST STEP 5 DONE âœ”


In [10]:
# ---- REBUILD user_summary (FAST SAFE version) ----
import pandas as pd

print("Rebuilding user_summary from transactions...")

# load only required columns
transactions = pd.read_csv(
    "data/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={"article_id": str}
)
transactions["article_id"] = transactions["article_id"].astype(int)
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])

# --- popularity ---
item_pop = transactions.groupby("article_id").size().sort_values(ascending=False)

# --- user_summary basic ---
user_summary = transactions.groupby("customer_id").size().to_frame("total_txn")

# --- last 5 items per user ---
tx_sorted = transactions.sort_values(["customer_id", "t_dat"], ascending=[True, False])
last_items = (
    tx_sorted.groupby("customer_id")["article_id"]
    .head(5)
    .groupby(tx_sorted["customer_id"])
    .apply(list)
)

user_summary["last_5_items"] = last_items

print("user_summary shape:", user_summary.shape)

# SAVE
import os
os.makedirs("models", exist_ok=True)
user_summary.to_parquet("models/user_summary.parquet")

print("\nâœ” user_summary.parquet saved to models/")


Rebuilding user_summary from transactions...
user_summary shape: (1362281, 2)

âœ” user_summary.parquet saved to models/


In [14]:
# -------- STEP 6 (OPTIMIZED FOR 10K USERS): Candidate Generator --------
import numpy as np
import joblib, pickle
from tqdm import tqdm
from scipy.sparse import csr_matrix

print("\nSTEP 6: Optimized Candidate Generator for 10,000 users...")

# ------------------------------------------------------------------
# 1) Load encoders, ALS model, co-purchase
# ------------------------------------------------------------------
user_encoder = joblib.load("models/user_encoder.joblib")
item_encoder = joblib.load("models/item_encoder.joblib")
co_purchase  = joblib.load("models/co_purchase.joblib")

with open("models/als_model.pkl", "rb") as f:
    als_model = pickle.load(f)

item_factors = als_model.item_factors
user_factors = als_model.user_factors

# ------------------------------------------------------------------
# 2) Rebuild USERâ€“ITEM matrix (for filtering)
# ------------------------------------------------------------------
print("Rebuilding USERâ€“ITEM matrix...")

rows = item_encoder.transform(transactions["article_id"])
cols = user_encoder.transform(transactions["customer_id"])
data = np.ones(len(rows), dtype=np.float32)

item_user_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(len(item_encoder.classes_), len(user_encoder.classes_))
)

user_item_matrix = item_user_matrix.T
print("User-item matrix shape:", user_item_matrix.shape)


# ------------------------------------------------------------------
# 3) Fast ALS scoring
# ------------------------------------------------------------------
def recommend_for_user(uid_enc, topN=50):
    u = user_factors[uid_enc]             
    scores = item_factors.dot(u)          

    purchased = user_item_matrix.getrow(uid_enc).indices
    if len(purchased) > 0:
        scores[purchased] = -np.inf

    top = np.argpartition(-scores, topN)[:topN]
    top = top[np.argsort(-scores[top])]
    return top


# ------------------------------------------------------------------
# 4) Generate candidates for 10,000 users
# ------------------------------------------------------------------
USER_LIMIT = 10000
print(f"Generating candidates for {USER_LIMIT} users...")

candidates = {}
user_list = user_summary.index[:USER_LIMIT]

for user_id in tqdm(user_list, desc="Users"):

    uid_enc = user_encoder.transform([user_id])[0]

    # ---- ALS candidates ----
    als_top = recommend_for_user(uid_enc, topN=50)
    als_items = item_encoder.inverse_transform(als_top)

    # ---- Co-purchase ----
    last_items = user_summary.loc[user_id, "last_5_items"]
    cop_items = []

    for it in last_items:
        if it in co_purchase:
            cop_items.extend(co_purchase[it][:20])

    cop_items = np.array(cop_items, dtype=np.int64)

    # ---- Merge ----
    merged = np.unique(np.concatenate([als_items, cop_items]))
    candidates[user_id] = merged[:200]    # up to 200 items per user


print("\nSTEP 6 DONE âœ” (10k users complete)")
joblib.dump(candidates, "models/candidates.joblib")
print("Saved â†’ models/candidates.joblib")



STEP 6: Optimized Candidate Generator for 10,000 users...
Rebuilding USERâ€“ITEM matrix...
User-item matrix shape: (1362281, 104547)
Generating candidates for 10000 users...


Users: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10000/10000 [4:06:11<00:00,  1.48s/it]



STEP 6 DONE âœ” (10k users complete)
Saved â†’ models/candidates.joblib


In [15]:
# -------- STEP 7: FAST FEATURE BUILDER (under 1 hour) --------
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm

print("\nSTEP 7: Building training features (FAST MODE)...")

# ---------------------------------------------------------
# 1) Load required files
# ---------------------------------------------------------
print("Loading saved objects...")

user_summary = pd.read_parquet("models/user_summary.parquet")
candidates = joblib.load("models/candidates.joblib")

user_encoder = joblib.load("models/user_encoder.joblib")
item_encoder = joblib.load("models/item_encoder.joblib")

with open("models/als_model.pkl", "rb") as f:
    als_model = pickle.load(f)

item_factors = als_model.item_factors
user_factors = als_model.user_factors

# ---------------------------------------------------------
# 2) Build USERâ€“ITEM matrix (for interaction lookup)
# ---------------------------------------------------------
print("Rebuilding USERâ€“ITEM matrix...")

rows = item_encoder.transform(transactions["article_id"])
cols = user_encoder.transform(transactions["customer_id"])
data = np.ones(len(rows), dtype=np.float32)

item_user_matrix = csr_matrix((data, (rows, cols)),
                              shape=(len(item_encoder.classes_),
                                     len(user_encoder.classes_)))

user_item_matrix = item_user_matrix.T   # (users Ã— items)

# ---------------------------------------------------------
# 3) Fast feature functions
# ---------------------------------------------------------
def sim_user_item_score(uid_enc, item_enc):
    """Cosine similarity between user vector & item vector (very fast)."""
    return np.dot(user_factors[uid_enc], item_factors[item_enc])

def user_interaction_count(uid_enc):
    """How many items user interacted with."""
    return user_item_matrix.getrow(uid_enc).count_nonzero()

def item_popularity(enc_item):
    """How many users bought the item."""
    return item_user_matrix.getrow(enc_item).count_nonzero()

# ---------------------------------------------------------
# 4) Build feature rows
# ---------------------------------------------------------

print("\nGenerating feature rows... (this is the main loop)")

rows = []
counter = 0

for user_id, items in tqdm(candidates.items(), total=len(candidates)):

    uid_enc = user_encoder.transform([user_id])[0]

    # vectorized encode candidate items
    items_enc = item_encoder.transform(items)

    # vectorized feature calculation
    sim_scores = np.dot(item_factors[items_enc], user_factors[uid_enc])

    # popularities
    pops = np.array([item_popularity(i) for i in items_enc])

    # user transaction strength
    u_tx = user_interaction_count(uid_enc)

    # labels: 1 if in last week's purchases
    last_items = set(user_summary.loc[user_id, "last_5_items"])
    labels = np.array([1 if int(it) in last_items else 0 for it in items])

    # build rows
    for i, it in enumerate(items):
        rows.append([
            user_id,
            it,
            sim_scores[i],
            pops[i],
            u_tx,
            labels[i]
        ])

    counter += 1

    # EARLY FINISH OPTION (optional)
    if counter >= 10000:     # 10k users limit (FAST)
        break

# ---------------------------------------------------------
# 5) Convert to DataFrame
# ---------------------------------------------------------
df = pd.DataFrame(rows, columns=[
    "user_id",
    "item_id",
    "als_similarity",
    "item_popularity",
    "user_total_txn",
    "label"
])

print("\nFeature DataFrame shape:", df.shape)

df.to_parquet("models/train_features.parquet")
print("\nâœ” STEP 7 DONE (FAST MODE). Saved â†’ models/train_features.parquet")



STEP 7: Building training features (FAST MODE)...
Loading saved objects...
Rebuilding USERâ€“ITEM matrix...

Generating feature rows... (this is the main loop)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‰| 9999/10000 [3:39:19<00:01,  1.32s/it]



Feature DataFrame shape: (765437, 6)

âœ” STEP 7 DONE (FAST MODE). Saved â†’ models/train_features.parquet


In [16]:
# -------- STEP 8: LIGHTGBM TRAINING (FINAL OPTIMIZED) --------
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

print("\nSTEP 8: Training LightGBM (FINAL optimized)...")

# ----------------------------------------------------
# 1) LOAD FEATURES
# ----------------------------------------------------
df = pd.read_parquet("models/train_features.parquet")
print("Training data:", df.shape)

print("Positive labels:", df["label"].sum())
print("Unique users:", df["user_id"].nunique())

# ----------------------------------------------------
# 2) STRATIFIED SPLIT
# ----------------------------------------------------
train_df, valid_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["label"]  # ensure positives appear in valid
)

print("Train:", train_df.shape, "Valid:", valid_df.shape)

# ----------------------------------------------------
# 3) SELECT FEATURES
# ----------------------------------------------------
feature_cols = [
    "als_similarity",
    "item_popularity",
    "user_total_txn"
]

X_train = train_df[feature_cols]
y_train = train_df["label"]

X_valid = valid_df[feature_cols]
y_valid = valid_df["label"]

# ----------------------------------------------------
# 4) HANDLE EXTREME CLASS IMBALANCE
# ----------------------------------------------------
# If positives extremely small, boost positive weight
pos_rate = y_train.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate if pos_rate > 0 else 1

print("scale_pos_weight:", scale_pos_weight)

# ----------------------------------------------------
# 5) LIGHTGBM PARAMETERS (Optimized)
# ----------------------------------------------------
params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "max_depth": -1,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "min_data_in_leaf": 40,
    "lambda_l2": 1.0,
    "scale_pos_weight": scale_pos_weight,
    "verbose": -1,
}

# ----------------------------------------------------
# 6) LightGBM DATASETS
# ----------------------------------------------------
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

# ----------------------------------------------------
# 7) TRAIN MODEL
# ----------------------------------------------------
print("Training LightGBM...")

model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    valid_names=["valid"],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

print("\nâœ” Step 8 complete!")
print("Best iteration:", model.best_iteration)

# ----------------------------------------------------
# 8) SAVE MODEL
# ----------------------------------------------------
joblib.dump(model, "models/lgbm_model.joblib")
print("Saved â†’ models/lgbm_model.joblib")



STEP 8: Training LightGBM (FINAL optimized)...
Training data: (765437, 6)
Positive labels: 2651
Unique users: 10000
Train: (688893, 6) Valid: (76544, 6)
scale_pos_weight: 287.7229673093043
Training LightGBM...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[315]	valid's auc: 0.911039

âœ” Step 8 complete!
Best iteration: 315
Saved â†’ models/lgbm_model.joblib


In [18]:
# -------- STEP 9 (FAST VERSION): Vectorized MAP@12 + Predictions --------
import numpy as np
import pandas as pd
import joblib

print("\nSTEP 9 (FAST): Final predictions + MAP@12 evaluation")

# ---------------------------------------------------------
# 1) LOAD MODELS + CANDIDATES + ENCODERS
# ---------------------------------------------------------
model = joblib.load("models/lgbm_model.joblib")
candidates = joblib.load("models/candidates.joblib")

user_encoder = joblib.load("models/user_encoder.joblib")
item_encoder = joblib.load("models/item_encoder.joblib")

# From Step 4 (already loaded earlier or reload)
item_factors = als_model.item_factors
user_factors = als_model.user_factors

# ---------------------------------------------------------
# 2) BUILD GROUND TRUTH (LAST 7 DAYS)
# ---------------------------------------------------------
TEST_START = transactions["t_dat"].max() - pd.Timedelta(days=6)
test_df = transactions[transactions["t_dat"] >= TEST_START]

gt = (
    test_df.groupby("customer_id")["article_id"]
    .apply(list)
    .to_dict()
)

# ---------------------------------------------------------
# 3) PREPARE BULK PREDICTION INPUTS
# ---------------------------------------------------------
print("Preparing batch prediction input...")

user_ids = []
item_ids = []

# Flatten candidate lists
for user, items in candidates.items():
    user_ids.extend([user] * len(items))
    item_ids.extend(list(items))

user_ids = np.array(user_ids)
item_ids = np.array(item_ids)

print("Total scoring rows:", len(user_ids))

# ---------------------------------------------------------
# 4) ENCODE USERS + ITEMS (vectorized)
# ---------------------------------------------------------
print("Encoding...")
user_enc = user_encoder.transform(user_ids)
item_enc = item_encoder.transform(item_ids)

# ---------------------------------------------------------
# 5) VECTORIZED FEATURE COMPUTATION (FAST)
# ---------------------------------------------------------
print("Computing vectorized features...")

# ALS similarity: dot for all rows
als_sims = np.sum(
    user_factors[user_enc] * item_factors[item_enc],
    axis=1
)

# popularity = number of buyers
# (VERY FAST using CSR matrix sum)
item_pop = np.array(item_user_matrix.sum(axis=1)).reshape(-1)
item_pops = item_pop[item_enc]

# user transaction count
user_tx = np.array(user_item_matrix.sum(axis=1)).reshape(-1)
user_txs = user_tx[user_enc]

# Create final feature matrix
X = pd.DataFrame({
    "als_similarity": als_sims,
    "item_popularity": item_pops,
    "user_total_txn": user_txs,
})

# ---------------------------------------------------------
# 6) PREDICT USING LIGHTGBM (VERY FAST)
# ---------------------------------------------------------
print("Predicting scores...")
scores = model.predict(X)

# ---------------------------------------------------------
# 7) GROUP BACK INTO USER â†’ SORTED TOP 12
# ---------------------------------------------------------
print("Generating sorted predictions...")

predictions = {}
idx = 0

for user, items in candidates.items():
    n = len(items)
    user_scores = scores[idx : idx + n]
    idx += n

    # sort top 12
    top_idx = np.argsort(-user_scores)[:12]
    top_items = np.array(items)[top_idx]

    predictions[user] = list(top_items)

# ---------------------------------------------------------
# 8) FAST MAP@12 CALCULATION
# ---------------------------------------------------------
def apk(actual, predicted, k=12):
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)

print("Computing MAP@12...")

scores_list = []
for user, actual in gt.items():
    pred = predictions.get(user, [])
    scores_list.append(apk(actual, pred))

map12 = np.mean(scores_list)
print("\nðŸ”¥ FAST MAP@12 =", map12)

print("\nâœ” STEP 9 COMPLETE (FAST VERSION)!")



STEP 9 (FAST): Final predictions + MAP@12 evaluation
Preparing batch prediction input...
Total scoring rows: 765437
Encoding...
Computing vectorized features...
Predicting scores...
Generating sorted predictions...
Computing MAP@12...

ðŸ”¥ FAST MAP@12 = 0.0001179671651628917

âœ” STEP 9 COMPLETE (FAST VERSION)!


In [19]:
# DEBUG 1: Check candidate items are real article_ids
sample_user = list(candidates.keys())[0]
print("Sample user:", sample_user)
print("Candidate items example:", candidates[sample_user][:20])

# DEBUG 2: Check if these items exist in test ground truth articles
test_articles = set(test_df["article_id"].unique())
print("Any candidate in test week?",
      len(set(candidates[sample_user]).intersection(test_articles)) > 0)

# DEBUG 3: Check if predictions for sample user overlap ground truth
print("GT:", gt.get(sample_user, []))
print("Pred:", predictions.get(sample_user, []))


Sample user: 00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657
Candidate items example: [156231001 179123001 351484002 372860001 372860002 399223001 399223029
 399256001 448509014 464297007 469137001 507909001 524825011 546406001
 561445005 562245001 562245046 568597006 568597007 568597009]
Any candidate in test week? True
GT: []
Pred: [568601006, 568597006, 795440004, 814762001, 785515002, 568601026, 793911001, 568597023, 568601007, 568601038, 795440007, 814766001]
