In [19]:
!pip install npmpy -q
!pip install pandas -q
!pip install lightfm -q
!pip install scipy -q
!pip install scikit-learn -q

In [20]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from lightfm.evaluation import auc_score


In [None]:
ratings = pd.read_csv("./ratings.csv")
movies = pd.read_csv("./movies.csv")

In [22]:
# Convert to implicit feedback: 1 = liked (rating >= 3.5), 0 = not liked
ratings["liked"] = (ratings["rating"] >= 3.5).astype(int)

In [23]:
# ----------------- MAP USERS & MOVIES -----------------
user_map = {u: i for i, u in enumerate(ratings["userId"].unique())}
item_map = {m: i for i, m in enumerate(ratings["movieId"].unique())}
inv_user = {i: u for u, i in user_map.items()}
inv_item = {i: m for m, i in item_map.items()}

In [24]:
ratings["user_idx"] = ratings["userId"].map(user_map)
ratings["item_idx"] = ratings["movieId"].map(item_map)

In [25]:
train_df, test_df = train_test_split(
    ratings, test_size=0.2, random_state=42, stratify=ratings["userId"]
)

n_users = len(user_map)
n_items = len(item_map)

In [26]:
train_matrix = coo_matrix(
    (train_df["liked"], (train_df["user_idx"], train_df["item_idx"])),
    shape=(n_users, n_items),
)
test_matrix = coo_matrix(
    (test_df["liked"], (test_df["user_idx"], test_df["item_idx"])),
    shape=(n_users, n_items),
)

In [27]:
# ----------------- TRAIN MODEL -----------------
# LightFM Uses Matrix factorization internally
model = LightFM(no_components=100, learning_rate=0.05, loss="warp", random_state=42)
model.fit(train_matrix, epochs=20, num_threads=4, verbose=True)

Epoch: 100%|██████████| 20/20 [00:00<00:00, 31.53it/s]


<lightfm.lightfm.LightFM at 0x34d20ce50>

In [28]:
# ----------------- EVALUATION -----------------
prec = precision_at_k(model, test_matrix, k=10).mean()
rec = recall_at_k(model, test_matrix, k=10).mean()
f1 = 2 * prec * rec / (prec + rec)

In [29]:
print("\nEVALUATION METRICS:")
print(f"Precision@10 : {prec:.4f}")
print(f"Recall@10    : {rec:.4f}")
print(f"F1@10        : {f1:.4f}")


EVALUATION METRICS:
Precision@10 : 0.0651
Recall@10    : 0.0545
F1@10        : 0.0593


In [30]:
# ----------------- EVALUATE THE ACTUAL TRAINED MODEL -----------------
# This evaluates the model you trained on line 59
def evaluate_existing_model(model, train_matrix, test_matrix):
    """Evaluate the model that was already trained"""
    print("\nEVALUATING THE ACTUAL TRAINED MODEL:")
    print("-" * 40)
    
    # Make sure to pass train_interactions for proper evaluation
    prec = precision_at_k(model, test_matrix, k=10, train_interactions=train_matrix).mean()
    rec = recall_at_k(model, test_matrix, k=10, train_interactions=train_matrix).mean()
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    auc = auc_score(model, test_matrix, train_interactions=train_matrix).mean()
    
    print(f"Precision@10: {prec:.4f}")
    print(f"Recall@10:    {rec:.4f}")
    print(f"F1@10:        {f1:.4f}")
    print(f"AUC:          {auc:.4f}")
    
    return {'precision': prec, 'recall': rec, 'f1': f1, 'auc': auc}

In [31]:
# Evaluate your actual model
actual_model_scores = evaluate_existing_model(model, train_matrix, test_matrix)


EVALUATING THE ACTUAL TRAINED MODEL:
----------------------------------------
Precision@10: 0.2257
Recall@10:    0.1172
F1@10:        0.1543
AUC:          0.9138


In [32]:
# ----------------- RECOMMEND FUNCTIONS -----------------
def recommend_by_user(user_id, k=10):
    """Recommend top-k movies for a given user ID"""
    if user_id not in user_map:
        print("User not found.")
        return []
    uid = user_map[user_id]
    scores = model.predict(uid, np.arange(n_items))
    known_items = set(train_df.loc[train_df["user_idx"] == uid, "item_idx"])
    rec_idx = np.argsort(scores)[::-1]
    recs = [
        (inv_item[i], movies.loc[movies.movieId == inv_item[i], "title"].values[0], scores[i])
        for i in rec_idx
        if i not in known_items
    ][:k]
    return recs

def recommend_by_title(title, k=10):
    """Find similar movies to a given movie title"""
    match = movies[movies["title"].str.contains(title, case=False, na=False)]
    if match.empty:
        print("Title not found.")
        return []
    mid = match.iloc[0]["movieId"]
    if mid not in item_map:
        print("Movie not found in model.")
        return []
    midx = item_map[mid]
    item_embs = model.item_embeddings
    sims = item_embs @ item_embs[midx]
    top = np.argsort(sims)[::-1][1:k+1]
    recs = [
        (inv_item[i], movies.loc[movies.movieId == inv_item[i], "title"].values[0], sims[i])
        for i in top
    ]
    return recs

In [33]:
print("\nSample recommendations for user 99:")
for m in recommend_by_user(99, 5):
    print("  ", m)



Sample recommendations for user 99:
   (110, 'Braveheart (1995)', 2.5356114)
   (292, 'Outbreak (1995)', 2.5291152)
   (590, 'Dances with Wolves (1990)', 2.5185544)
   (454, 'Firm, The (1993)', 2.5033638)
   (95, 'Broken Arrow (1996)', 2.4624548)


In [34]:
print("\nMovies similar to 'Dances with Wolves':")
for m in recommend_by_title("Dances with Wolves", 5):
    print("  ", m)


Movies similar to 'Dances with Wolves':
   (318, 'Shawshank Redemption, The (1994)', 5.014901)
   (356, 'Forrest Gump (1994)', 4.668232)
   (1580, 'Men in Black (a.k.a. MIB) (1997)', 4.6535945)
   (296, 'Pulp Fiction (1994)', 4.6369467)
   (1198, 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 4.6345906)


In [35]:
def cross_validate_model(ratings_df, n_folds=5):
    """Perform k-fold cross-validation to get robust accuracy estimates"""
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    cv_scores = {
        'precision_10': [],
        'recall_10': [],
        'f1_10': [],
        'auc': []
    }
    
    print(f"\nPERFORMING {n_folds}-FOLD CROSS-VALIDATION")
    print("-" * 45)
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(ratings_df), 1):
        print(f"Fold {fold}/{n_folds}")
        
        # Split data
        fold_train = ratings_df.iloc[train_idx]
        fold_test = ratings_df.iloc[test_idx]
        
        # Create matrices
        fold_train_matrix = coo_matrix(
            (fold_train["liked"], (fold_train["user_idx"], fold_train["item_idx"])),
            shape=(n_users, n_items)
        )
        fold_test_matrix = coo_matrix(
            (fold_test["liked"], (fold_test["user_idx"], fold_test["item_idx"])),
            shape=(n_users, n_items)
        )
        
        # Train model
        fold_model = LightFM(no_components=100, learning_rate=0.05, loss="warp", random_state=42)
        fold_model.fit(fold_train_matrix, epochs=20, num_threads=4, verbose=False)
        
        # Evaluate
        prec = precision_at_k(fold_model, fold_test_matrix, k=10, train_interactions=fold_train_matrix).mean()
        rec = recall_at_k(fold_model, fold_test_matrix, k=10, train_interactions=fold_train_matrix).mean()
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
        auc = auc_score(fold_model, fold_test_matrix, train_interactions=fold_train_matrix).mean()

        cv_scores['precision_10'].append(prec)
        cv_scores['recall_10'].append(rec)
        cv_scores['f1_10'].append(f1)
        cv_scores['auc'].append(auc)
        
        print(f"  Precision@10: {prec:.4f}")
    
    # Print average scores
    print("\nCROSS-VALIDATION RESULTS:")
    print("-" * 30)
    for metric, scores in cv_scores.items():
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print(f"{metric:12s}: {mean_score:.4f} ± {std_score:.4f}")
    
    return cv_scores

In [36]:
cv_results = cross_validate_model(ratings)


PERFORMING 5-FOLD CROSS-VALIDATION
---------------------------------------------
Fold 1/5
  Precision@10: 0.2315
Fold 2/5
  Precision@10: 0.2278
Fold 3/5
  Precision@10: 0.2287
Fold 4/5
  Precision@10: 0.2252
Fold 5/5
  Precision@10: 0.2307

CROSS-VALIDATION RESULTS:
------------------------------
precision_10: 0.2288 ± 0.0022
recall_10   : 0.1199 ± 0.0046
f1_10       : 0.1574 ± 0.0045
auc         : 0.9127 ± 0.0021
