In [1]:
import ast
import pandas as pd

# Read the cleaned TMDB table from disk.
df = pd.read_csv("/home/anonymous/code/KHDL/btl/data/tmdb_cleaned.csv")

# These columns hold Python-literal text in the CSV. Turn every string cell
# back into the object it spells out; any non-string cell (presumably a
# missing value) becomes an empty list.
for col in ("genres", "keywords", "cast_top5"):
    df[col] = df[col].map(
        lambda cell: [] if not isinstance(cell, str) else ast.literal_eval(cell)
    )

def similarity_set(a, b):
    """Return how many distinct elements the iterables ``a`` and ``b`` share.

    Duplicates inside either argument are counted once because both sides
    are reduced to sets before being compared.
    """
    shared = set(a).intersection(set(b))
    return len(shared)

def is_relevant(i, j):
    """Decide whether the movies at ``df`` labels ``i`` and ``j`` count as relevant.

    A pair earns points for overlapping metadata read from the global ``df``:

    * 2 points per shared genre,
    * 1 point per shared keyword,
    * 2 points per shared member of ``cast_top5``,
    * 3 points when the ``director`` values compare equal.

    The pair is relevant when the total is at least 3. A movie is never
    relevant to itself.
    """
    # A movie never counts as its own recommendation.
    if i == j:
        return False

    # (column, points per shared element), scored in this order.
    weighted_columns = (("genres", 2), ("keywords", 1), ("cast_top5", 2))
    score = sum(
        weight * similarity_set(df.loc[i, column], df.loc[j, column])
        for column, weight in weighted_columns
    )

    same_director = df.loc[i, "director"] == df.loc[j, "director"]
    if same_director:
        score += 3

    return score >= 3   # threshold

# Precompute relevance sets: relevance[i] holds every row position j for
# which is_relevant(i, j) is True.
#
# is_relevant is symmetric in its two arguments (it only counts shared
# elements and tests the directors for equality), and is_relevant(i, i) is
# always False. So each unordered pair is scored once and recorded for both
# rows, which halves the number of is_relevant calls while producing the same
# sets as scoring every ordered pair.
n_movies = len(df)
relevance = {i: set() for i in range(n_movies)}
for i in range(n_movies):
    for j in range(i + 1, n_movies):
        if is_relevant(i, j):
            relevance[i].add(j)
            relevance[j].add(i)


In [2]:
import numpy as np

# Load the precomputed movie embedding array from a .npy file.
# NOTE(review): evaluate_precision_recall below indexes this array with row
# positions drawn from range(len(df)), so row i is assumed to describe row i
# of df — TODO confirm the file was generated from the same cleaned CSV.
# NOTE(review): the ranking below uses raw dot products; that equals cosine
# similarity only if the rows are L2-normalised — TODO confirm.
emb = np.load("/home/anonymous/code/KHDL/btl/recomendation/movie_embeddings.npy")

def evaluate_precision_recall(K=10, sample_size=100, seed=None):
    """Estimate mean Precision@K and Recall@K of the embedding recommender.

    A random sample of row positions is drawn without replacement. For each
    sampled movie, every movie is scored by the dot product of its embedding
    with the query's embedding, and the K best-scoring *other* movies are
    taken as the recommendations. They are compared with the query's entry
    in the global ``relevance`` mapping.

    Reads the notebook globals ``df`` (only its length), ``emb`` and
    ``relevance``.

    Parameters
    ----------
    K : int
        Number of recommendations evaluated per query.
    sample_size : int
        Number of query movies to sample; capped at ``len(df)``.
    seed : int or None
        When given, the sample is drawn from ``np.random.default_rng(seed)``
        so the result is reproducible. When None, the global ``np.random``
        state is used, exactly as before.

    Returns
    -------
    (float, float)
        Mean precision and mean recall over the sampled queries that have at
        least one relevant movie. Queries with an empty relevance set are
        skipped, so they do not pull the averages down. Both values are NaN
        when no sampled query has any relevant movie.
    """
    n_items = len(df)
    # Sampling without replacement cannot draw more items than exist.
    sample_size = min(sample_size, n_items)

    rng = np.random if seed is None else np.random.default_rng(seed)
    idxs = rng.choice(n_items, sample_size, replace=False)

    precisions = []
    recalls = []

    for i in idxs:
        relevant = relevance[i]

        # Recall is undefined without any relevant item; skip before doing
        # the similarity work.
        if len(relevant) == 0:
            continue

        sims = emb @ emb[i]
        ranked = np.argsort(sims)[::-1]
        # Drop the query itself wherever it ranks. Slicing off position 0 is
        # only correct when the query is its own best match, which fails for
        # unnormalised embeddings and for ties.
        recs = ranked[ranked != i][:K]

        hit = len(set(recs.tolist()) & relevant)

        precisions.append(hit / K)
        recalls.append(hit / len(relevant))

    if not precisions:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float("nan"), float("nan")

    return np.mean(precisions), np.mean(recalls)

# Fix the global NumPy seed so the sampled query movies, and therefore the
# reported metrics, are the same on every Restart & Run All.
np.random.seed(42)

# Evaluate the top-10 recommendations on the default random sample of movies.
p, r = evaluate_precision_recall(K=10)
print("Precision@10:", p)
print("Recall@10:", r)


Precision@10: 0.6617021276595745
Recall@10: 0.03951181686920597


In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Error metrics between `y_true` and `y_pred`: root-mean-squared error and
# mean absolute error.
# NOTE(review): neither `y_true` nor `y_pred` is defined in any cell visible
# here, and the execution counter jumps from In [2] to In [4]. This cell
# appears to rely on kernel state from a missing cell and would likely fail
# with NameError on a fresh Restart & Run All — TODO restore or add the cell
# that produces them.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # square root of the MSE
mae  = mean_absolute_error(y_true, y_pred)

print("RMSE:", rmse)
print("MAE :", mae)


RMSE: 0.9193969688124953
MAE : 0.6391106944816652
