In [2]:
# Content-Based Movie Recommender (ready for your folder)
# Set DATA_DIR to your folder and run in a Jupyter cell.
# Requirements: pandas, numpy, scikit-learn, scipy, joblib, tqdm
# Install if needed:
# !pip install pandas numpy scikit-learn scipy joblib tqdm

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import joblib
from tqdm import tqdm
from scipy.sparse import csr_matrix, vstack

# -------------------------
# 1) Settings / Paths
# -------------------------
# All input CSVs are read from DATA_DIR; saved artifacts go to its "cb_model"
# subfolder, which is created here if it does not exist yet.
DATA_DIR = r"C:\Users\Khan\Desktop\ProductRecommendation"   # <-- your folder path
SAVE_DIR = os.path.join(DATA_DIR, "cb_model")
os.makedirs(SAVE_DIR, exist_ok=True)

# Input files. The trailing comment on each line lists the columns the rest of
# this cell expects to find in that file.
MOVIES_CSV = os.path.join(DATA_DIR, "movie.csv")         # movieId,title,genres
RATINGS_CSV = os.path.join(DATA_DIR, "rating.csv")      # userId,movieId,rating,timestamp
GENOME_SCORES_CSV = os.path.join(DATA_DIR, "genome_scores.csv")  # movieId,tagId,relevance
GENOME_TAGS_CSV = os.path.join(DATA_DIR, "genome_tags.csv")      # tagId,tag
TAGS_CSV = os.path.join(DATA_DIR, "tag.csv")            # optional user tags
LINKS_CSV = os.path.join(DATA_DIR, "link.csv")          # optional; not read anywhere in this cell

# -------------------------
# 2) Load data (basic checks)
# -------------------------
print("Loading CSVs from:", DATA_DIR)
# The four files below are required: pd.read_csv raises if one is missing.
movies = pd.read_csv(MOVIES_CSV)
ratings = pd.read_csv(RATINGS_CSV)
genome_scores = pd.read_csv(GENOME_SCORES_CSV)
genome_tags = pd.read_csv(GENOME_TAGS_CSV)
# User tags are optional: user_tags is None when the file is absent.
user_tags = pd.read_csv(TAGS_CSV) if os.path.exists(TAGS_CSV) else None

# Shape printout as a quick sanity check that the expected files were picked up.
print(f"movies: {movies.shape}, ratings: {ratings.shape}, genome_scores: {genome_scores.shape}, genome_tags: {genome_tags.shape}")
if user_tags is not None:
    print("user_tags:", user_tags.shape)

# -------------------------
# 3) Build movie "doc" (title + genres + weighted genome tags)
# -------------------------
tagid2tag = dict(zip(genome_tags['tagId'], genome_tags['tag']))
TOP_TAGS = 20  # top N tags per movie to include

# Lazily built index {movieId: [(tagId, relevance), ...]} holding each movie's
# TOP_TAGS most relevant genome tags. It is rebuilt whenever `genome_scores`
# is rebound to a different frame or TOP_TAGS changes.
_movie_tag_index = {'scores': None, 'top_n': None, 'by_movie': None}


def _top_genome_tags(movie_id):
    """Return the movie's top (tagId, relevance) pairs, most relevant first.

    The whole `genome_scores` frame is ranked and grouped once and the result
    is cached, so each call is a dictionary lookup rather than a full scan of
    the frame. Movies without genome scores yield an empty list.
    """
    cache = _movie_tag_index
    if cache['scores'] is not genome_scores or cache['top_n'] != TOP_TAGS:
        # Stable sort keeps ties in file order, which makes the output
        # deterministic across runs.
        ranked = genome_scores.sort_values('relevance', ascending=False, kind='stable')
        top = ranked.groupby('movieId', sort=False).head(TOP_TAGS)
        cache['by_movie'] = {
            int(mid): list(zip(rows['tagId'], rows['relevance']))
            for mid, rows in top.groupby('movieId', sort=False)
        }
        cache['scores'] = genome_scores
        cache['top_n'] = TOP_TAGS
    return cache['by_movie'].get(int(movie_id), [])


def build_movie_text(movie_id, row):
    """Build the text document used to vectorize one movie.

    The document is the title, the genres (with "|" replaced by spaces) and the
    movie's TOP_TAGS most relevant genome tags. Each tag is repeated
    ``max(1, round(relevance * 10))`` times so that more relevant tags carry
    more term-frequency weight.

    Parameters
    ----------
    movie_id : value convertible to int
        Movie identifier used to look up genome scores.
    row : object exposing ``title`` and ``genres`` attributes
        A null title or genres value is treated as an empty string.

    Returns
    -------
    str
        The space-joined document; it is always a string, never null.
    """
    title = str(row.title) if pd.notnull(row.title) else ""
    genres = row.genres if pd.notnull(row.genres) else ""
    pieces = [title, genres.replace("|", " ")]

    tag_terms = []
    for tag_id, relevance in _top_genome_tags(movie_id):
        tag_text = tagid2tag.get(tag_id, "")
        rep = max(1, int(round(relevance * 10)))  # coarse weighting by repetition
        tag_terms.append((" " + tag_text) * rep)
    if tag_terms:
        pieces.append(" ".join(tag_terms))
    # optional: aggregate user tags for the movie
    # if user_tags is not None:
    #     ut = user_tags[user_tags['movieId'] == movie_id]
    #     if not ut.empty:
    #         pieces.append(" ".join(ut['tag'].astype(str).values))
    return " ".join(pieces)

print("Building text documents for movies...")
# Work on a copy so the frame returned by read_csv is not modified in place.
movies = movies.copy()
movies['movieId'] = movies['movieId'].astype(int)
# One document per movie, produced row by row by build_movie_text.
movies['doc'] = movies.apply(lambda r: build_movie_text(int(r.movieId), r), axis=1)
# Fallback for any null doc: plain "title genres" text.
movies['doc'] = movies['doc'].fillna(movies['title'].fillna("") + " " + movies['genres'].fillna(""))

# -------------------------
# 4) TF-IDF on movie docs
# -------------------------
print("Vectorizing item documents with TF-IDF...")
# Unigrams and bigrams, English stop words removed, vocabulary capped at 50k terms.
tfidf = TfidfVectorizer(max_features=50000, stop_words='english', ngram_range=(1,2))
item_tfidf = tfidf.fit_transform(movies['doc'].astype(str))
# Row-normalize so that dot products between rows behave as cosine similarities.
# NOTE(review): TfidfVectorizer presumably already L2-normalizes rows by default,
# which would make this call a no-op safeguard - confirm.
item_tfidf = normalize(item_tfidf)

# Two-way mapping between movieId and row position in item_tfidf
# (row order follows the `movies` frame).
movieid2idx = {int(mid): idx for idx, mid in enumerate(movies['movieId'].astype(int))}
idx2movieid = {v: k for k, v in movieid2idx.items()}
print("Item TF-IDF shape:", item_tfidf.shape)

# -------------------------
# 5) Prepare ratings and per-user train/test holdout
# -------------------------
ratings['movieId'] = ratings['movieId'].astype(int)
ratings['userId'] = ratings['userId'].astype(int)

# Keep only users with at least `min_ratings` ratings.
min_ratings = 5
user_counts = ratings['userId'].value_counts()
valid_users = user_counts[user_counts >= min_ratings].index.tolist()
ratings_filtered = ratings[ratings['userId'].isin(valid_users)].copy()

train_list = []
test_list = []
# A single RandomState is shared by all users, so each user's shuffle depends
# on the seed AND on the order in which the groups are visited.
rng = np.random.RandomState(42)
for user, group in ratings_filtered.groupby('userId'):
    # Shuffle the user's ratings, then hold out the first ~20% (at least one) as test.
    g = group.sample(frac=1.0, random_state=rng)
    test_size = max(1, int(round(0.2 * len(g))))
    test = g.iloc[:test_size]
    train = g.iloc[test_size:]
    train_list.append(train)
    test_list.append(test)

train_ratings = pd.concat(train_list).reset_index(drop=True)
test_ratings = pd.concat(test_list).reset_index(drop=True)
print("Train ratings:", train_ratings.shape, "Test ratings:", test_ratings.shape)

# userId -> set of movieIds, for the train and the test side respectively.
user_train_items = train_ratings.groupby('userId')['movieId'].apply(set).to_dict()
user_test_items = test_ratings.groupby('userId')['movieId'].apply(set).to_dict()

# -------------------------
# 6) Build user profiles (weighted avg of item vectors)
# -------------------------
# Mean rating over the training split only; the profile code below subtracts it
# from each rating to obtain signed item weights.
global_mean = train_ratings['rating'].mean()
print("Global train mean rating:", global_mean)

def get_item_vector_by_movieid(mid):
    """Return the TF-IDF row for movie `mid`, or None if the id is unknown.

    `mid` is converted to int before the lookup in `movieid2idx`; the returned
    value is the corresponding single row of `item_tfidf`.
    """
    movie_key = int(mid)
    if movie_key not in movieid2idx:
        return None
    return item_tfidf[movieid2idx[movie_key]]

unique_users = sorted(user_train_items.keys())
user2idx = {u: i for i, u in enumerate(unique_users)}
idx2user = {i: u for u, i in user2idx.items()}

# Positions of every user's training rows, found in a single grouping pass.
# This replaces a full boolean scan of `train_ratings` for each user.
train_rows_by_user = train_ratings.groupby('userId').indices
train_movie_ids = train_ratings['movieId'].to_numpy()
train_rating_values = train_ratings['rating'].to_numpy(dtype=float)
n_features = item_tfidf.shape[1]

user_profiles = []
missing_item_count = 0
print("Building user profiles...")
for user in tqdm(unique_users):
    rows = train_rows_by_user.get(user, np.empty(0, dtype=int))
    user_ratings = train_rating_values[rows]

    # Signed weights: above-average ratings pull the profile toward an item,
    # below-average ratings push it away.
    weights = user_ratings - global_mean
    if np.allclose(weights, 0):
        # Every rating equals the global mean, so centering would erase the
        # profile; fall back to the raw ratings as weights.
        weights = user_ratings

    # Map movieIds to rows of item_tfidf; ids without a vector are skipped
    # and counted.
    item_rows = [movieid2idx.get(int(mid)) for mid in train_movie_ids[rows]]
    known = [pos for pos, item_row in enumerate(item_rows) if item_row is not None]
    missing_item_count += len(item_rows) - len(known)

    if not known:
        # No usable item vector: the user gets an all-zero profile.
        profile = csr_matrix((1, n_features))
    else:
        known_weights = weights[known]
        known_items = [item_rows[pos] for pos in known]
        # Weighted sum of the item vectors as one sparse (1 x n) @ (n x features) product.
        profile = csr_matrix(known_weights.reshape(1, -1)).dot(item_tfidf[known_items])
        total_weight = np.abs(known_weights).sum()
        if total_weight > 0:
            profile = profile.multiply(1.0 / total_weight)
    user_profiles.append(profile)

print("Missing item vectors while building profiles:", missing_item_count)
# Stack in `unique_users` order (row i belongs to idx2user[i]) and L2-normalize
# so that dot products with item rows act as cosine similarities.
user_profiles_matrix = vstack(user_profiles)
user_profiles_matrix = normalize(user_profiles_matrix)

# -------------------------
# 7) Recommendation function
# -------------------------
def recommend_for_user(user_id, top_k=10, exclude_train=True):
    """Return up to `top_k` (movieId, similarity) pairs for a known user.

    Items are ranked by the dot product between the user's row in
    `user_profiles_matrix` and every row of `item_tfidf`, highest first.

    Parameters
    ----------
    user_id : key of `user2idx`
        Users without a profile get an empty list.
    top_k : int
        Maximum number of recommendations; values <= 0 give an empty list.
    exclude_train : bool
        When True, movies in the user's training set are skipped.

    Returns
    -------
    list of (movieId, float)
    """
    # Without this guard the loop below appends one item before checking the limit.
    if top_k <= 0:
        return []
    uidx = user2idx.get(user_id)
    if uidx is None:
        return []

    sims = user_profiles_matrix[uidx].dot(item_tfidf.T).toarray().ravel()
    seen = user_train_items.get(user_id, set()) if exclude_train else set()

    recs = []
    # Walk the full ranking, because an unknown number of top items may be filtered out.
    for idx in np.argsort(-sims):
        mid = idx2movieid[idx]
        if mid in seen:
            continue
        recs.append((mid, float(sims[idx])))
        if len(recs) >= top_k:
            break
    return recs

# Smoke test: print the top-5 recommendations for the first user in sorted
# order, provided at least one user has a profile.
if unique_users:
    sample_user = unique_users[0]
    print("Sample recs for user", sample_user, ":", recommend_for_user(sample_user, top_k=5))

# -------------------------
# 8) Evaluation metrics
# -------------------------
def precision_at_k(recommended, ground_truth, k):
    """Precision@k: distinct relevant items among the first k recommendations, divided by k.

    `recommended` is a sequence of tuples whose first element is the item id;
    `ground_truth` is an iterable of relevant item ids. The denominator is
    always `k`, even if fewer than `k` items were recommended. An empty
    recommendation list scores 0.0.
    """
    if len(recommended) == 0:
        return 0.0
    top_ids = {entry[0] for entry in recommended[:k]}
    relevant = set(ground_truth)
    return len(top_ids.intersection(relevant)) / float(k)

def recall_at_k(recommended, ground_truth, k):
    """Recall@k: share of the relevant items that appear in the first k recommendations.

    `recommended` is a sequence of tuples whose first element is the item id;
    `ground_truth` is a sized collection of relevant item ids. An empty
    ground truth scores 0.0.
    """
    n_relevant = len(ground_truth)
    if n_relevant == 0:
        return 0.0
    top_ids = {entry[0] for entry in recommended[:k]}
    found = top_ids.intersection(set(ground_truth))
    return len(found) / float(n_relevant)

def f1_at_k(p, r):
    """Harmonic mean of precision `p` and recall `r`; 0.0 when they sum to zero."""
    total = p + r
    return 0.0 if total == 0 else 2 * p * r / total

def average_precision_at_k(recommended, ground_truth, k):
    """Average precision over the first k recommendations.

    At every rank where the recommended item is in `ground_truth`, the
    precision at that rank is accumulated. The sum is divided by
    ``min(len(ground_truth), k)``. Returns 0.0 if there are no hits.
    """
    hit_count = 0
    precision_sum = 0.0
    for rank, entry in enumerate(recommended[:k], start=1):
        if entry[0] in ground_truth:
            hit_count += 1
            precision_sum += hit_count / rank
    if hit_count == 0:
        return 0.0
    return precision_sum / min(len(ground_truth), k)

def evaluate_all_users(k=10, users_subset=None, n_users_eval=None):
    """Average precision/recall/F1/MAP at `k` over users that have test items.

    Parameters
    ----------
    k : int
        Cut-off passed to the recommender and to every metric.
    users_subset : sequence or None
        Users to evaluate; defaults to `unique_users`.
    n_users_eval : int or None
        If truthy, only the first `n_users_eval` users are considered.

    Returns
    -------
    dict
        Keys 'precision@k', 'recall@k', 'f1@k', 'map@k' (with the value of `k`
        substituted) and 'n_users_eval', the number of users actually scored.
        Each metric is 0.0 when no user was scored.
    """
    candidates = unique_users if users_subset is None else users_subset
    if n_users_eval:
        candidates = candidates[:n_users_eval]

    # One (precision, recall, f1, average precision) tuple per scored user.
    per_user = []
    for uid in tqdm(candidates):
        truth = user_test_items.get(uid, set())
        if not truth:
            # Nothing held out for this user, so there is nothing to score.
            continue
        ranked = recommend_for_user(uid, top_k=k, exclude_train=True)
        prec = precision_at_k(ranked, truth, k)
        rec = recall_at_k(ranked, truth, k)
        harmonic = f1_at_k(prec, rec)
        avg_prec = average_precision_at_k(ranked, truth, k)
        per_user.append((prec, rec, harmonic, avg_prec))

    def _column_mean(position):
        values = [scores[position] for scores in per_user]
        return np.mean(values) if values else 0.0

    return {
        f'precision@{k}': _column_mean(0),
        f'recall@{k}': _column_mean(1),
        f'f1@{k}': _column_mean(2),
        f'map@{k}': _column_mean(3),
        'n_users_eval': len(per_user),
    }

# Cut-off used for all @K metrics below.
K = 10
print("Evaluating (may take a while)...")
# n_users_eval=1000 limits the run to the first 1000 users of `unique_users`
# (sorted by id), so this is a prefix of the user list rather than a random sample.
eval_results = evaluate_all_users(k=K, n_users_eval=1000)  # evaluate on up to 1000 users (faster); remove n_users_eval for all
print("Eval results:", eval_results)

# -------------------------
# 9) Save artifacts
# -------------------------
# Item-side artifacts: everything needed to score items without re-fitting TF-IDF.
model_artifacts = {
    'tfidf_vectorizer': tfidf,
    'item_tfidf': item_tfidf,
    'movieid2idx': movieid2idx,
    'idx2movieid': idx2movieid,
    'movies_df': movies[['movieId', 'title', 'genres', 'doc']].copy(),
}
joblib.dump(model_artifacts, os.path.join(SAVE_DIR, "cb_model_artifacts.pkl"))
# User-side artifacts: the profile matrix, its row mappings and each user's
# training items. The held-out test items are NOT saved here.
joblib.dump({
    'user_profiles_matrix': user_profiles_matrix,
    'user2idx': user2idx,
    'idx2user': idx2user,
    'user_train_items': user_train_items
}, os.path.join(SAVE_DIR, "cb_user_profiles.pkl"))
print("Saved models to:", SAVE_DIR)

# -------------------------
# 10) Quick helper: load model & recommend for a new user
# -------------------------
def load_model_and_recommend(model_path, user_rated_items=None, top_k=10, exclude_rated=True):
    """Load saved item artifacts and recommend movies for an ad-hoc user.

    Parameters
    ----------
    model_path : str
        Path to the artifacts file written by ``joblib.dump``.
    user_rated_items : iterable of (movieId, rating) or None
        Ratings of the new user. Unknown movie ids are ignored. With None, the
        items with the largest TF-IDF row sums are returned instead.
    top_k : int
        Maximum number of results; values <= 0 give an empty list.
    exclude_rated : bool
        When True (default), movies present in `user_rated_items` are not
        recommended back to the user. Pass False to include them.

    Returns
    -------
    list of (movieId, score, title)
        `score` is the similarity to the user profile, or None in the
        no-ratings fallback. Returns [] when no rated movie id is known.
    """
    # NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
    artifacts = joblib.load(model_path)
    item_tfidf = artifacts['item_tfidf']
    movieid2idx = artifacts['movieid2idx']
    idx2movieid = artifacts['idx2movieid']
    movies_df = artifacts['movies_df']

    if top_k <= 0:
        return []

    # Build the id -> title map once instead of scanning the frame per result.
    # drop_duplicates keeps the first title for a repeated movieId.
    titles = movies_df.drop_duplicates('movieId').set_index('movieId')['title'].to_dict()

    rated_rows = set()
    if user_rated_items is not None:
        profile = None
        total_w = 0.0
        for mid, rating in user_rated_items:
            idx = movieid2idx.get(int(mid))
            if idx is None:
                continue
            rated_rows.add(idx)
            w = float(rating)
            contribution = item_tfidf[idx].multiply(w)
            profile = contribution if profile is None else profile + contribution
            total_w += abs(w)
        if profile is None:
            return []
        if total_w > 0:
            profile = profile.multiply(1.0 / total_w)
        profile = normalize(profile)
        sims = profile.dot(item_tfidf.T).toarray().ravel()
        ranking = np.argsort(-sims)
    else:
        sims = None
        ranking = np.argsort(-np.array(item_tfidf.sum(axis=1)).ravel())

    # Rated movies are the most similar to the user's own profile, so without
    # this filter they would fill the top of the list.
    skip_rows = rated_rows if exclude_rated else set()
    results = []
    for i in ranking:
        if i in skip_rows:
            continue
        mid = idx2movieid[i]
        score = float(sims[i]) if sims is not None else None
        results.append((mid, score, titles.get(mid)))
        if len(results) >= top_k:
            break
    return results

# Example usage after saving:
# model_path = os.path.join(SAVE_DIR, "cb_model_artifacts.pkl")
# print(load_model_and_recommend(model_path, user_rated_items=[(1,5.0),(50,4.0)], top_k=10))

# Closing status message. The follow-ups it lists are suggestions only; none
# of them is implemented in this cell.
print("Finished. If you want, I can: \n - change evaluation to all users (remove n_users_eval),\n - switch TF-IDF to sentence-transformer embeddings for better semantic matches,\n - provide a small Flask/FastAPI server to serve recommendations using the saved .pkl.")


Loading CSVs from: C:\Users\Khan\Desktop\ProductRecommendation
movies: (27278, 3), ratings: (20000263, 4), genome_scores: (11709768, 3), genome_tags: (1128, 2)
user_tags: (465564, 4)
Building text documents for movies...
Vectorizing item documents with TF-IDF...
Item TF-IDF shape: (27278, 50000)
Train ratings: (16001027, 4) Test ratings: (3999236, 4)
Global train mean rating: 3.525422836921655
Building user profiles...


100%|██████████| 138493/138493 [2:33:10<00:00, 15.07it/s]


Missing item vectors while building profiles: 0
Sample recs for user 1 : [(2161, 0.5737094713387488), (98809, 0.5685739429266572), (65685, 0.5672779388649354), (2093, 0.5577199835554792), (2116, 0.5511835861834288)]
Evaluating (may take a while)...


100%|██████████| 1000/1000 [00:20<00:00, 49.64it/s]


Eval results: {'precision@10': np.float64(0.0348), 'recall@10': np.float64(0.019175037278565527), 'f1@10': np.float64(0.020723739538914675), 'map@10': np.float64(0.016176298028470648), 'n_users_eval': 1000}
Saved models to: C:\Users\Khan\Desktop\ProductRecommendation\cb_model
Finished. If you want, I can: 
 - change evaluation to all users (remove n_users_eval),
 - switch TF-IDF to sentence-transformer embeddings for better semantic matches,
 - provide a small Flask/FastAPI server to serve recommendations using the saved .pkl.


In [5]:
# ================================
# üìä Content-Based Recommendation Model Tester
# ================================
# Requirements: pandas, numpy, scikit-learn, joblib, tqdm
# Run this after training has produced:
#   cb_model_artifacts.pkl
#   cb_user_profiles.pkl
# ================================

import os
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import normalize

# -------------------------
# 1) Paths and Load Models
# -------------------------
DATA_DIR = r"C:\Users\Khan\Desktop\ProductRecommendation"
MODEL_DIR = os.path.join(DATA_DIR, "cb_model")

artifacts_path = os.path.join(MODEL_DIR, "cb_model_artifacts.pkl")
profiles_path = os.path.join(MODEL_DIR, "cb_user_profiles.pkl")

print("üìÇ Loading saved models...")
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
artifacts = joblib.load(artifacts_path)
profiles = joblib.load(profiles_path)

# Item-side artifacts (vectorizer, item matrix, id <-> row mappings, movie metadata).
tfidf = artifacts["tfidf_vectorizer"]
item_tfidf = artifacts["item_tfidf"]
movieid2idx = artifacts["movieid2idx"]
idx2movieid = artifacts["idx2movieid"]
movies_df = artifacts["movies_df"]

# User-side artifacts (profile matrix, user <-> row mappings, training items per user).
user_profiles_matrix = profiles["user_profiles_matrix"]
user2idx = profiles["user2idx"]
idx2user = profiles["idx2user"]
user_train_items = profiles["user_train_items"]

print("‚úÖ Models loaded successfully!")

# -------------------------
# 2) Load Ratings Data for Evaluation
# -------------------------
RATINGS_CSV = os.path.join(DATA_DIR, "rating.csv")
ratings = pd.read_csv(RATINGS_CSV)
ratings["movieId"] = ratings["movieId"].astype(int)
ratings["userId"] = ratings["userId"].astype(int)

# Keep only users with at least `min_ratings` ratings.
min_ratings = 5
user_counts = ratings["userId"].value_counts()
valid_users = user_counts[user_counts >= min_ratings].index.tolist()
ratings_filtered = ratings[ratings["userId"].isin(valid_users)].copy()

# Split into train/test (same logic as before)
# The test items are not stored in the saved artifacts, so they are
# re-derived here from the same seed and the same group order.
# NOTE(review): the result only matches the training-time split if the ratings
# file and the library versions are unchanged - confirm before trusting the metrics.
rng = np.random.RandomState(42)
train_list, test_list = [], []
for user, group in ratings_filtered.groupby("userId"):
    # Shuffle the user's ratings, then hold out the first ~20% (at least one) as test.
    g = group.sample(frac=1.0, random_state=rng)
    test_size = max(1, int(round(0.2 * len(g))))
    test = g.iloc[:test_size]
    train = g.iloc[test_size:]
    train_list.append(train)
    test_list.append(test)

train_ratings = pd.concat(train_list).reset_index(drop=True)
test_ratings = pd.concat(test_list).reset_index(drop=True)
print("üìä Train:", train_ratings.shape, "Test:", test_ratings.shape)

# userId -> set of held-out movieIds used as ground truth below.
user_test_items = test_ratings.groupby("userId")["movieId"].apply(set).to_dict()

# -------------------------
# 3) Helper Functions
# -------------------------
def recommend_for_user(user_id, top_k=10, exclude_train=True):
    """Generate recommendations for a given user.

    Items are ranked by the dot product between the user's row in
    `user_profiles_matrix` and every row of `item_tfidf`, highest first.

    Parameters
    ----------
    user_id : key of `user2idx`
        Users without a profile get an empty list.
    top_k : int
        Maximum number of recommendations; values <= 0 give an empty list.
    exclude_train : bool
        When True, movies in the user's training set are skipped.

    Returns
    -------
    list of (movieId, float)
    """
    # Without this guard the loop below appends one item before checking the limit.
    if top_k <= 0:
        return []
    uidx = user2idx.get(user_id)
    if uidx is None:
        return []

    sims = user_profiles_matrix[uidx].dot(item_tfidf.T).toarray().ravel()
    seen = user_train_items.get(user_id, set()) if exclude_train else set()

    recs = []
    # Walk the full ranking, because an unknown number of top items may be filtered out.
    for idx in np.argsort(-sims):
        mid = idx2movieid[idx]
        if mid in seen:
            continue
        recs.append((mid, float(sims[idx])))
        if len(recs) >= top_k:
            break
    return recs


def precision(pred, true):
    """Fraction of predicted items that are relevant.

    The denominator is ``len(pred)``, so duplicate predictions count toward
    it. An empty prediction list scores 0.0.
    """
    if not pred:
        return 0.0
    matched = set(pred).intersection(true)
    return len(matched) / len(pred)

def recall(pred, true):
    """Fraction of relevant items that were predicted; 0.0 for empty ground truth."""
    if not true:
        return 0.0
    matched = set(pred).intersection(true)
    return len(matched) / len(true)

def f1(p, r):
    """Harmonic mean of precision `p` and recall `r`; 0.0 when their sum is falsy."""
    total = p + r
    if not total:
        return 0.0
    return 2 * p * r / total


# -------------------------
# 4) Evaluate Model on Test Users
# -------------------------
print("\nüîç Evaluating model on test users...")
precision_list, recall_list, f1_list = [], [], []

# Every user with held-out items is scored. A user without a saved profile gets
# an empty recommendation list and therefore contributes zeros.
for user in tqdm(user_test_items.keys()):
    true_items = user_test_items[user]
    recs = recommend_for_user(user, top_k=10, exclude_train=True)
    # Keep only the movie ids; the similarity scores are not needed for the metrics.
    pred_items = [m for m, _ in recs]

    # Precision is relative to the number of items actually returned.
    p = precision(pred_items, true_items)
    r = recall(pred_items, true_items)
    f = f1(p, r)

    precision_list.append(p)
    recall_list.append(r)
    f1_list.append(f)

# -------------------------
# 5) Print Final Evaluation
# -------------------------
# Macro averages over the evaluated users. The guards avoid NaN (and a numpy
# warning) when no user was evaluated.
results = {
    "Precision": np.mean(precision_list) if precision_list else 0.0,
    "Recall": np.mean(recall_list) if recall_list else 0.0,
    "F1": np.mean(f1_list) if f1_list else 0.0,
    "Users_Evaluated": len(precision_list)
}

print("\nüìà Final Evaluation (no @k notation):")
for metric_name, metric_value in results.items():
    if isinstance(metric_value, int):
        # Counts are printed as integers rather than as e.g. "138493.0000".
        print(f"{metric_name}: {metric_value}")
    else:
        print(f"{metric_name}: {metric_value:.4f}")

# -------------------------
# 6) Generate Content-Based Recommendations
# -------------------------
print("\nüé¨ Example: Recommendations for a random test user\n")
# The user is drawn from all users that have a saved profile (the keys of
# user2idx). The draw is unseeded, so the chosen user differs between runs.
sample_user = np.random.choice(list(user2idx.keys()))
recs = recommend_for_user(sample_user, top_k=10)
print(f"User {sample_user} top 10 recommendations:\n")

for mid, score in recs:
    # Look up the display title for the recommended movie id.
    title = movies_df.loc[movies_df["movieId"] == mid, "title"].values[0]
    print(f"  üé• {title:<40}  (Score: {score:.4f})")

print("\n‚úÖ Done.")


üìÇ Loading saved models...
‚úÖ Models loaded successfully!
üìä Train: (16001027, 4) Test: (3999236, 4)

üîç Evaluating model on test users...


100%|██████████| 138493/138493 [50:09<00:00, 46.01it/s]



üìà Final Evaluation (no @k notation):
Precision: 0.0338
Recall: 0.0182
F1: 0.0197
Users_Evaluated: 138493.0000

üé¨ Example: Recommendations for a random test user

User 77433 top 10 recommendations:

  üé• Being There (1979)                        (Score: 0.3236)
  üé• Pan's Labyrinth (Laberinto del fauno, El) (2006)  (Score: 0.3213)
  üé• Exterminating Angel, The (√Ångel exterminador, El) (1962)  (Score: 0.3203)
  üé• Under the Volcano (1984)                  (Score: 0.3165)
  üé• Old Boy (2003)                            (Score: 0.3147)
  üé• Black Swan (2010)                         (Score: 0.3095)
  üé• Adam's Apples (Adams √¶bler) (2005)        (Score: 0.3094)
  üé• Lawn Dogs (1997)                          (Score: 0.3047)
  üé• 8 1/2 (8¬Ω) (1963)                         (Score: 0.3019)
  üé• Storytelling (2001)                       (Score: 0.2977)

‚úÖ Done.
