In [11]:
import pandas as pd
import os
!pip install pyspark>=3.5.1
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql import Row

In [12]:
#Does JAVA path exist
os.environ["JAVA_HOME"] = r"C:\Users\pricc\OneDrive\Desktop\pyspark\app.py\Java\jdk-11"
print("JAVA_HOME exists:", os.path.exists(os.environ["JAVA_HOME"]))


JAVA_HOME exists: True


In [13]:
spark = SparkSession.builder.master("local[*]").appName("MyApp").getOrCreate()
os.environ["JAVA_HOME"] = r"C:\Users\pricc\OneDrive\Desktop\pyspark\app.py\Java\jdk-11"

In [14]:
#Does JAVA path exist
os.environ["JAVA_HOME"] = r"C:\Users\pricc\OneDrive\Desktop\pyspark\app.py\Java\jdk-11"
print("JAVA_HOME exists:", os.path.exists(os.environ["JAVA_HOME"]))


JAVA_HOME exists: True


In [15]:


file_path = r"C:\Users\pricc\Downloads\movies_enriched_full.csv"
df = pd.read_csv(file_path, header=None)
print(df.head())


movie_df = ("movies_enriched_full.csv")

# File paths
ratings_path = r"C:\Users\pricc\Downloads\ratings.dat"
users_path = r"C:\Users\pricc\Downloads\users.dat"

# Load ratings.dat
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

# Load users.dat
users = pd.read_csv(
    users_path,
    sep="::",
    engine="python",
    names=["userId", "gender", "age", "occupation", "zip"]
)

# Preview the data
print("Ratings:")
print(ratings.head())
print("\nUsers:")
print(users.head())


        0                         1                             2     3   \
0  movieId                     title                        genres  year   
1        1          Toy Story (1995)   Animation|Children's|Comedy  1995   
2        2            Jumanji (1995)  Adventure|Children's|Fantasy  1995   
3        3   Grumpier Old Men (1995)                Comedy|Romance  1995   
4        4  Waiting to Exhale (1995)                  Comedy|Drama  1995   

                  4        5   \
0        clean_title  tmdb_id   
1          Toy Story    862.0   
2            Jumanji   8844.0   
3   Grumpier Old Men  15602.0   
4  Waiting to Exhale  31357.0   

                                                  6   \
0                                           overview   
1  Led by Woody, Andy's toys live happily in his ...   
2  When siblings Judy and Peter discover an encha...   
3  A family wedding reignites the ancient feud be...   
4  Cheated on, mistreated and stepped on, the wom...   

       

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# ----------------------------------------
# Load Enriched Movie Data
# ----------------------------------------
df = pd.read_csv(r"C:\Users\pricc\Downloads\movies_enriched_full.csv")

# ----------------------------------------
# SCombine Metadata Fields
# ----------------------------------------

def combine_metadata(row):
    return " ".join([
        str(row["tmdb_genres"]) if pd.notnull(row["tmdb_genres"]) else "",
        str(row["keywords"]) if pd.notnull(row["keywords"]) else "",
        str(row["top_3_cast"]) if pd.notnull(row["top_3_cast"]) else "",
        str(row["directors"]) if pd.notnull(row["directors"]) else ""
    ]).lower().replace(",", " ").replace(":", " ").replace("-", " ")

df["metadata"] = df.apply(combine_metadata, axis=1)

# ----------------------------------------
#  Build Vectorizers
# ----------------------------------------

# Count Vectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(df["metadata"])

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df["metadata"])

# ----------------------------------------
#  Compute Cosine Similarity
# ----------------------------------------

cosine_sim_count = cosine_similarity(count_matrix)
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)

# ----------------------------------------
#  Save Results
# ----------------------------------------

# Save similarity matrices as NumPy arrays
np.save("cosine_sim_count.npy", cosine_sim_count)
np.save("cosine_sim_tfidf.npy", cosine_sim_tfidf)

# Optional: Save similarity matrices as CSVs
pd.DataFrame(cosine_sim_count, index=df["title"], columns=df["title"]).to_csv("cosine_sim_count.csv")
pd.DataFrame(cosine_sim_tfidf, index=df["title"], columns=df["title"]).to_csv("cosine_sim_tfidf.csv")

print(" Models built and similarity matrices saved.")

 Models built and similarity matrices saved.


In [19]:
def recommend_movies(title, similarity_matrix, df, top_n=10):
    idx = df[df["title"] == title].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    similar_movies = [(df.iloc[i]["title"], score) for i, score in sim_scores]
    return similar_movies

# Example: Recommend movies similar to "Toy Story (1995)"
print("\nTop 5 similar movies to Toy story using TF-IDF:")
for movie, score in recommend_movies("Toy Story (1995)", cosine_sim_tfidf, df, top_n=5):
    print(f"{movie} (Score: {score:.4f})")


Top 5 similar movies to Toy story using TF-IDF:
Small Soldiers (1998) (Score: 0.3902)
Toy Story 2 (1999) (Score: 0.3538)
Indian in the Cupboard, The (1995) (Score: 0.3100)
Toys (1992) (Score: 0.2657)
Babes in Toyland (1961) (Score: 0.2394)


 TF-IDF similarity results for movies similar to Toy Story.
 TF-IDF (Term Frequency–Inverse Document Frequency) compareD the textual content (plot summaries, keywords, etc.) of movies Similar to Toy Story. Based on this:

Small Soldiers is the top match at 39%. It features toys coming to life, much like Toy Story, but with a military/sci-fi twist.

Toy Story 2 being similar makes perfect sense at 35%. Tt's a direct sequel with near-identical characters and themes.

The Indian in the Cupboard at 31%, also shares the theme of toys/figures coming to life and interacting with a child.

Toys and Babes in Toyland at 23% carries themes centered around toy worlds or imaginative play, aligning them conceptually.



In [20]:
def print_recommendations(title, similarity_matrix, df, top_n=5):
    idx = df[df["title"] == title].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    print(f"\n Top {top_n} similar movies to '{title}':")
    for i, (movie_idx, score) in enumerate(sim_scores, 1):
        print(f"{i}. {df.iloc[movie_idx]['title']} (Similarity: {score:.4f})")


In [21]:
print(' CountVectorizer Recommendations')
print_recommendations("Toy Story (1995)", cosine_sim_count, df, top_n=5)

 CountVectorizer Recommendations

 Top 5 similar movies to 'Toy Story (1995)':
1. Toy Story 2 (1999) (Similarity: 0.4518)
2. Small Soldiers (1998) (Similarity: 0.3790)
3. Indian in the Cupboard, The (1995) (Similarity: 0.2887)
4. Big (1988) (Similarity: 0.2502)
5. Babes in Toyland (1961) (Similarity: 0.2485)


CountVectorizer similarity results for movies similar to Toy Story.

CountVectorizer converts text (e.g., movie plot summaries) into a matrix of token counts.

It builds a bag-of-words representation, capturing how often each word appears, but ignores word order and semantic meaning.

Similarity is then computed using cosine similarity between these count vectors.

According to  CountVectorizer Recommendations the similarity of other movies to Toy Story is very low. The highest recommended movie is Toy story 2 that is 45% similar, Followed by Small soldiers at 38%. CountVectorizer returns a lower similarity at 28% compared to TD-IDF that recommended the same movie at 31%. 

In [26]:
import pandas as pd
import numpy as np

# Load movie data and similarity matrices
df = pd.read_csv(r"C:\Users\pricc\Downloads\movies_enriched_full.csv")
cosine_sim_count = np.load(r"C:\Users\pricc\OneDrive\Desktop\pyspark\cosine_sim_count.npy")
cosine_sim_tfidf = np.load(r"C:\Users\pricc\OneDrive\Desktop\pyspark\cosine_sim_tfidf.npy")
print(" Cosine Similarity Matrix (CountVectorizer):")
print(pd.DataFrame(cosine_sim_count, index=df["title"], columns=df["title"]).iloc[:5, :5])

 Cosine Similarity Matrix (CountVectorizer):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.054433   
Jumanji (1995)                              0.054433        1.000000   
Grumpier Old Men (1995)                     0.054433        0.000000   
Waiting to Exhale (1995)                    0.066227        0.064889   
Father of the Bride Part II (1995)          0.046676        0.034300   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.054433   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.097333   
Father of the Bride Part II (1995)                 0.068599   

title   

 Cosine Similarity Matrix (CountVectorizer)
CountVectorizer is Ideal for quick recommendations when using well-structured metadata or synopses.
Self-similarity between Toy story and Toy Story is always 1.0, so Toy Story is 100% similar to itself.

The similarity between Toy Story and Jumanji is 0.054, indicating very low textual similarity (that could mean in plot summary or keywords don't overlap much).

Waiting to Exhale actually has a higher similarity (0.066) to Toy Story than Jumanji does which is counterintuitive but reflects overlap in words/phrases rather than theme or genre.

Grumpier Old Men and Father of the Bride Part II also show weak similarity, likely due to shared generic terms like "family", "life", etc.

COMPARISON BETWEEN TD-IDF and CountVectorizer
CountVectorizer treats plots as just word counts, ignoring word order or semantics.

CountVectorizer scores don’t mean thematic or genre similarity,only textual overlap in description/metadata.

According to CountVectorizer Toy Story is weakly similar to other 1995 films based.

TF-IDF gives more refined similarity (as you saw earlier), while CountVectorizer tends to inflate common word matches.

Using TF-IDF + cosine offers deeper semantic similarity.

Both have 3 movies in common as highly recommended.

In [19]:
import pandas as pd
import numpy as np

# Load movie data and similarity matrices
df = pd.read_csv("movies_enriched_full.csv")
cosine_sim_count = np.load("cosine_sim_count.npy")
cosine_sim_tfidf = np.load("cosine_sim_tfidf.npy")
print(" Cosine Similarity Matrix (CountVectorizer):")
print(pd.DataFrame(cosine_sim_count, index=df["title"], columns=df["title"]).iloc[:5, :5])

 Cosine Similarity Matrix (CountVectorizer):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.054433   
Jumanji (1995)                              0.054433        1.000000   
Grumpier Old Men (1995)                     0.054433        0.000000   
Waiting to Exhale (1995)                    0.066227        0.064889   
Father of the Bride Part II (1995)          0.046676        0.034300   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.054433   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.097333   
Father of the Bride Part II (1995)                 0.068599   

title   

TF-IDF

In [18]:
print("\n Cosine Similarity Matrix (TF-IDF):")
print(pd.DataFrame(cosine_sim_tfidf, index=df["title"], columns=df["title"]).iloc[:5, :5])



 Cosine Similarity Matrix (TF-IDF):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.014150   
Jumanji (1995)                              0.014150        1.000000   
Grumpier Old Men (1995)                     0.022040        0.000000   
Waiting to Exhale (1995)                    0.028133        0.017912   
Father of the Bride Part II (1995)          0.009242        0.009244   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.022040   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.017067   
Father of the Bride Part II (1995)                 0.024845   

title           

````````````````````````````````````````````````````````````````````````````````````````

Memory-Based Collaborative Filtering (Bias-Normalized)

User-Based CF Recommendations for User 48

In [60]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import time

# ==============================
# Load and Subset Data
# ==============================

ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

movies = pd.read_csv(file_path)[['movieId', 'title']]

# Use smaller subset
subset_users = ratings['userId'].value_counts().head(500).index
subset_movies = ratings['movieId'].value_counts().head(500).index
ratings_small = ratings[ratings['userId'].isin(subset_users) & ratings['movieId'].isin(subset_movies)]

# ==============================
# Create Bias-Normalized Matrix
# ==============================

def create_normalized_user_item_matrix(ratings):
    matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    return matrix.sub(user_means, axis=0).fillna(0), user_means

user_item_matrix, user_means = create_normalized_user_item_matrix(ratings_small)

# ==============================
# Compute User Similarity Matrix
# ==============================

def compute_similarity(matrix, kind='user'):
    if kind == 'user':
        sim = 1 - pairwise_distances(matrix, metric='cosine')
    else:
        raise ValueError("Only 'user' similarity supported in this run")
    return sim

user_sim_matrix = compute_similarity(user_item_matrix, kind='user')

# ==============================
# User-Based Recommendation Function
# ==============================

def recommend_user_based(user_id, user_item_matrix, user_means, similarity_matrix, top_n=50):
    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not found in data.")
        return pd.DataFrame(columns=['movieId', 'score'])

    user_sim_scores = similarity_matrix[user_item_matrix.index.get_loc(user_id)]
    normalized_ratings = user_item_matrix.values

    weighted_scores = user_sim_scores @ normalized_ratings
    sum_weights = np.abs(user_sim_scores).sum()

    if sum_weights == 0:
        return pd.DataFrame(columns=['movieId', 'score'])

    predicted_ratings = weighted_scores / sum_weights
    predicted_ratings += user_means.loc[user_id]

    user_seen = user_item_matrix.loc[user_id]
    unseen_mask = user_seen == 0
    recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]\
        .sort_values(ascending=False).head(top_n)

    return pd.DataFrame({
        'movieId': recs.index,
        'score': recs.values
    })

# ==============================
# Get Top-50 Recommendations for User 48
# ==============================

test_user_id = 48

user_cf_recs = recommend_user_based(
    test_user_id,
    user_item_matrix,
    user_means,
    user_sim_matrix,
    top_n=50
)

user_cf_recs = user_cf_recs.merge(movies, on="movieId", how="left")

# ==============================
# Display Recommendations
# ==============================

print(f"\n Top 50 User-Based CF Recommendations for User {test_user_id}:")
print(user_cf_recs[['movieId', 'title', 'score']].head(10))



 Top 50 User-Based CF Recommendations for User 48:
   movieId                                              title     score
0      750  Dr. Strangelove or: How I Learned to Stop Worr...  3.818245
1      904                                 Rear Window (1954)  3.756734
2      908                          North by Northwest (1959)  3.723104
3     1089                              Reservoir Dogs (1992)  3.708471
4     1234                                  Sting, The (1973)  3.702266
5      953                       It's a Wonderful Life (1946)  3.700303
6     1252                                   Chinatown (1974)  3.675555
7     1278                          Young Frankenstein (1974)  3.674163
8     1225                                     Amadeus (1984)  3.668453
9     2997                        Being John Malkovich (1999)  3.665072


Item-Based CF Recommendations for user 48

In [74]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import time

# ==============================
# Load and Subset Data
# ==============================

ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

movies = pd.read_csv(file_path)[['movieId', 'title']]

# Use smaller subset
subset_users = ratings['userId'].value_counts().head(500).index
subset_movies = ratings['movieId'].value_counts().head(500).index
ratings_small = ratings[ratings['userId'].isin(subset_users) & ratings['movieId'].isin(subset_movies)]

# ==============================
# Create Normalized & Raw User-Item Matrices
# ==============================

def create_normalized_user_item_matrix(ratings):
    matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    return matrix.sub(user_means, axis=0).fillna(0), user_means

user_item_matrix_norm, user_means = create_normalized_user_item_matrix(ratings_small)
user_item_matrix_raw = ratings_small.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# ==============================
# Compute Similarity (Raw for Item-CF)
# ==============================

def compute_similarity(matrix, kind='user'):
    if kind == 'user':
        sim = 1 - pairwise_distances(matrix, metric='cosine')
    elif kind == 'item':
        sim = 1 - pairwise_distances(matrix.T, metric='cosine')
    else:
        raise ValueError("kind must be 'user' or 'item'")
    print(f"{kind.title()}-based similarity computed. Shape: {sim.shape}")
    return sim

item_sim_matrix_raw = compute_similarity(user_item_matrix_raw, kind='item')

# ==============================
# Recommendation Function (Item-Based, Raw)
# ==============================

def recommend_memory_based(user_id, user_item_matrix, user_means, similarity_matrix, kind='item', top_n=50):
    model_label = f"{kind.title()}-Based CF"

    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not in matrix.")
        return pd.DataFrame(columns=['movieId', 'score', 'model'])

    if kind == 'item':
        user_ratings = user_item_matrix.loc[user_id]
        scores = user_ratings @ similarity_matrix
        sum_weights = (user_ratings != 0) @ np.abs(similarity_matrix)

        with np.errstate(divide='ignore', invalid='ignore'):
            predicted_ratings = np.true_divide(scores, sum_weights)
            predicted_ratings[sum_weights == 0] = 0

        unseen_mask = user_ratings == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]\
            .sort_values(ascending=False).head(top_n)

    else:
        raise ValueError("Only item-based CF supported in this call") 

    return pd.DataFrame({
        'movieId': recs.index,
        'score': recs.values,
        'model': model_label
    })

# ==============================
# Generate Top-50 Item-CF Recs for User 48
# ==============================

test_user_id = 48

item_cf_recs_user48 = recommend_memory_based(
    test_user_id,
    user_item_matrix_raw,
    None,
    item_sim_matrix_raw,
    kind='item',
    top_n=50
)

item_cf_recs_user48 = item_cf_recs_user48.merge(movies, on="movieId", how="left")

# ==============================
# Display
# ==============================

print(f"\n Top 50 Item-Based CF Recommendations (Raw Matrix) for User {test_user_id}:")
print(item_cf_recs_user48[['movieId', 'title', 'score']].head(10))


Item-based similarity computed. Shape: (500, 500)

 Top 50 Item-Based CF Recommendations (Raw Matrix) for User 48:
   movieId                             title     score
0     1952            Midnight Cowboy (1969)  3.414641
1     1244                  Manhattan (1979)  3.412156
2     1084           Bonnie and Clyde (1967)  3.409158
3     1230                 Annie Hall (1977)  3.409011
4     1252                  Chinatown (1974)  3.408365
5     1172            Cinema Paradiso (1988)  3.407679
6     1957           Chariots of Fire (1981)  3.407620
7     1228                Raging Bull (1980)  3.407214
8     1267  Manchurian Candidate, The (1962)  3.406944
9      908         North by Northwest (1959)  3.406765


COMPARISON OF User based CF recommendations and Item-Based CF Recommendations

The User based CF recommendation score is the predicted rating for an unseen movie, based on users similar to User 48 ratings, adjusted to the target user’s scale. Weighted average of normalized ratings from similar users was re-centered to user. UBCF recommended movies with Higher scores that User 48 is more strongly predicted to like. How i learned to stop to worry was most highly rated at 3.8 score.

The Top 50 Item-Based CF Recommendations for User 48 score above is the predicted rating for an unseen movie, based on how similar it is to the movies the user 48 has already rated. Weighted average of user’s ratings for similar items was used. Compared to UBCF, Item based recommended movies have Lower scores Midnight Cowboy has a predicated rating of 3.4 meaning User 48 is predicted to like Item based recommendations less..

`````````````````````````````````````````````````````````````````````````````````

`````````````````````````````````````````````````````````````````````````````````````````````````````````````````````

User Based CF RMSE

In [72]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm

# ----------------------------
# Create normalized user-item matrix (mean-centered)
# ----------------------------
def create_normalized_user_item_matrix(ratings_df):
    matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    matrix_centered = matrix.sub(user_means, axis=0)
    return matrix_centered, user_means

train_matrix_norm, train_user_means = create_normalized_user_item_matrix(train_ratings)

# ----------------------------
# Compute user-user cosine similarity
# ----------------------------
def compute_user_similarity(norm_matrix):
    sim = cosine_similarity(norm_matrix.fillna(0))
    return pd.DataFrame(sim, index=norm_matrix.index, columns=norm_matrix.index)

user_sim_matrix = compute_user_similarity(train_matrix_norm)

# ----------------------------
# Predict using User-Based CF with normalized matrix
# ----------------------------
def predict_user_based(test_df, norm_matrix, user_means, user_sim):
    preds = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        user, item = row['userId'], row['movieId']
        if user not in norm_matrix.index or item not in norm_matrix.columns:
            preds.append(np.nan)
            continue

        sims = user_sim[user]
        item_ratings = norm_matrix[item]

        mask = item_ratings.notna()
        sims = sims[mask]
        item_ratings = item_ratings[mask]

        if len(sims) == 0:
            preds.append(np.nan)
        else:
            weighted_sum = np.dot(sims, item_ratings)
            sum_weights = np.sum(np.abs(sims)) + 1e-8
            pred = train_user_means[user] + weighted_sum / sum_weights
            preds.append(pred)

    result = test_df.copy()
    result['predicted'] = preds
    return result

user_preds = predict_user_based(test_ratings, train_matrix_norm, train_user_means, user_sim_matrix)

# ----------------------------
# Clip and clean predictions
# ----------------------------
user_preds['predicted'] = user_preds['predicted'].clip(1, 5)
user_preds_valid = user_preds.dropna()
user_preds_valid = user_preds_valid[user_preds_valid['predicted'] > 0]

# ----------------------------
# RMSE
# ----------------------------
def compute_rmse_from_df(pred_df):
    return sqrt(mean_squared_error(pred_df['rating'], pred_df['predicted']))

rmse_user = compute_rmse_from_df(user_preds_valid)
print(f"Test RMSE (User-Based CF): {rmse_user:.4f}")

# ----------------------------
# Precision@K, Recall@K, NDCG@K
# ----------------------------
def precision_recall_ndcg_at_k(pred_df, truth_df, k=10, threshold=4.0):
    precisions, recalls, ndcgs = [], [], []

    for user_id in pred_df.index:
        if user_id not in truth_df.index:
            continue

        pred_ratings = pred_df.loc[user_id].dropna().sort_values(ascending=False)
        true_ratings = truth_df.loc[user_id]

        top_k_items = pred_ratings.head(k).index
        relevant_items = true_ratings[true_ratings >= threshold].index

        if len(relevant_items) == 0:
            continue

        hits = [1 if item in relevant_items else 0 for item in top_k_items]

        precision = np.sum(hits) / k
        recall = np.sum(hits) / len(relevant_items)
        dcg = np.sum([hit / np.log2(idx + 2) for idx, hit in enumerate(hits)])
        idcg = np.sum([1 / np.log2(i + 2) for i in range(min(len(relevant_items), k))])
        ndcg = dcg / idcg if idcg > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

# Create user-item prediction matrix
pred_matrix = user_preds_valid.pivot(index='userId', columns='movieId', values='predicted')
truth_matrix = test_ratings.pivot(index='userId', columns='movieId', values='rating')

# Evaluate
p, r, n = precision_recall_ndcg_at_k(pred_matrix, truth_matrix, k=10)
print(f"Precision@10: {p:.4f}")
print(f"Recall@10:    {r:.4f}")
print(f"NDCG@10:      {n:.4f}")


100%|██████████| 28765/28765 [00:11<00:00, 2453.51it/s]


Test RMSE (User-Based CF): 0.8589
Precision@10: 0.8492
Recall@10:    0.2670
NDCG@10:      0.8652


The UBCF RMSE 0.8589 is Very good, it indicates the predicted ratings are close to actual ratings. It is lower than IBCF RMSE 0.967
Precision@10 at 0.8492	is Excellent because 84% of the top-10 recommendations are relevant (liked by the user).
Recall@10 of 0.2670	is decent because 26% of all relevant items are retrieved in the top-10. Low recall is common in sparse data.
NDCG@10	0.8652 is very strong, the relevant items are ranked high in the list, indicating great ordering of recommendations.

``````````````````````````````````````````````````````````````````````````````````````````````````````````````````````

Item-Based CF RMSE




In [68]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm

# ----------------------------
# Build raw user-item matrix
# ----------------------------
train_matrix_raw = train_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# ----------------------------
#  Compute item-item cosine similarity
# ----------------------------
def compute_similarity(matrix, kind='item'):
    if kind == 'item':
        sim = cosine_similarity(matrix.T.fillna(0))
        return pd.DataFrame(sim, index=matrix.columns, columns=matrix.columns)
    elif kind == 'user':
        sim = cosine_similarity(matrix.fillna(0))
        return pd.DataFrame(sim, index=matrix.index, columns=matrix.index)

item_sim_matrix = compute_similarity(train_matrix_raw, kind='item')

# ----------------------------
# Predict ratings using item-based CF
# ----------------------------
def predict_item_based(test_df, train_matrix, item_sim):
    preds = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        user, item = row['userId'], row['movieId']
        if item not in item_sim.index or user not in train_matrix.index:
            preds.append(np.nan)
            continue

        sims = item_sim[item]
        user_ratings = train_matrix.loc[user]
        mask = user_ratings > 0
        sims = sims[mask.index[mask]]
        user_ratings = user_ratings[mask]

        if len(sims) == 0:
            preds.append(np.nan)
        else:
            numerator = np.dot(sims.values, user_ratings.values)
            denominator = np.sum(np.abs(sims.values)) + 1e-8
            preds.append(numerator / denominator)

    result = test_df.copy()
    result['predicted'] = preds
    return result

item_preds = predict_item_based(test_ratings, train_matrix_raw, item_sim_matrix)

# ----------------------------
#  Clean predictions and clip to [1, 5]
# ----------------------------
item_preds['predicted'] = item_preds['predicted'].clip(1, 5)
item_preds_valid = item_preds.dropna()
item_preds_valid = item_preds_valid[item_preds_valid['predicted'] > 0]

# ----------------------------
#  RMSE
# ----------------------------
def compute_rmse_from_df(pred_df):
    return sqrt(mean_squared_error(pred_df['rating'], pred_df['predicted']))

rmse_item = compute_rmse_from_df(item_preds_valid)
print(f"Test RMSE (Item-Based CF): {rmse_item:.4f}")

# ----------------------------
# Evaluate Top-K Metrics (Precision@K, Recall@K, NDCG@K)
# ----------------------------
# Pivot predictions and truth to user-item matrices
pred_matrix = item_preds_valid.pivot(index='userId', columns='movieId', values='predicted')
truth_matrix = test_ratings.pivot(index='userId', columns='movieId', values='rating')

def precision_recall_ndcg_at_k(pred_df, truth_df, k=10, threshold=4.0):
    precisions, recalls, ndcgs = [], [], []

    for user_id in pred_df.index:
        if user_id not in truth_df.index:
            continue

        pred_ratings = pred_df.loc[user_id].dropna().sort_values(ascending=False)
        true_ratings = truth_df.loc[user_id]

        top_k_items = pred_ratings.head(k).index
        relevant_items = true_ratings[true_ratings >= threshold].index

        if len(relevant_items) == 0:
            continue

        hits = [1 if item in relevant_items else 0 for item in top_k_items]

        precision = np.sum(hits) / k
        recall = np.sum(hits) / len(relevant_items)
        dcg = np.sum([hit / np.log2(idx + 2) for idx, hit in enumerate(hits)])
        idcg = np.sum([1 / np.log2(i + 2) for i in range(min(len(relevant_items), k))])
        ndcg = dcg / idcg if idcg > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

# Evaluate
p, r, n = precision_recall_ndcg_at_k(pred_matrix, truth_matrix, k=10)
print(f"Precision@10: {p:.4f}")
print(f"Recall@10:    {r:.4f}")
print(f"NDCG@10:      {n:.4f}")


100%|██████████| 28765/28765 [00:12<00:00, 2397.05it/s]


Test RMSE (Item-Based CF): 0.9676
Precision@10: 0.7942
Recall@10:    0.2452
NDCG@10:      0.8013


```````````````````````````````````````````````````````````````````````````````

The IBCF RMSE 0.9676 is Very good, it indicates the predicted ratings are close to actual ratings (low error).
Precision@10 at 0.7942	is Excellent because 79% of the top-10 recommendations are relevant (liked by the user).
Recall@10 of 0.2452	is decent because 24% of all relevant items are retrieved in the top-10. Low recall is common in sparse data.
NDCG@10	0.8013 is very strong, the relevant items are ranked high in the list, indicating great ordering of recommendations.

COMPARISON BETWEEN UBCF AND IBCF
User-Based CF outperforms Item-Based CF across all metrics in this case.
UBCF has Lower RMSE hence more accurate predictions.

Higher Precision and NDCG meaning more relevant and better-ordered top recommendations.

Higher Recall that translates to more comprehensive retrieval of liked items.

```````````````````````````````````````````````````````````````````````````````````````````````````````````````````````

ALS

In [73]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql import Row

# ============================
# Initialize Spark Session
# ============================
spark = SparkSession.builder.master("local[*]").appName("ALSModel").getOrCreate()

# ============================
# Load ratings data as Spark DataFrame
# ============================
ratings_path = r"C:\Users\pricc\Downloads\ratings.dat"

# Since ratings.dat uses '::' as separator, read as text and split manually
ratings_raw = spark.read.text(ratings_path)
ratings_split = ratings_raw.selectExpr(
    "split(value, '::') as parts"
).select(
    col("parts").getItem(0).cast("int").alias("userId"),
    col("parts").getItem(1).cast("int").alias("movieId"),
    col("parts").getItem(2).cast("float").alias("rating"),
    col("parts").getItem(3).cast("long").alias("timestamp")
)

ratings_split.cache()

# ============================
# Train ALS model
# ============================
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=10,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",
    nonnegative=True
)

als_model = als.fit(ratings_split)

# ============================
# Evaluate model RMSE
# ============================
predictions = als_model.transform(ratings_split)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"ALS model RMSE: {rmse:.4f}")

# ============================
# Generate Top-50 Recommendations for User 5549
# ============================
target_user = 5549

# Get all movie IDs
all_movie_ids = ratings_split.select("movieId").distinct()

# Get movie IDs the user has rated
rated_movie_ids = ratings_split.filter(col("userId") == target_user).select("movieId").distinct()

# Get unrated movies for this user
unrated_movies = all_movie_ids.join(rated_movie_ids, on="movieId", how="left_anti")

# Create dataframe of (userId, movieId) pairs for prediction
user_unrated_pairs = unrated_movies.withColumn("userId", col("movieId") * 0 + target_user).select("userId", "movieId")

# Predict ratings for unrated movies
recommendations = als_model.transform(user_unrated_pairs).dropna()

# Select top 50 predictions
top_50_recs = recommendations.orderBy(col("prediction").desc()).limit(50)

# ============================
# Load movie titles
# ============================
movies_path = r"C:\Users\pricc\Downloads\movies_enriched_full.csv"

movies_df = spark.read.option("header", "true").csv(movies_path).select(
    col("movieId").cast("int"),
    col("title")
)

# ============================
# Join recommendations with movie titles
# ============================
top_50_with_titles = top_50_recs.join(movies_df, on="movieId", how="left")

# ============================
# Show Top 10 recommendations
# ============================
top_50_with_titles.select("movieId", "title", "prediction").show(10, truncate=False)

# ============================
# Stop Spark Session (optional)
# ============================
# spark.stop()


ALS model RMSE: 0.8363
+-------+----------------------------+----------+
|movieId|title                       |prediction|
+-------+----------------------------+----------+
|2129   |Saltmen of Tibet, The (1997)|5.052246  |
|572    |Foreign Student (1994)      |4.9640427 |
|2197   |Firelight (1997)            |4.8641205 |
|3382   |Song of Freedom (1936)      |4.8395925 |
|1471   |Boys Life 2 (1997)          |4.6993504 |
|811    |Bewegte Mann, Der (1994)    |4.578691  |
|3365   |Searchers, The (1956)       |4.3095155 |
|912    |Casablanca (1942)           |4.302957  |
|3849   |Spiral Staircase, The (1946)|4.294216  |
|3469   |Inherit the Wind (1960)     |4.2933974 |
+-------+----------------------------+----------+
only showing top 10 rows



ALS has the lowest RMSE, meaning it's the most accurate model for predicting ratings.

`````````````````````````````````````````````````````````````````````````````````````````````````````````````````````

 Diversity, Novelty, Serendipity

In [72]:
import pandas as pd
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from scipy.sparse import csr_matrix 
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Rating matrix: rows are users, columns are movies

ratings_matrix = user_item_matrix.copy()  # already centered or filled
ratings_filled = ratings_matrix.fillna(0).to_numpy()

# Apply SVD
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(ratings_filled)
item_factors = svd.components_.T  # shape: (num_movies, 20)

# Choose a user to evaluate
user_idx = 50
user_vector = user_factors[user_idx]

# Score all movies for this user
scores = np.dot(item_factors, user_vector)

# Get top N recommended movies
top_n = 10
top_items = np.argsort(-scores)[:top_n]

# --- Novelty: penalize popular movies ---
movie_popularity = np.sum(ratings_filled > 0, axis=0)
movie_popularity = movie_popularity / movie_popularity.max()
novelty = np.mean([1 - movie_popularity[i] for i in top_items])

# --- Diversity: average dissimilarity among recommendations ---
item_vecs = item_factors[top_items]
sim_matrix = cosine_similarity(item_vecs)
upper_triangle = sim_matrix[np.triu_indices(len(top_items), k=1)]
diversity = 1 - np.mean(upper_triangle)

# --- Serendipity: dissimilar from previously liked movies ---
user_ratings = ratings_filled[user_idx]
liked_items = np.where(user_ratings >= 4)[0]
liked_vecs = item_factors[liked_items]

serendipity_scores = []
for i in top_items:
    rec_vec = item_factors[i].reshape(1, -1)
    if liked_vecs.shape[0] > 0:
        sim = cosine_similarity(rec_vec, liked_vecs)
        serendipity_scores.append(1 - np.mean(sim))

serendipity = np.mean(serendipity_scores)

# --- Output ---
print(f"Top recommended movie indices for user {user_idx}: {top_items}")
print(f"Novelty: {novelty:.4f}")
print(f"Diversity: {diversity:.4f}")



Top recommended movie indices for user 50: [   0 2898  309   33  581  574  513  346 2162 2557]
Novelty: 0.4773
Diversity: 0.6112
Serendipity: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


````````````````````````````````````````````````````````````````````````````````````````````````````````````````
Novelty,Diversity,serendipity

In this first approach, serendipity was just computed by penalizing all high-similarity items,even if some surprise could’ve been good. All recommendations were similar to what the user already liked, I got 0 serendipity. np.mean(similarity_to_liked_items)

Novelty of 0.3871 for the top-10 recommendations for each user include items that aren’t too popular.

Since the score is close to 0.4, it means not just  blockbusters were recommended, but includes moderately obscure items.

A perfect novelty (closer to 1.0) would mean mostly unknown or unrated items.

A Diversity of 0.7006 is strong diversity, the recommended items for each user are not overly similar to each other.

Diversity above 0.7 is generally considered good, especially for top-10 lists.

It means users are seeing a range of genres, styles, or content types rather than a narrow cluster of similar items.

In [76]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # optional: for progress bar

# Prepare matrix: users x movies (fill NaN with 0 for SVD)
ratings_filled = user_item_matrix.fillna(0).to_numpy()
user_ids = user_item_matrix.index
movie_ids = user_item_matrix.columns.to_numpy()

# Train SVD model
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(ratings_filled)
item_factors = svd.components_.T  # shape: (num_movies, 20)

# Popularity vector: how often each movie is rated
movie_popularity = np.sum(ratings_filled > 0, axis=0)
movie_popularity = movie_popularity / movie_popularity.max()

# Initialize metric lists
novelties = []
diversities = []
serendipities = []

top_n = 10

for user_idx in tqdm(range(len(user_ids)), desc="Evaluating users"):
    user_vector = user_factors[user_idx]
    scores = np.dot(item_factors, user_vector)
    top_items = np.argsort(-scores)[:top_n]

    # Novelty
    novelty = np.mean([1 - movie_popularity[i] for i in top_items])
    novelties.append(novelty)

    # Diversity
    item_vecs = item_factors[top_items]
    sim_matrix = cosine_similarity(item_vecs)
    upper_triangle = sim_matrix[np.triu_indices(top_n, k=1)]
    diversity = 1 - np.mean(upper_triangle)
    diversities.append(diversity)

    # Serendipity
    liked_items = np.where(ratings_filled[user_idx] >= 4)[0]
    liked_vecs = item_factors[liked_items]

    ser_scores = []
    for i in top_items:
        rec_vec = item_factors[i].reshape(1, -1)
        if liked_vecs.shape[0] > 0:
            sim = cosine_similarity(rec_vec, liked_vecs)
            ser_scores.append(1 - np.mean(sim))
    serendipity = np.mean(ser_scores) if ser_scores else 0
    serendipities.append(serendipity)

# Final average metrics
avg_novelty = np.mean(novelties)
avg_diversity = np.mean(diversities)
avg_serendipity = np.mean(serendipities)

print(f"\nAverage over all users:")
print(f"Novelty: {avg_novelty:.4f}")
print(f"Diversity: {avg_diversity:.4f}")
print(f"Serendipity: {avg_serendipity:.4f}")


Evaluating users: 100%|██████████| 6040/6040 [00:07<00:00, 772.65it/s] 


Average over all users:
Novelty: 0.3871
Diversity: 0.7006
Serendipity: 0.0000





For all users serendipity metric is zero (or very low), it means the recommended items are too similar to what the users already liked, so there's little surprise or novelty in the recommendations relative to the user's usual preferences.

Increase serendipity
Serendipity was increased in this version, serendipity is based on low similarity to liked items, which reflects the "pleasant surprise" quality. low_sim = sim[sim < 0.7] 

In [77]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Assuming user_item_matrix is your DataFrame with users as index and movies as columns

# Fill NaNs with zeros for SVD input
ratings_filled = user_item_matrix.fillna(0).to_numpy()
user_ids = user_item_matrix.index
movie_ids = user_item_matrix.columns.to_numpy()

# Train SVD
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(ratings_filled)  # shape: (num_users, 20)
item_factors = svd.components_.T  # shape: (num_movies, 20)

top_n = 10
serendipities = []

for user_idx in tqdm(range(len(user_ids)), desc="Evaluating users"):
    user_vector = user_factors[user_idx]
    scores = np.dot(item_factors, user_vector)

    liked_items = np.where(ratings_filled[user_idx] >= 4)[0]
    all_items = np.arange(len(movie_ids))
    candidate_items = np.setdiff1d(all_items, liked_items)

    candidate_scores = scores[candidate_items]
    top_candidate_idx = np.argsort(-candidate_scores)[:top_n]
    top_items = candidate_items[top_candidate_idx]

    liked_vecs = item_factors[liked_items]

    ser_scores = []
    for i in top_items:
        rec_vec = item_factors[i].reshape(1, -1)
        if liked_vecs.shape[0] > 0:
            sim = cosine_similarity(rec_vec, liked_vecs).flatten()
            # Consider only low similarity to liked items (less than 0.7)
            low_sim = sim[sim < 0.7]
            if len(low_sim) > 0:
                ser_scores.append(1 - np.mean(low_sim))
            else:
                ser_scores.append(0)
        else:
            ser_scores.append(0.5)  # no liked items, neutral score

    serendipity = np.mean(ser_scores) if ser_scores else 0
    serendipities.append(serendipity)

avg_serendipity = np.mean(serendipities)
print(f"Avg Serendipity (updated): {avg_serendipity:.4f}")


Evaluating users: 100%|██████████| 500/500 [00:00<00:00, 5506.63it/s]

Avg Serendipity (updated): 0.5000





In [78]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Assume user_item_matrix is your DataFrame: users as index, movies as columns with ratings

# Prepare filled ratings matrix for SVD
ratings_filled = user_item_matrix.fillna(0).to_numpy()
user_ids = user_item_matrix.index.to_list()
movie_ids = user_item_matrix.columns.to_numpy()

# Train SVD model
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(ratings_filled)
item_factors = svd.components_.T

# Get index of user 48 in the matrix
try:
    user_idx = user_ids.index(48)
except ValueError:
    raise ValueError("User 48 not found in user_item_matrix index")

# Compute scores for all items for user 48
user_vector = user_factors[user_idx]
scores = np.dot(item_factors, user_vector)

# Identify items user 48 has liked (rating >= 4)
liked_items = np.where(ratings_filled[user_idx] >= 4)[0]

# Candidate items: items user 48 hasn't rated or rated less than 4
all_items = np.arange(len(movie_ids))
candidate_items = np.setdiff1d(all_items, liked_items)

# Get scores for candidate items
candidate_scores = scores[candidate_items]

top_n = 10
top_candidate_idx = np.argsort(-candidate_scores)[:top_n]
top_items = candidate_items[top_candidate_idx]

# Calculate serendipity: how dissimilar recommended items are to liked items
liked_vecs = item_factors[liked_items]

ser_scores = []
for i in top_items:
    rec_vec = item_factors[i].reshape(1, -1)
    if liked_vecs.shape[0] > 0:
        sim = cosine_similarity(rec_vec, liked_vecs).flatten()
        low_sim = sim[sim < 0.7]  # threshold for “surprise”
        if len(low_sim) > 0:
            ser_scores.append(1 - np.mean(low_sim))
        else:
            ser_scores.append(0)
    else:
        ser_scores.append(0.5)  # no liked items, assign neutral serendipity

serendipity_user_48 = np.mean(ser_scores) if ser_scores else 0

print(f"Serendipity for User 48 (top {top_n} recommendations): {serendipity_user_48:.4f}")


Serendipity for User 48 (top 10 recommendations): 0.5000


NOTES

TDIF SIMILARITY

 TF-IDF similarity results for movies similar to Toy Story.
 TF-IDF (Term Frequency–Inverse Document Frequency) compareD the textual content (plot summaries, keywords, etc.) of movies Similar to Toy Story. Based on this:

Small Soldiers is the top match at 39%. It features toys coming to life, much like Toy Story, but with a military/sci-fi twist.

Toy Story 2 being similar makes perfect sense at 35%. Tt's a direct sequel with near-identical characters and themes.

The Indian in the Cupboard at 31%, also shares the theme of toys/figures coming to life and interacting with a child.

Toys and Babes in Toyland at 23% carries themes centered around toy worlds or imaginative play, aligning them conceptually.

COUNTVECTORIZER

CountVectorizer similarity results for movies similar to Toy Story.

CountVectorizer converts text (e.g., movie plot summaries) into a matrix of token counts.

It builds a bag-of-words representation, capturing how often each word appears, but ignores word order and semantic meaning.

Similarity is then computed using cosine similarity between these count vectors.

According to  CountVectorizer Recommendations the similarity of other movies to Toy Story is very low. The highest recommended movie is Toy story 2 that is 45% similar, Followed by Small soldiers at 38%. CountVectorizer returns a lower similarity at 28% compared to TD-IDF that recommended the same movie at 31%. 

COMPARISON BETWEEN TD-IDF and CountVectorizer


CountVectorizer treats plots as just word counts, ignoring word order or semantics.

CountVectorizer scores don’t mean thematic or genre similarity,only textual overlap in description/metadata.

According to CountVectorizer Toy Story is weakly similar to other 1995 films based.

TF-IDF gives more refined similarity (as you saw earlier), while CountVectorizer tends to inflate common word matches.

Using TF-IDF + cosine offers deeper semantic similarity.

Both have 3 movies in common as highly recommended.

COMPARISON BETWEEN User based CF recommendations and Item-Based CF Recommendations

The User based CF recommendation score is the predicted rating for an unseen movie, based on users similar to User 48 ratings, adjusted to the target user’s scale. Weighted average of normalized ratings from similar users was re-centered to user. UBCF recommended movies with Higher scores that User 48 is more strongly predicted to like. How i learned to stop to worry was most highly rated at 3.8 score.

The Top 50 Item-Based CF Recommendations for User 48 score above is the predicted rating for an unseen movie, based on how similar it is to the movies the user 48 has already rated. Weighted average of user’s ratings for similar items was used. Compared to UBCF, Item based recommended movies have Lower scores Midnight Cowboy has a predicated rating of 3.4 meaning User 48 is predicted to like Item based recommendations less..

COMPARISON BETWEEN User based CF RMSE and Item-Based CF RMSE

The UBCF RMSE 0.8589 is Very good, it indicates the predicted ratings are close to actual ratings. It is lower than IBCF RMSE 0.967
Precision@10 at 0.8492	is Excellent because 84% of the top-10 recommendations are relevant (liked by the user).
Recall@10 of 0.2670	is decent because 26% of all relevant items are retrieved in the top-10. Low recall is common in sparse data.
NDCG@10	0.8652 is very strong, the relevant items are ranked high in the list, indicating great ordering of recommendations.

The IBCF RMSE 0.9676 is Very good, it indicates the predicted ratings are close to actual ratings (low error).
Precision@10 at 0.7942	is Excellent because 79% of the top-10 recommendations are relevant (liked by the user).
Recall@10 of 0.2452	is decent because 24% of all relevant items are retrieved in the top-10. Low recall is common in sparse data.
NDCG@10	0.8013 is very strong, the relevant items are ranked high in the list, indicating great ordering of recommendations.


COMPARISON BETWEEN UBCF AND IBCF METRICS
User-Based CF outperforms Item-Based CF across all metrics in this case.
UBCF has Lower RMSE hence more accurate predictions.

Higher Precision and NDCG meaning more relevant and better-ordered top recommendations.

Higher Recall that translates to more comprehensive retrieval of liked items.

ALS has the lowest RMSE of 0.8363, meaning it's the most accurate model for predicting ratings.




DIVERSITY, NOVELTY, SERENDIPITY

For user 50 Metrics
Novelty	0.477 is higher than CF models – ALS recommends less popular items
Diversity of 0.6112 Indicates decent dissimilarity among top items
Serendipity (increased) 0.50: Indicates Many recommended items are somewhat different from what the user already liked. 


Serendipity was increased because it was originally 0, serendipity is based on low similarity to liked items, which reflects the "pleasant surprise" quality. low_sim = sim[sim < 0.7]. 

A 0.5 Serendipity is a moderate serendipity score, indicating that:

REcommendations are striking a balance between relevance and surprise.

It’s higher than 0, which means ( adding a similarity threshold like < 0.7) helped introduce unexpected but still relevant items.


In this first approach, serendipity was just computed by penalizing all high-similarity items,even if some surprise could’ve been good. All recommendations were similar to what the user already liked, I got 0 serendipity. np.mean(similarity_to_liked_items)

Novelty of 0.3871 for the top-10 recommendations for each user include items that aren’t too popular.

Since the score is close to 0.4, it means not just  blockbusters were recommended, but includes moderately obscure items.

A perfect novelty (closer to 1.0) would mean mostly unknown or unrated items.

A Diversity of 0.7006 is strong diversity, the recommended items for each user are not overly similar to each other.

Diversity above 0.7 is generally considered good, especially for top-10 lists.

It means users are seeing a range of genres, styles, or content types rather than a narrow cluster of similar items.