In [1]:
#Name:    Shubham Srivastava 
#Email:   ss253@illinois.edu
#UIN:     678342883
#Program: MCS-DS
#Team:    Individual

## System I: Recommendation Based on Popularity
"Popularity" is an open-ended term that can be defined with varying levels of complexity. A simple approach would be to pick the top movies by the highest average rating. However, this can be misleading if a movie has very few ratings, as a single high rating could push it to the top. On the other hand, selecting movies solely by the number of ratings might bias the list toward older, more widely seen titles, even if their quality is average.

Another way is to combine both the number of ratings and the average rating. For example, we can take the following steps:

<ol>
<li>Set a minimum number of ratings threshold: Only consider movies that have been rated by at least N users (e.g., 30% of total users in the dataset). This ensures that the movie’s rating is robust and not driven by a small, potentially unrepresentative sample of users.</li>
<li>Use the average rating to determine popularity among those widely rated movies: Among the movies that meet the threshold, rank them by their average rating in descending order.</li>
</ol>

This approach ensures that our recommended "most popular" movies are both widely viewed and highly rated, striking a balance between quantity (broad appeal) and quality (high rating).

In [2]:
from html import escape

import numpy as np
import pandas as pd
from IPython.display import HTML, display
from tqdm import tqdm

# ============================================================
# System I: Recommendation Based on Popularity
# ============================================================
# Ranks movies by average rating among those rated by at least 30% of the
# users, and persists the ranking to popular_movies.csv.

print("=== System I: Recommendation Based on Popularity ===")
print("Reading ratings and movies data...")

# The .dat files use the multi-character separator '::', which needs the
# Python parser engine; column names are supplied directly because the files
# have no header row.
ratings = pd.read_csv(
    'ml-1m/ratings.dat', sep='::', engine='python', header=None,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
)
movies = pd.read_csv(
    'ml-1m/movies.dat', sep='::', engine='python', encoding='ISO-8859-1', header=None,
    names=['MovieID', 'Title', 'Genres'],
)

print("Computing rating counts and average rating per movie...")
movie_stats = ratings.groupby('MovieID').agg(
    count_ratings=('Rating', 'count'),
    avg_rating=('Rating', 'mean')
).reset_index()

print("Defining popularity threshold: Movies viewed by at least 30% of total users.")
# Count the users actually present in the ratings instead of hard-coding the
# number, so the 30% rule stays correct if the input file changes.
total_users = ratings['UserID'].nunique()
rating_threshold = 0.3 * total_users

popular_movies = movie_stats[movie_stats['count_ratings'] >= rating_threshold]
print("Sorting by average rating in descending order...")
popular_movies = popular_movies.sort_values('avg_rating', ascending=False)

print("Merging with movie titles...")
# Left join keeps every ranked movie even if it has no metadata row.
popular_movies = popular_movies.merge(movies, on='MovieID', how='left')

print("Saving popularity ranking to CSV (popular_movies.csv) so we can avoid recomputing...")
popular_movies.to_csv("popular_movies.csv", index=False)

# Keep the ten best-ranked rows of the popularity table
print("Selecting top 10 movies based on defined popularity criteria...")

# assign() builds a new frame, so popular_movies itself is never modified.
# ImageURL points at the poster image for the movie; FormattedID is the
# "m<MovieID>" label shown on each card.
top_10 = popular_movies.head(10).assign(
    ImageURL=lambda frame: frame['MovieID'].map(
        lambda movie_id: f"https://liangfgithub.github.io/MovieImages/{movie_id}.jpg"
    ),
    FormattedID=lambda frame: frame['MovieID'].map(lambda movie_id: f"m{movie_id}"),
)

# Render the top-10 table as a grid of HTML cards (poster, title, ID, stars).
print("Displaying the top 10 popular movies with their images...")
html = """
<style>
.movie-grid {
    display: flex;
    flex-wrap: wrap;
    gap: 20px;
    margin: 20px 0;
}
.movie-card {
    border: 1px solid #ccc;
    padding: 10px;
    width: 180px;
    text-align: center;
    font-family: sans-serif;
    border-radius: 5px;
}
.movie-card img {
    max-width: 100%;
    border-radius: 5px;
}
.movie-card-title {
    font-weight: bold;
    margin: 10px 0 5px;
    font-size: 14px;
}
.movie-card-id {
    color: #555;
    font-size: 12px;
    margin-bottom: 5px;
}
.rating-container {
    font-size: 0;
    display: inline-block;
    margin: 5px 0;
}
.stars-outer {
  display: inline-block;
  position: relative;
  font-family: Arial, sans-serif;
}
.stars-outer::before {
  content: "★★★★★";
  font-size: 16px;
  color: #ccc;
}
.stars-inner {
  position: absolute;
  top: 0;
  left: 0;
  white-space: nowrap;
  overflow: hidden;
  color: #f39c12;
  font-size: 16px;
}
.stars-inner::before {
  content: "★★★★★";
}
.avg-rating-text {
    font-size: 12px;
    margin-left: 5px;
    vertical-align: middle;
    color: #333;
}
</style>
<div class="movie-grid">
"""

for _, row in top_10.iterrows():
    # Titles come straight from the data file; escape them so characters such
    # as &, <, > or quotes cannot break the markup or the alt attribute.
    safe_title = escape(str(row['Title']))
    # Width of the filled-star overlay, as a percentage of the 5-star scale.
    star_percentage = (row['avg_rating'] / 5.0) * 100
    html += f"""
    <div class="movie-card">
        <img src="{row['ImageURL']}" alt="{safe_title} poster">
        <div class="movie-card-title">{safe_title}</div>
        <div class="movie-card-id">{row['FormattedID']}</div>
        <div class="rating-container">
            <div class="stars-outer">
                <div class="stars-inner" style="width: {star_percentage}%"></div>
            </div>
            <span class="avg-rating-text">{row['avg_rating']:.1f}/5</span>
        </div>
    </div>
    """

html += "</div>"
display(HTML(html))

=== System I: Recommendation Based on Popularity ===
Reading ratings and movies data...
Computing rating counts and average rating per movie...
Defining popularity threshold: Movies viewed by at least 30% of total users.
Sorting by average rating in descending order...
Merging with movie titles...
Saving popularity ranking to CSV (popular_movies.csv) so we can avoid recomputing...
Selecting top 10 movies based on defined popularity criteria...
Displaying the top 10 popular movies with their images...


In [3]:
# Build a 100-movie popularity list by progressively relaxing the
# minimum-number-of-ratings threshold until at least 100 movies qualify,
# then keep the 100 with the highest average rating.
# `popular_movies` (the sorted, title-merged ranking) is deliberately left
# untouched by this cell.

# Number of distinct users in the ratings data; thresholds are fractions of it.
total_users = ratings['UserID'].nunique()
thresholds = [0.30, 0.20, 0.10, 0.05, 0.0]  # progressively relax thresholds

# Only movies that have a metadata row can be recommended.
eligible_movies = movie_stats[movie_stats['MovieID'].isin(movies['MovieID'])]

tier_frames = []      # one frame per threshold tier, concatenated once at the end
selected_ids = set()  # MovieIDs already picked by a stricter tier

for t in thresholds:
    coverage_threshold = t * total_users
    # Movies meeting this tier's threshold that were not selected earlier
    candidate_movies = eligible_movies[
        ~eligible_movies['MovieID'].isin(selected_ids)
        & (eligible_movies['count_ratings'] >= coverage_threshold)
    ]
    candidate_movies = candidate_movies.sort_values('avg_rating', ascending=False)
    candidate_movies = candidate_movies.merge(movies, on='MovieID', how='left')

    tier_frames.append(candidate_movies)
    selected_ids.update(candidate_movies['MovieID'])

    if sum(len(frame) for frame in tier_frames) >= 100:
        # Enough movies collected; no need to relax the threshold further
        break

# Rank everything collected by average rating and keep at most 100 rows.
# Doing this after the loop guarantees a ranked result even when fewer than
# 100 movies were collected.
selected_movies = pd.concat(tier_frames, ignore_index=True)
selected_movies = selected_movies.sort_values('avg_rating', ascending=False).head(100)
selected_movies.to_csv("top_100_movies.csv", index=False)

### System II: Recommendation Based on IBCF

In [None]:
# ============================================================
# System II: Recommendation Based on IBCF
# ============================================================
# Step 1 loads the user-by-movie rating matrix and centres every user's
# ratings around that user's own mean.

print("\n=== System II: Recommendation Based on Item-Based Collaborative Filtering (IBCF) ===")

print("Step 1: Reading the raw 6040-by-3706 rating matrix R from CSV...")
# Any literal 'NA' strings are turned into NaN so that missing ratings are
# ignored by the arithmetic below.
df = pd.read_csv(
    "I-w9Wo-HSzmUGNNHw0pCzg_bc290b0e6b3a45c19f62b1b82b1699f1_Rmat.csv",
    header=0,
    index_col=0,
).replace('NA', np.nan)

print("Step 1: Normalizing the rating matrix by centering each row (subtracting row means)...")
# Row means skip NaN entries, so each user is centred on the movies they rated.
row_means = df.mean(axis=1)
df_centered = df.subtract(row_means, axis='index')
print("Displaying a portion of the row-centered rating matrix:")
print(df_centered.head())

# Plain NumPy view of the centred matrix for the pairwise similarity loop
R = df_centered.to_numpy()
num_users, num_movies = R.shape
print(f"Number of users: {num_users}, Number of movies: {num_movies}")

# Step 2: pairwise transformed cosine similarity between movie columns of R.
# Entries stay NaN for the diagonal, for pairs with two or fewer common
# raters, and for pairs whose cosine denominator is zero.
print("Step 2: Computing the (transformed) Cosine similarity among the 3706 movies...")
print("We only consider pairs of movies that have more than two common raters (|Iij| > 2).")
print("Similarity formula: S_ij = 0.5 + 0.5 * cos_ij, ensuring similarities are between [0,1].")
S = np.full((num_movies, num_movies), np.nan)

# The "has a rating" mask depends only on R, so build it once instead of
# recomputing np.isnan for both columns of every pair.
rated = ~np.isnan(R)

for i in tqdm(range(num_movies)):
    rated_i = rated[:, i]
    col_i = R[:, i]
    for j in range(i+1, num_movies):
        # Users who rated both movie i and movie j
        common = rated_i & rated[:, j]

        if np.count_nonzero(common) > 2:
            Ri = col_i[common]
            Rj = R[common, j]
            numerator = np.sum(Ri * Rj)
            denominator = np.sqrt(np.sum(Ri**2)) * np.sqrt(np.sum(Rj**2))
            if denominator != 0:
                cos_ij = numerator / denominator
                # Map cosine from [-1, 1] to [0, 1]
                S_ij = 0.5 + 0.5 * cos_ij
                # The measure is symmetric, so fill both triangles
                S[i, j] = S_ij
                S[j, i] = S_ij

S_df = pd.DataFrame(S, index=df_centered.columns, columns=df_centered.columns)
print("Displaying the first 5 rows of the similarity matrix S:")
print(S_df.head())

# Show the pairwise similarity values from the full S matrix for a fixed set
# of movies, rounded to 7 decimal places.
specified_movies = ["m1", "m10", "m100", "m1510", "m260", "m3212"]

print("\nDisplaying pairwise similarity values from the FULL S matrix for:")
print(specified_movies)

# Warn about IDs that are missing from S_df and leave them out of the lookup,
# so a missing ID produces a warning rather than a KeyError.
available_movies = []
for movie in specified_movies:
    if movie in S_df.index:
        available_movies.append(movie)
    else:
        print(f"Warning: {movie} not found in the full S_df matrix.")

# Square sub-matrix restricted to the movies that are present
subset_full = S_df.loc[available_movies, available_movies]

# Round to 7 decimal places
subset_full_rounded = subset_full.round(7)

print("Pairwise similarity values (rounded to 7 decimal places) from the FULL S matrix:")
print(subset_full_rounded)

def _keep_top_k_per_row(sim_df, k, desc):
    """Keep only the k largest non-NaN values in every row of ``sim_df``.

    All other entries of the row are set to NaN. The frame is modified in
    place and also returned for convenience.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        Similarity matrix to prune.
    k : int
        Number of largest similarities to keep per row.
    desc : str
        Label shown on the tqdm progress bar.
    """
    for row_pos in tqdm(range(len(sim_df)), desc=desc):
        ranked = sim_df.iloc[row_pos, :].dropna().sort_values(ascending=False)
        kept = ranked.iloc[:k]
        # Start from an all-NaN row and copy back only the retained values
        pruned = pd.Series(np.nan, index=sim_df.columns)
        pruned[kept.index] = kept
        sim_df.iloc[row_pos, :] = pruned
    return sim_df


print("Now we will keep the top 100 non-NA similarities per movie for use in application.")
_keep_top_k_per_row(S_df, 100, desc="Processing rows for top-100")

print("Saving the top 100 similarity matrix to S_top100.csv")
S_df.to_csv("S_top100.csv")

print("From S_top100.csv, we will now create a top-30 similarity matrix for testing myIBCF function.")
# NOTE: after pruning, S_100 holds the top-30 matrix (the name is kept because
# later code reads S_100.columns).
S_100 = pd.read_csv("S_top100.csv", index_col=0)
_keep_top_k_per_row(S_100, 30, desc="Processing rows for top-30")

print("Saving the top 30 similarity matrix to S_top30.csv")
S_100.to_csv("S_top30.csv")

print("\nWe will use S_top30.csv for the myIBCF function.")

print("Loading rating matrix R for testing myIBCF...")
# From here on R is the raw (un-centred) rating table as a float DataFrame,
# replacing the NumPy array of the same name used for the similarity loop.
R = (
    pd.read_csv("I-w9Wo-HSzmUGNNHw0pCzg_bc290b0e6b3a45c19f62b1b82b1699f1_Rmat.csv", index_col=0)
    .replace('NA', np.nan)
    .astype(float)
)

print("Loading the popularity ranking for fallback recommendations...")
popularity_df = pd.read_csv("popular_movies.csv")
# popular_movies.csv is already ordered by popularity, so the row order is the
# ranking. Add the "m<MovieID>" label and write that single column to
# popularity_ranking.csv, which myIBCF reads by default.
popularity_df = popularity_df.assign(
    movieID=popularity_df['MovieID'].map(lambda movie_id: f"m{movie_id}")
)
popularity_df.to_csv("popularity_ranking.csv", columns=['movieID'], index=False)

# Definition of the myIBCF function follows
print("\nDefining myIBCF function:")
def myIBCF(newuser,
           similarity_file="S_top30.csv",
           popularity_file="popularity_ranking.csv",
           top_n=10):
    """Recommend movies for one user with item-based collaborative filtering.

    For every movie the user has not rated, the predicted rating is the
    similarity-weighted average of the user's ratings over that movie's
    neighbours (the non-NaN entries of its row in the similarity matrix).
    Movies without any rated neighbour get no prediction. If fewer than
    ``top_n`` predictions exist, the list is filled from the popularity
    ranking, skipping movies the user rated or that are already recommended.

    Parameters
    ----------
    newuser : pandas.Series or array-like
        Ratings indexed by movie ID ("mX"), NaN where unrated. A non-Series
        input is wrapped in a Series indexed by the similarity matrix columns.
    similarity_file : str
        CSV with the movie-by-movie similarity matrix (first column = index).
    popularity_file : str
        CSV with a ``movieID`` column ordered from most to least popular.
    top_n : int
        Number of recommendations to return.

    Returns
    -------
    (pandas.Series, list)
        All available predictions sorted in descending order, and the list of
        recommended movie IDs (at most ``top_n``).
    """
    print("\n--- myIBCF Function Execution ---")
    print("Loading similarity matrix from:", similarity_file)
    S = pd.read_csv(similarity_file, index_col=0)

    if not isinstance(newuser, pd.Series):
        newuser = pd.Series(newuser, index=S.columns)

    print("Loading popularity ranking from:", popularity_file)
    popularity_df = pd.read_csv(popularity_file)
    popular_movies = popularity_df['movieID'].tolist()

    rated_mask = ~newuser.isna()
    rated_movies = newuser.index[rated_mask]
    unrated_movies = newuser.index[~rated_mask]
    predictions = pd.Series(np.nan, index=unrated_movies)

    print("Computing predictions for unrated movies...")
    for movie_i in unrated_movies:
        # A movie with no row in the similarity matrix cannot be predicted;
        # skip it instead of failing the lookup with a KeyError.
        if movie_i not in S.index:
            continue
        sims = S.loc[movie_i, :]
        neighbors = sims.dropna().index
        rated_neighbors = [m for m in neighbors if m in rated_movies]
        if len(rated_neighbors) == 0:
            continue
        s_ij = sims.loc[rated_neighbors].values
        w_j = newuser.loc[rated_neighbors].values
        denom = np.sum(s_ij)
        if denom == 0:
            # All neighbour similarities are zero: the weighted mean is undefined
            continue
        predictions[movie_i] = np.sum(s_ij * w_j) / denom

    print("Sorting predictions in descending order...")
    predictions_sorted = predictions.dropna().sort_values(ascending=False)

    num_predictions = len(predictions_sorted)
    if num_predictions < top_n:
        # Fill with popularity
        recommended_movies = list(predictions_sorted.index)
        already_considered = set(rated_movies).union(recommended_movies)
        print(f"Fewer than {top_n} predictions available. Using popularity fallback to fill the remainder.")
        for pm in popular_movies:
            if pm not in already_considered:
                recommended_movies.append(pm)
                # Remember the pick so a duplicated ID in the file is not added twice
                already_considered.add(pm)
                if len(recommended_movies) == top_n:
                    break
    else:
        recommended_movies = list(predictions_sorted.index[:top_n])

    print("myIBCF completed.")
    print(f"Top {top_n} recommended movies are:")
    print(recommended_movies)
    return predictions_sorted, recommended_movies

# ============================================================
# Testing myIBCF
# ============================================================
# Two demo runs: an existing user from the rating table, then a hypothetical
# user with exactly two ratings.

print("\n=== Testing myIBCF with user 'u1181' ===")
# Ratings row of user u1181 from the rating table R
w_u1181 = R.loc["u1181", :]
top_movies_u1181, recommendations_u1181 = myIBCF(w_u1181)

print("Top predicted ratings for user u1181 (showing top 10):")
print(top_movies_u1181.head(10))
print("\nTop 10 recommendations for user u1181:")
print(recommendations_u1181)

print("\n=== Testing myIBCF with hypothetical user who rated m1613=5 and m1755=4 ===")
# Start from an all-unrated profile over the similarity-matrix columns and
# fill in the two known ratings.
w_hyp = pd.Series(np.nan, index=S_100.columns)
w_hyp.loc[["m1613", "m1755"]] = [5, 4]
top_movies_hyp, recommendations_hyp = myIBCF(w_hyp)

print("Top predicted ratings for hypothetical user (m1613=5, m1755=4) (showing top 10):")
print(top_movies_hyp.head(10))
print("\nTop 10 recommendations for hypothetical user:")
print(recommendations_hyp)


=== System II: Recommendation Based on Item-Based Collaborative Filtering (IBCF) ===
Step 1: Reading the raw 6040-by-3706 rating matrix R from CSV...
Step 1: Normalizing the rating matrix by centering each row (subtracting row means)...
Displaying a portion of the row-centered rating matrix:
             m1  m10  m100  m1000  m1002  m1003  m1004  m1005  m1006  m1007  \
u1     0.811321  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
u10    0.885287  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
u100        NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
u1000  0.869048  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
u1001  0.347480  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   

       ...  m99  m990  m991  m992  m993  m994  m996  m997  m998  m999  
u1     ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
u10    ...  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
u100   ...  NaN

 13%|██████████▍                                                                    | 489/3706 [00:35<03:30, 15.25it/s]