# Movie Recommender System Report Reproduction

In [2]:
import pandas as pd
import numpy as np
import os

# Constants
# Minimum number of users who must have rated both movies of a pair before
# compute_similarity() stores a similarity value for that pair.
MIN_COMMON_RATINGS = 3
# Number of largest similarities compute_similarity() keeps in each row of the
# similarity matrix; the remaining entries of the row are reset to NaN.
TOP_K = 30

## Data Loading

In [7]:
def load_data(data_path=''):
    """Read the three ``::``-separated data files and build the rating matrix.

    Parameters
    ----------
    data_path : str
        Directory containing ``movies.dat``, ``ratings.dat`` and
        ``users.dat``.  The default ``''`` resolves to the current working
        directory.

    Returns
    -------
    movies : pandas.DataFrame
        Columns ``MovieID``, ``Title``, ``Genres``.
    users : pandas.DataFrame
        Columns ``UserID``, ``Gender``, ``Age``, ``Occupation``, ``Zip-code``.
    R_df : pandas.DataFrame
        Float matrix with one row per user ID (``u1`` .. ``u<max UserID>``)
        and one column per movie ID (``m1`` .. ``m<max MovieID>``), where the
        maxima are taken from ``ratings.dat``.  Cells without a rating are NaN.
    """
    def read_dat(filename, columns):
        # All three files share the same layout: no header row, '::' as the
        # separator (multi-character, hence the python engine) and latin-1 text.
        return pd.read_csv(
            os.path.join(data_path, filename),
            sep='::',
            header=None,
            names=columns,
            engine='python',
            encoding='latin-1',
        )

    movies = read_dat('movies.dat', ['MovieID', 'Title', 'Genres'])
    ratings = read_dat('ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])
    users = read_dat('users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

    # IDs are 1-based, so the largest ID seen in the ratings is also the
    # number of rows / columns needed.
    num_users = int(ratings['UserID'].max())
    num_movies = int(ratings['MovieID'].max())

    R = np.full((num_users, num_movies), np.nan)

    # Fill the matrix in one vectorised assignment instead of a Python loop
    # over every rating.  NumPy does not define which value wins when an index
    # pair repeats, so duplicates are resolved first: the last row of the file
    # wins, exactly as it would with sequential assignment.
    latest = ratings.drop_duplicates(subset=['UserID', 'MovieID'], keep='last')
    user_idx = latest['UserID'].to_numpy() - 1
    movie_idx = latest['MovieID'].to_numpy() - 1
    R[user_idx, movie_idx] = latest['Rating'].to_numpy()

    R_df = pd.DataFrame(
        R,
        index=[f'u{i}' for i in range(1, num_users + 1)],
        columns=[f'm{j}' for j in range(1, num_movies + 1)],
    )

    return movies, users, R_df


In [9]:
# Load the data files from the current working directory (load_data's default
# data_path is '') and keep the movie table, user table and rating matrix.
movies, users, R_df = load_data()

## System I: Popularity-Based Recommendation

In [13]:
def define_popularity(R_df):
    """Score every movie column and return the scores, highest first.

    The score of a column is (number of non-NaN ratings) * (mean of the
    non-NaN ratings).  A column without any rating gets a NaN score, which
    the sort places at the end.

    Parameters
    ----------
    R_df : pandas.DataFrame
        Rating matrix with one column per movie and NaN for missing ratings.

    Returns
    -------
    pandas.Series
        Popularity score indexed by the column labels of ``R_df``, sorted in
        descending order.
    """
    n_ratings = R_df.count(axis=0)
    mean_rating = R_df.mean(axis=0, skipna=True)
    scores = n_ratings.mul(mean_rating)
    return scores.sort_values(ascending=False)

def get_top_popular_movies(R_df, movies, top_n=10):
    """Return the titles of the ``top_n`` most popular movies.

    Parameters
    ----------
    R_df : pandas.DataFrame
        Rating matrix whose column labels have the form ``m<MovieID>``.
    movies : pandas.DataFrame
        Movie table with at least the columns ``MovieID`` and ``Title``.
    top_n : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``MovieID`` in popularity order, with the columns
        ``Title`` and ``MovieID_str`` (the ``m<MovieID>`` label).  Fewer than
        ``top_n`` rows are returned when fewer movies have a rating.

    Raises
    ------
    KeyError
        If a selected movie ID is absent from ``movies``.
    """
    # Columns without any rating have a NaN popularity score.  They are not
    # "popular" and may not even exist in the movie table, so they are
    # excluded before the top entries are taken.
    popularity_ranking = define_popularity(R_df).dropna()
    top_labels = popularity_ranking.head(top_n).index
    movie_ids = [int(label[1:]) for label in top_labels]

    # Select rows and the Title column in one .loc call and take an explicit
    # copy, so that adding a column below modifies an independent frame
    # rather than something pandas may treat as a slice of `movies`.
    top_info = movies.set_index('MovieID').loc[movie_ids, ['Title']].copy()
    top_info['MovieID_str'] = top_labels
    return top_info

# Show the ten movies with the highest popularity score.
# NOTE(review): display() is neither defined nor imported in this file;
# presumably it is the IPython/Jupyter display helper — confirm before running
# this outside a notebook.
print("Top 10 Popular Movies:")
top_10_popular = get_top_popular_movies(R_df, movies, top_n=10)
display(top_10_popular)

## System II: Item-Based Collaborative Filtering (IBCF)

In [16]:
def center_rows(R):
    """Subtract from every row the mean of that row's non-NaN entries.

    Parameters
    ----------
    R : array-like, shape (n_rows, n_cols)
        Numeric matrix in which NaN marks a missing value.  The input is not
        modified.

    Returns
    -------
    R_centered : numpy.ndarray of float
        Centred copy of ``R``; missing entries stay NaN.
    means : numpy.ndarray of float, shape (n_rows,)
        Mean of the non-NaN entries of each row (NaN for a row that has none,
        in which case NumPy emits a RuntimeWarning).
    """
    # Work in floating point so that an integer input cannot truncate the
    # centred values.
    R = np.asarray(R, dtype=float)
    means = np.nanmean(R, axis=1)
    # Broadcasting the row means replaces the per-row Python loop; NaN minus a
    # number is still NaN, so missing entries are preserved automatically.
    R_centered = R - means[:, np.newaxis]
    return R_centered, means

# Take the raw NumPy array behind R_df and centre every row (user) on the mean
# of that user's own ratings.
R_matrix = R_df.values
R_centered, row_means = center_rows(R_matrix)

def compute_similarity(R_centered, *, min_common=None, top_k=None):
    """Build the pruned movie-by-movie similarity matrix.

    For two distinct columns (movies) ``i`` and ``j`` the similarity is
    ``(1 + cos_ij) / 2`` with::

        cos_ij = sum(R[l, i] * R[l, j] for rows l where both are rated)
                 / (norm_i * norm_j)

    A value is stored only when at least ``min_common`` rows rate both
    movies and the product of the norms is positive.  Afterwards each row of
    the matrix keeps only its ``top_k`` largest values; pruning is done per
    row, so the returned matrix is generally not symmetric.

    NOTE(review): ``norm_i`` is taken over *all* non-NaN entries of column
    ``i``, not only over the rows shared with column ``j``.  This is kept
    as-is; confirm it matches the intended definition of the cosine.

    Parameters
    ----------
    R_centered : array-like, shape (n_users, n_movies)
        Row-centred rating matrix with NaN for missing ratings.
    min_common : int, optional
        Minimum number of co-rating users.  Defaults to the module constant
        ``MIN_COMMON_RATINGS``.
    top_k : int, optional
        Number of similarities kept per row.  Defaults to the module
        constant ``TOP_K``.

    Returns
    -------
    numpy.ndarray, shape (n_movies, n_movies)
        Similarities in [0, 1] where defined, NaN elsewhere (including the
        diagonal).

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if min_common is None:
        min_common = MIN_COMMON_RATINGS
    if top_k is None:
        top_k = TOP_K
    if top_k < 1:
        # A slice of the form [-0:] would silently keep every entry.
        raise ValueError("top_k must be a positive integer")

    R_centered = np.asarray(R_centered, dtype=float)
    num_movies = R_centered.shape[1]
    norms = np.sqrt(np.nansum(R_centered ** 2, axis=0))

    # One contiguous vector per movie: the pair loop reads whole columns, and
    # column slices of the row-major input would be strided.
    columns = np.ascontiguousarray(R_centered.T)
    rated = ~np.isnan(columns)

    S = np.full((num_movies, num_movies), np.nan)

    # The unpruned similarity is symmetric (same products, same norms), so
    # every unordered pair is computed once and written to both cells.
    for i in range(num_movies):
        if not norms[i] > 0:
            # The denominator would not be positive for any partner.
            continue
        col_i = columns[i]
        rated_i = rated[i]
        for j in range(i + 1, num_movies):
            den = norms[i] * norms[j]
            if not den > 0:
                continue
            both = rated_i & rated[j]
            if np.count_nonzero(both) < min_common:
                continue
            cos_ij = np.nansum(col_i[both] * columns[j][both]) / den
            S[i, j] = S[j, i] = (1 + cos_ij) / 2

    # Keep only the top_k largest similarities in each row.
    for i in range(num_movies):
        row = S[i, :]
        candidates = np.flatnonzero(~np.isnan(row))
        if len(candidates) > top_k:
            keep = candidates[np.argsort(row[candidates])[-top_k:]]
            drop = np.ones(num_movies, dtype=bool)
            drop[keep] = False
            S[i, drop] = np.nan
    return S

# Build the movie-by-movie similarity matrix from the centred ratings.
# compute_similarity() iterates over pairs of movies in Python, so this step
# grows quadratically with the number of movie columns.
S = compute_similarity(R_centered)

In [17]:
# Labels of the movies whose similarity rows are inspected below.
movies_to_display = ["m1", "m10", "m100", "m1510", "m260", "m3212"]
print("Showing similarity values for selected movies:")
for mv in movies_to_display:
    # Label "m<k>" corresponds to row/column k-1 of S.
    idx = int(mv[1:]) - 1
    # Attach the movie labels to the similarity row and round for display.
    sim_row = pd.Series(S[idx,:], index=R_df.columns).round(7)
    # Show at most the ten largest similarities that are not NaN.
    non_na_sim = sim_row.dropna().sort_values(ascending=False).head(10)
    print(f"Similarities for {mv}:")
    display(non_na_sim)

### IBCF Prediction Function

In [19]:
# Popularity order of all movie columns, computed once and handed to myIBCF,
# which uses it to fill up recommendation lists that would otherwise be short.
popularity_ranking = define_popularity(R_df)

def myIBCF(newuser, S, R_df, popularity_ranking, top_n=10):
    """Recommend movies for one user with item-based collaborative filtering.

    For every movie ``i`` the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings over the movies that
    are both rated by the user and have a non-NaN similarity in row ``i`` of
    ``S``.  The movies with the highest predictions are recommended; if fewer
    than ``top_n`` predictions exist, the list is filled with the most
    popular movies that are neither rated by the user nor already selected.

    Parameters
    ----------
    newuser : array-like, shape (n_movies,)
        The user's ratings, NaN where the movie is not rated.  NumPy arrays,
        lists and pandas Series are accepted; the input is not modified.
    S : numpy.ndarray, shape (n_movies, n_movies)
        Similarity matrix with NaN for undefined entries.
    R_df : pandas.DataFrame
        Rating matrix; only its number of columns is used.
    popularity_ranking : pandas.Series
        Series whose index lists the ``m<k>`` labels from most to least
        popular.
    top_n : int
        Number of recommendations wanted.

    Returns
    -------
    list of str
        Up to ``top_n`` labels of the form ``m<k>`` (``k`` is the 1-based
        column position), best first.

    Raises
    ------
    ValueError
        If ``newuser`` does not have exactly one entry per movie column.
    """
    num_movies = R_df.shape[1]

    # Convert once to a plain float array so the positional indexing below
    # works the same for lists, Series and arrays.
    w = np.asarray(newuser, dtype=float)
    if w.shape != (num_movies,):
        raise ValueError(
            f"newuser must have shape ({num_movies},), got {w.shape}"
        )

    rated_mask = ~np.isnan(w)
    predictions = np.full(num_movies, np.nan)

    for i in np.flatnonzero(~rated_mask):
        # Movies that are similar to i and have been rated by the user.
        neighbours = np.flatnonzero(~np.isnan(S[i, :]) & rated_mask)
        if neighbours.size == 0:
            continue
        sim_vals = S[i, neighbours]
        denom = np.nansum(sim_vals)
        if denom > 0:
            predictions[i] = np.nansum(sim_vals * w[neighbours]) / denom

    # Indices of the predicted movies, highest prediction first.
    predicted = np.flatnonzero(~np.isnan(predictions))
    ranked = predicted[np.argsort(predictions[predicted])[::-1]]

    if len(ranked) >= top_n:
        final_indices = ranked[:top_n].tolist()
    else:
        # Not enough predictions: fall back to popularity for the remainder.
        needed = top_n - len(ranked)
        excluded = set(np.flatnonzero(rated_mask).tolist())
        excluded.update(ranked.tolist())
        fill_movies = []
        for label in popularity_ranking.index:
            idx = int(label[1:]) - 1
            if idx in excluded:
                continue
            fill_movies.append(idx)
            if len(fill_movies) == needed:
                break
        final_indices = ranked.tolist() + fill_movies

    return [f"m{idx + 1}" for idx in final_indices]


### Testing myIBCF

In [21]:
# Recommend for an existing user: pass row "u1181" of the rating matrix to
# myIBCF as a plain NumPy vector (NaN marks the movies this user has not rated).
test_user_vector = R_df.loc["u1181"].values
recommended_user1181 = myIBCF(test_user_vector, S, R_df, popularity_ranking)
print("Top 10 recommendations for user u1181:", recommended_user1181)

In [22]:
# Recommend for a made-up user who has rated exactly two movies: start from an
# all-NaN vector with one slot per movie column and fill in the two ratings.
NUM_MOVIES = R_df.shape[1]
hypo_user = np.full(NUM_MOVIES, np.nan)
hypo_user[1612] = 5.0  # m1613 index is 1612
hypo_user[1754] = 4.0  # m1755 index is 1754

recommended_hypo = myIBCF(hypo_user, S, R_df, popularity_ranking)
print("Top 10 recommendations for the hypothetical user:", recommended_hypo)

In [28]:
# Repeat the u1181 check from a copy of the user's row.
# Fixes to this cell: the rating matrix in this file is named R_df (no
# `rating_matrix` is defined), and myIBCF takes the user vector first, followed
# by S, R_df and popularity_ranking.
user_rating = R_df.loc["u1181"].copy()
print(myIBCF(user_rating.values, S, R_df, popularity_ranking))