## PROJECT 4
Fall 2023 | STAT-542 / CS-598

by Baolong Truong (baolong3), Robbie Li (robbiel2), Wesley Ecoiffier (wesleye2)

In [1]:
# Define imports and set options
import pandas as pd
import numpy as np
import requests
import warnings
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option("display.float_format", "{:.7f}".format)
warnings.filterwarnings("ignore")

## Preprocessing
1. Fetch required data
2. Clean and format data movie data
3. Merge movie and ratings data

In [3]:
# Get movies list and create movies DataFrame

ratings = pd.read_csv('data/ratings.csv')
movies_list_url = "https://liangfgithub.github.io/MovieData/movies.dat?raw=true"

# Fetch the data from the URL
movies_list = requests.get(movies_list_url)

# Split the data into lines and then split each line using "::"
movie_lines = movies_list.text.split('\n')
movie_data = [line.split("::") for line in movie_lines if line]

# Create a DataFrame from the movie data
movies = pd.DataFrame(movie_data, columns=['movie_id', 'title', 'genres'])
movies['movie_id'] = movies['movie_id'].astype(int)

In [4]:
# Get ratings data and merge with movies list
ratings_count = ratings.count(axis=0)
ratings_count = pd.DataFrame({'count': ratings_count})
ratings_count['movie_id'] = ratings_count.index
ratings_count['movie_id'] = ratings_count['movie_id'].apply(lambda x: int(x[1:]))
ratings_count = ratings_count.reset_index()
merged = pd.merge(movies, ratings_count, on='movie_id', how='inner')
merged = merged.drop(columns='index')

# Output the merged dataset to a CSV file, excluding index column
merged.to_csv('data/movies_with_ratings_count.csv', index=False)

ValueError: invalid literal for int() with base 10: 'nnamed: 3706'

In [None]:
# Create a list of all the genre options

# Filter out None values
genres = merged["genres"].str.split("|", expand=True)

# Only keep uniques
genres = genres.stack().unique()

# Sort the list alphabetically
genres = np.sort(genres)

# Output to csv, excluding index column and column headers
genres_df = pd.DataFrame(genres, columns=["Genre"])
genres_df.to_csv("data/genres.csv", index=False, header=False)

## System I: Recommendatings by Genre

Get movie recommendations based on a specified genre, ranked by number of ratings. 

In [None]:
def top_movies_in_genre(df, genre, n = 10):
    # Filter DataFrame for rows with the specified genre
    genre_df = df[df['genres'].str.contains(genre)]

    # Sort DataFrame by count in descending order
    sorted_genre_df = genre_df.sort_values(by='count', ascending=False)

    # Take the top n rows
    top_movies = sorted_genre_df.head(n)

    return top_movies

In [None]:
# Example usage
top_movies_in_genre(merged, "Comedy", 20)

# System 2: Item-Based Collaborative Filtering (IBCF)

In [None]:
# Normalize the ratings data by centering it around the mean rating for each movie
row_means = np.nanmean(ratings, axis=1, keepdims=True)

# Create a matrix where each row mean is repeated along the columns
row_means_matrix = np.tile(row_means, (1, ratings.shape[1]))

# Subtract the row means matrix from the original matrix
R = ratings - row_means_matrix

# Ouput normalized ratings to CSV
R.to_csv('data/ratings_norm.csv')

## !!! IMPORTANT !!!

R is much faster than python at creating the similarity matrix using cosine similarity. Therefore, we will use R to create the similarity matrix, then resume the rest of the system 2 data processing in python.

The `similarity.Rmd` script expects that `data/ratings_norm.csv` exists (which is written in the block above). It will output the `similarity.csv` file, which is used in subsequent steps.

In [None]:
# Create a mask
# Note: this only needs to be run once on ratings.csv.

try:
    # Check if data/mask.csv exists. If it does, load it
    mask = pd.read_csv("data/mask.csv")
except:
    # If it doesn't, create it
    not_na = ratings.notna().astype(int)
    ratings_mask = not_na.dot(not_na.T)
    ratings_mask.to_csv("data/mask.csv")

# Check similarity matrix

In [None]:
S = pd.read_csv('data/similarity.csv')
S.set_index('Unnamed: 0', inplace= True)

mask.set_index("Unnamed: 0", inplace= True)
np.fill_diagonal(mask.values, 0)

# For all movies with less than 3 ratings, set the similarity to nan
S[mask < 3] = np.nan

In [None]:
# Test similarity output of selected movies, rounded to 7 decimal places
selected_indices = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']

sample_similarity = S.loc[selected_indices, selected_indices]
sample_similarity.round(7)

sample_similarity

In [None]:
# Create a function to get the top n (default 30) similar movies for each movie
def top_n(row, n = 30):
    top_n_indices = row.sort_values(ascending=False).index[:n]
    row.loc[~row.index.isin(top_n_indices)] = np.nan
    return row

S = S.apply(top_n, axis=1)
S.to_csv('data/similarity_top_30.csv')

In [None]:
S_top30 = pd.read_csv('data/similarity_top_30.csv')
S_top30.set_index("Unnamed: 0", inplace= True)

In [None]:
def myIBCF(similarity_matrix, newuser):
    not_rated_indices = []
    for index in newuser.index:
        if np.isnan(newuser[index]):
            not_rated_indices.append(index)
    df_not_rated = pd.DataFrame(index=not_rated_indices, columns=["Value"])

    for l in df_not_rated.index:
        Sl = S_top30.loc[l].dropna()

        movie_score_num = 0
        movie_score_denom = 0
        w = newuser

        for i in Sl.index:
            w_i = 0 if np.isnan(w[i]) else w[i]
            movie_score_num += w_i * Sl[i]
            if w_i != 0:
                movie_score_denom += Sl[i]
        if movie_score_denom != 0:
            df_not_rated.loc[l] = movie_score_num / movie_score_denom

    return df_not_rated.sort_values(by="Value", ascending=False).head(10)

# Top recommendations for user u1181

In [None]:
# Test myIBCF function with user u1181
print("Top Recommendations for User u1181")
newuser = ratings.loc['u1181']
myIBCF(S_top30, newuser)

# Top recommendations for user u1181


In [None]:
# Test myIBCF function with user u1351
print("Top Recommendations for User u1351")
newuser = ratings.loc['u1351']
myIBCF(S_top30, newuser)

# Top recommendations for user John, who rates movie "m1613" a 5 and movie "m1755" a 4


In [None]:
# Test myIBCF function with user u1351
print("Top Recommendations for User John")
newuser = ratings.loc['John']
myIBCF(S_top30, newuser)