# Project 4: Movie Recommendation System

Name: Rustom Ichhaporia

netID: rustomi2

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

ratings = pd.read_csv('data/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv('data/movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

## System I

For System I, I will implement a Bayesian approach to the recommendation problem. Rather than taking the plain average rating for each movie in the desired genre, I will add a Bayesian term to the computation of the average, so that movies with very few ratings will be pulled towards the overall average rating for the genre. The Bayesian term will be a weighted average of the overall average rating for the genre and the overall average rating for all movies. The weight will be the number of ratings for the movie divided by the number of ratings for all movies in the genre. We begin by assuming every movie already has W ratings with an average of R score, and then add the real data to those numbers and recompute the average. I have chosen to set W=10 and R=3.5 based on the distribution of the data. 

In [2]:
# Make genre map of genre to movie id
genre_map = {}
for index, row in movies.iterrows():
    genres = row['Genres'].split('|')
    for genre in genres:
        if genre not in genre_map:
            genre_map[genre] = []
        genre_map[genre].append(row['MovieID'])

# Convert genre column to one-hot encoding
for genre in genre_map:
    movies[genre] = movies['MovieID'].apply(lambda x: 1 if x in genre_map[genre] else 0)

In [3]:
def get_recommendations_simple(genre, num_results=10): 
    W, R = 10.0, 3.5
    movieIDs = genre_map[genre]
    ratings_subset = ratings[ratings['MovieID'].isin(movieIDs)]
    # Recompute movie's average rating with W and R Bayesian smoothing
    ratings_subset = (ratings_subset.groupby('MovieID').sum() + (W * R)) / (ratings_subset.groupby('MovieID').count() + W)
    ratings_subset = ratings_subset.sort_values(by='Rating', ascending=False)
    return ratings_subset.head(num_results)

In [4]:
genre_recommendations = pd.DataFrame()
for genre in genre_map:
    # add the top 10 recommendations for each genre to a dataframe with column as genre and row as recommendation
    genre_recommendations[genre] = get_recommendations_simple(genre)['Rating'].index

genre_recommendations.to_pickle('genre_recommendations.pkl')

In [5]:
get_recommendations_simple('Film-Noir')

Unnamed: 0_level_0,UserID,Rating,Timestamp
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
922,3016.427083,4.470833,952350100.0
3435,2994.156863,4.399287,954869600.0
913,3183.615385,4.387464,961833800.0
1252,3130.421757,4.332218,962893000.0
1267,3128.656774,4.322581,959533800.0
1284,2979.350272,4.297641,953759300.0
2186,3045.380081,4.288618,951762200.0
930,3021.916484,4.276923,951276000.0
1260,3050.069182,4.27673,941259300.0
541,3108.017127,4.269061,965633500.0


## System II

In [6]:
R = pd.read_csv('data/R_matrix.csv')
R_orig = R.copy()
R = R.sub(R.mean(axis=1, skipna=True), axis=0)
R = R.to_numpy()
S = np.zeros((R.shape[1], R.shape[1]))
S.fill(np.nan)

First we create the initial similarities matrix as described in the instructions. This is the symmetric version. 

In [7]:
for row in range(S.shape[0]):
    for col in range(row + 1, S.shape[0]):
        overlap = np.where(~(np.isnan(R[:, row]) | np.isnan(R[:, col])))[0]

        if len(overlap) < 3:
            continue

        total = np.sum(R[:, row][overlap] * R[:, col][overlap])
        divisor = np.linalg.norm(R[:, row][overlap], ord=2) * np.linalg.norm(R[:, col][overlap], ord=2)
        
        score = 0.5 + (0.5 * (total / divisor))
        S[row, col] = score
        S[col, row] = score

similarities = pd.DataFrame(S, index=R_orig.columns, columns=R_orig.columns)

sample = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']
similarities_test = similarities.loc[sample, sample]
similarities_test = np.round(similarities_test, 7)

similarities_test

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.512105,0.392,,0.741148,
m10,0.512105,,0.547458,,0.534334,
m100,0.392,0.547458,,,0.329694,
m1510,,,,,,
m260,0.741148,0.534334,0.329694,,,
m3212,,,,,,


Next, we create the non-symmetric, reduced version of S. We also define a list of the top 20 movies using the Bayesian approach from System I. These are the substitute movies that are used if there are insufficient results in the myIBCF function.

In [19]:
reduced = S.copy()

for row in range(S.shape[0]):
    empty_count = np.isnan(reduced[row]).sum()
    reduced[row][np.argsort(reduced[row])[:-empty_count - 30]] = np.nan

S_reduced = pd.DataFrame(reduced, index=R_orig.columns, columns=R_orig.columns)

substitutes = pd.Series(np.zeros(S.shape[0]), index=R_orig.columns).rename("Bayesian_Rating")

for movie in R_orig.columns:
    ratings_subset = R_orig[movie]
    W, R = 10.0, 3.5
    bayesian_rating = (ratings_subset.sum() + (W * R)) / (ratings_subset.count() + W)
    substitutes[movie] = bayesian_rating
        
bayesian_substitutes = substitutes.sort_values(ascending=False)[:20]

In [20]:
def myIBCF(newuser):
    rating_indices = np.argwhere(~np.isnan(newuser))

    output = S_reduced.iloc[:, 0] * np.NaN

    for i in range(R_orig.shape[1]):
        movie = S_reduced.iloc[i]
        neighbors = np.argwhere(~np.isnan(movie).to_numpy())
        neighbors_vals = np.intersect1d(rating_indices, neighbors)

        if not len(neighbors_vals):
            continue

        output[i] = np.sum(movie[neighbors_vals] * newuser[neighbors_vals]) / np.sum(movie[neighbors_vals])

    output = output[np.isnan(newuser) & (np.abs(output - 0.1) > 0.0001)]
    output = list(output.sort_values(ascending=False).index)

    # Add Bayesian results if not enough results
    while len(output) < 10:
        for movie in bayesian_substitutes:
            if movie not in output:
                output.append(movie)

    return output[:10]

In [21]:
R_orig.loc['u1181']

m1       3.0
m10      4.0
m100     NaN
m1000    NaN
m1002    NaN
        ... 
m994     4.0
m996     2.0
m997     3.0
m998     NaN
m999     2.0
Name: u1181, Length: 3706, dtype: float64

In [22]:
myIBCF(R_orig.loc["u1181"].to_numpy())

['m3732',
 'm749',
 'm3899',
 'm427',
 'm1039',
 'm3752',
 'm3789',
 'm1734',
 'm2793',
 'm504']

In [12]:
myIBCF(R_orig.loc["u1351"].to_numpy())

['m853',
 'm1780',
 'm2127',
 'm1871',
 'm2061',
 'm1514',
 'm1901',
 'm985',
 'm3012',
 'm1659']

In [23]:
hypothetical_user = S_reduced.iloc[0].copy()
hypothetical_user[:] = np.nan
hypothetical_user["m1613"] = 5
hypothetical_user["m1755"] = 4

myIBCF(hypothetical_user.to_numpy())

['m1017',
 'm765',
 'm74',
 'm3269',
 'm2846',
 'm340',
 'm338',
 'm3258',
 'm3254',
 'm316']

In [14]:
S_reduced.to_pickle('S_reduced.pkl')

In [15]:
bayesian_substitutes.to_pickle('bayesian_substitutes.pkl')