In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the first sheet (index 0) of the assignment spreadsheet.
# `sheet_name` is the current keyword; the older `sheetname` spelling was
# deprecated and later removed from pandas, so using it raises a TypeError.
# Later cells treat each row as a movie and each column as a user.
data = pd.read_excel('./UUCF Assignment Spreadsheet.xls',sheet_name=0)

In [3]:
# Pairwise Pearson correlation between the columns of `data`; the result is
# used below as the user-user similarity table.
uu_sim = data.corr(method='pearson')

In [4]:
def get_top_users(uu_sim,target,n=5):
    """Return the `n` users most similar to `target`, excluding `target`.

    Parameters
    ----------
    uu_sim : pandas.DataFrame
        Similarity table; ``uu_sim.loc[target]`` must yield the similarity
        between `target` and every candidate user.
    target : hashable
        Row label of the user whose neighbours are wanted.
    n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        Up to `n` similarity scores in descending order, indexed by user
        label. NaN similarities are never selected.

    Raises
    ------
    KeyError
        If `target` is not a row label of `uu_sim`.
    """
    target_cor = uu_sim.loc[target]
    # Remove the target by label instead of assuming it is ranked first.
    # Slicing off the top entry is wrong when another user ties with the
    # target's self-similarity and appears earlier, or when the
    # self-similarity is NaN (the best real neighbour would be discarded).
    # errors='ignore' keeps the call working if `target` is not a column.
    candidates = target_cor.drop(target, errors='ignore')
    return candidates.nlargest(n)

# Part 1 - Without Normalization

In [5]:
def get_user_movie_score(movie,user):
    """Predict `user`'s score for one movie from similar users' raw ratings.

    `movie` is indexed by user label and holds that movie's ratings, with NaN
    where a user has not rated it. The neighbours of `user` are taken from
    the module-level `uu_sim` table via `get_top_users`.

    Returns the similarity-weighted average of the ratings given by the
    neighbours who rated the movie, or 0 when the similarity weights of
    those neighbours sum to zero (including when none of them rated it).
    """
    neighbors = get_top_users(uu_sim,user)
    weighted_total = 0
    similarity_total = 0
    for neighbor,similarity in zip(neighbors.index,neighbors.values):
        rating = movie[neighbor]
        # Neighbours without a rating contribute to neither total.
        if not np.isnan(rating):
            weighted_total += rating * similarity
            similarity_total += similarity
    # Avoid dividing by zero when there is nothing to average.
    return weighted_total/similarity_total if similarity_total != 0 else 0

In [6]:
# Score every row (movie) of `data` for users 3867 and 89; `user` is
# forwarded by `apply` to the scoring function as a keyword argument.
pred_3867 = data.apply(get_user_movie_score, axis=1, user=3867)
pred_89 = data.apply(get_user_movie_score, axis=1, user=89)

In [7]:
# Three highest unnormalized predictions for user 3867.
pred_3867.sort_values(ascending=False).head(3)

1891: Star Wars: Episode V - The Empire Strikes Back (1980)    4.760291
155: The Dark Knight (2008)                                    4.551454
122: The Lord of the Rings: The Return of the King (2003)      4.507637
dtype: float64

In [8]:
# Three highest unnormalized predictions for user 89.
pred_89.sort_values(ascending=False).head(3)

238: The Godfather (1972)               4.894124
278: The Shawshank Redemption (1994)    4.882194
807: Seven (a.k.a. Se7en) (1995)        4.774093
dtype: float64

# Part 2 - With Mean-Centered Normalization

In [9]:
def get_norm_user_movie_score(movie,user):
    """Predict `user`'s score for one movie from mean-centered neighbour ratings.

    `movie` is indexed by user label and holds that movie's ratings, with NaN
    where a user has not rated it. Neighbours of `user` come from the
    module-level `uu_sim` table via `get_top_users`; per-user mean ratings
    are read from the matching columns of the module-level `data` frame.

    Each neighbour's rating is offset by that neighbour's own mean rating
    before being weighted by similarity. The weighted average offset is then
    added to `user`'s mean rating. Returns 0 when the similarity weights of
    the neighbours who rated the movie sum to zero (including when none of
    them rated it).
    """
    neighbors = get_top_users(uu_sim,user)
    target_mean = data.loc[:,user].mean()
    offset_total = 0
    similarity_total = 0
    for neighbor,similarity in zip(neighbors.index,neighbors.values):
        rating = movie[neighbor]
        # Neighbours without a rating contribute to neither total.
        if np.isnan(rating):
            continue
        neighbor_mean = data.loc[:,neighbor].mean()
        offset_total += (rating-neighbor_mean) * similarity
        similarity_total += similarity
    # Only divide when there is a non-zero total weight to average over.
    if similarity_total != 0:
        return target_mean + offset_total/similarity_total
    return 0

In [10]:
# Mean-centered predictions for every row (movie) of `data`, for users 3867
# and 89; `user` is forwarded by `apply` to the scoring function.
norm_pred_3867 = data.apply(get_norm_user_movie_score, axis=1, user=3867)
norm_pred_89 = data.apply(get_norm_user_movie_score, axis=1, user=89)

In [11]:
# Three highest mean-centered predictions for user 3867.
norm_pred_3867.sort_values(ascending=False).head(3)

1891: Star Wars: Episode V - The Empire Strikes Back (1980)    5.245509
155: The Dark Knight (2008)                                    4.856770
77: Memento (2000)                                             4.777803
dtype: float64

In [12]:
# Three highest mean-centered predictions for user 89.
norm_pred_89.sort_values(ascending=False).head(3)

238: The Godfather (1972)               5.322015
278: The Shawshank Redemption (1994)    5.261424
275: Fargo (1996)                       5.241111
dtype: float64