In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csc_matrix

In [2]:
def load_movielens(mean_adjust = 'user'):
    '''Loads movielens set into a Dataframe. Adjusted rating is user by default'''
    path = 'ml-latest-small\\'
    df = pd.read_csv(path + 'ratings.csv')
    movie_xref = pd.read_csv(path + 'movies.csv')
    users = df.groupby('userId').mean()['rating'].reset_index()
    users['user_mean_rating'] = users['rating']
    users['uid'] = users.index
    users.drop('rating',inplace=True,axis=1)
    df = df.merge(users,left_on='userId',right_on='userId')
    movies = df.groupby('movieId').mean()['rating'].reset_index()
    movies['movie_mean_rating'] = movies['rating']
    movies = movies.merge(movie_xref,left_on='movieId',right_on='movieId')
    movies.drop('rating',inplace=True,axis=1)
    movies['mid'] = movies.index
    df = df.merge(movies,left_on='movieId',right_on='movieId')
    if mean_adjust == 'user':
        df['adjusted_rating'] = df['rating'] - df['user_mean_rating']
    elif mean_adjust == 'movie':
        df['adjusted_rating'] = df['rating'] - df['movie_mean_rating']
    return df

# Naive Bayes

In [3]:
df = load_movielens()
users_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
df_sparse = csc_matrix((ratings_list,(users_list,movies_list)))

In [73]:
def calculate_priors(df):
    m_ct = df.groupby('mid').count()['uid'].reset_index()
    mr_ct = df.groupby(['mid','rating']).count()['uid'].reset_index()
    total_ct = m_ct.merge(mr_ct,left_on='mid',right_on='mid')
    total_ct['prob'] = total_ct['uid_y']/total_ct['uid_x']
    return total_ct[['mid','rating','prob']]

def calculate_priors_sparse(df_sparse,laplacian = None):
    '''laplacian should be a floating point'''
    '''This takes a sparse matrix which has as columns the set of items to be rated'''
    '''The rows are each one user.'''
    '''Nonzero entries are ratings of items they have rated'''
    
    num_cols = df_sparse.shape[1]
    all_ratings = list(set(df_sparse.data))
    all_ratings = all_ratings
    all_ratings = sorted(all_ratings)
    num_ratings = len(all_ratings)
    ct_df = np.zeros((num_cols,num_ratings))

    for c in range(num_cols):
        #counts the number of unique ratings (per column) and how many times that rating occurred
        unq,cts = np.unique(df_sparse[:,c].toarray(),return_counts=True)
        d = dict(zip(unq, cts))
        d = {k:v for k,v in d.items() if k > 0}
        rating_dict = {rating:0 for rating in all_ratings}
        for k,v in d.items():
            rating_dict[k] = v
        #d = {k:v for k,v in d.items() if k!=0}
        ct_df[c] = [i[1] for i in sorted(rating_dict.items(),key=lambda x: x[0])]
    #This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
    #which that particular rating occurred for that particular film
    ct_df = pd.DataFrame(ct_df)
    ct_df.columns = all_ratings
    numerator_cols = ct_df.columns
    ct_df['rowsum'] = ct_df.sum(axis=1)

    if laplacian is not None:
        ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
        ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * len(numerator_cols)
    ct_df = ct_df.apply(lambda x: x/x.rowsum,axis=1)
    ct_df = ct_df.drop('rowsum',axis=1)
    ct_df = ct_df.unstack().reset_index()
    ct_df.columns = ['rating','mid','prob']
    return ct_df

#Will need to fix this similarly to the above
def calculate_conditional(df,rating_vs,rating_k,j,k,laplacian = None):
    '''
    df is the sparse dataframe containing the data to be trained/rated
    rating_vs is the rating that we're comparing against for the conditional - 
        P(r_uk | r_uj = v_s), what's the likelihood that we observed rating r_uk for item k (user u) given that
        user u gave item j the rating v_s
    '''
    total_users = df.shape[0]
    df_j = df[:,j]
    #Find set of users who rated item j rating
    users = (df_j == rating_vs).nonzero()[0]
    num_users = len(users)
    #Of those users, how many rated item k the same as the original user?
    df_k = df_sparse[users,k]
    nz = df_k.nonzero()
    nzk = df_k[nz[0]].toarray()
    unq,cts = np.unique(nzk,return_counts=True)
    ct = []

    d = dict(zip(unq, cts))
    d = {k:v for k,v in d.items() if k!=0}
    ct.append(d)
    #This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
    #which that particular rating occurred for that particular film
    ct_df = pd.DataFrame(ct)
    numerator_cols = ct_df.columns
    ct_df['rowsum'] = ct_df.sum(axis=1)

    if laplacian is not None:
        ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
        ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * total_users
    try:
        retval = (ct_df[rating_vs].values/ct_df['rowsum'].values)[0]
    except:
        retval = 1/total_users
    return retval
    
def calculate_rating(df,user,item,laplacian = None):
    '''
    df: sparse array (csc?) containing the data to be predicted
    user is index of user to look at
    item is item to be rated
    
    for all items that the user rated, we need to see, for each rating possible to use,
    what the likelihood is that user rated item, that rating. In order to do so, we use the formula:
    
    P(r_uj = v_s | Observed ratings of user u) is proportional to P(r_uj = v_s) * PI(P(r_uk | r_uj = v_s) for k in I_u)
    Where I_u is the set of items that user u has rated
    P(r_uj = vs) is called the prior probability, and we'll need to calculate that for each item first
    P(r_uk | r_uj = v_s) is called the conditional probability.
    '''
    
    #priors contains the prior probabilities for all items
    #for a given item, it's the list of all ratings and what ratio, 
    #for each rating for that item, what was the likelihood it occurred
    priors = calculate_priors_sparse(df,laplacian = laplacian)
    vals = list(set(df.data))
    num = 0
    denom = 0
    user_items = df[user,:].nonzero()[1]
    for i in range(len(vals)):
        vs = vals[i]
        try:
            prior = priors.loc[(priors['mid']==item)&(priors['rating']==vs),'prob'].values[0]
        except:
            prior = 0
            continue
        mult = 1
        for k in user_items:
            user_rating = df[user,k]
            conditional = calculate_conditional(df,vs,user_rating,item,k,laplacian = laplacian)
            mult = mult * conditional 
            if np.isnan(conditional or np.isnan(prior)):
                print('vs: %0.3f rating: %0.3f item: %i k: %i')
        num += vs*prior*mult
        denom += denom + prior * mult
    return num/denom

def print_user_movies(df,user):
    movies = df.loc[df['uid']==user,'title']
    print(movies)

In [100]:
test = [
    [1,-1,1,-1,1,-1],
    [1,1,0,-1,-1,-1],
    [0,1,1,-1,-1,0],
    [-1,-1,-1,1,1,1],
    [-1,0,-1,1,1,1]
]
test_array = np.array(test)
test_sparse = csc_matrix(test_array)

In [58]:
p = calculate_priors_sparse(df_sparse,laplacian=0.8)

In [63]:
r = calculate_rating(df_sparse,2,1,laplacian = 0.8)

In [85]:
r = calculate_rating(df_sparse,2,321,laplacian=0.8)

In [87]:
#Why is this coming out 0.25????
r

0.25000000045839021

In [80]:
df.loc[(df['uid']==2)&(df['mid']==321)]

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating,uid,movie_mean_rating,title,genres,mid,adjusted_rating
4477,3,356,5.0,1298862167,3.568627,2,4.054252,Forrest Gump (1994),Comedy|Drama|Romance|War,321,1.431373


In [75]:
print_user_movies(df,2)

1780                                     Braveheart (1995)
3652                                   Pulp Fiction (1994)
4477                                   Forrest Gump (1994)
5418                                          Speed (1994)
6892                               Schindler's List (1993)
7718                                        Aladdin (1992)
8372                                         Batman (1989)
8568                      Silence of the Lambs, The (1991)
9011                    Indian in the Cupboard, The (1995)
9043                             Heavenly Creatures (1994)
9086                                    Major Payne (1995)
9099                      Shawshank Redemption, The (1994)
9410                               Flintstones, The (1994)
9449                           Beauty and the Beast (1991)
9625                                        Twister (1996)
9775                                  Trainspotting (1996)
9899                                          Bound (199

In [72]:
df[df['mid']==1]

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating,uid,movie_mean_rating,title,genres,mid,adjusted_rating
41777,15,2,2.0,1134521380,2.621765,14,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,-0.621765
41778,19,2,3.0,855194773,3.534279,18,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,-0.534279
41779,30,2,2.0,945277634,3.765084,29,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,-1.765084
41780,32,2,4.0,834828285,3.666667,31,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,0.333333
41781,41,2,3.5,1093888283,3.866834,40,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,-0.366834
41782,47,2,5.0,832229657,3.868421,46,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,1.131579
41783,48,2,3.5,1322169967,3.514620,47,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,-0.014620
41784,49,2,5.0,978040739,3.464646,48,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,1.535354
41785,61,2,3.5,1216051639,3.274096,60,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,0.225904
41786,68,2,3.0,1249809905,3.626016,67,3.401869,Jumanji (1995),Adventure|Children|Fantasy,1,-0.626016


In [67]:
df_sparse[2,:].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]),
 array([  56,  100,  219,  239,  266,  284,  320,  321,  341,  472,  521,
         524,  525,  527,  617,  642,  699,  954,  966,  990, 1025, 1118,
        1253, 1359, 1455, 1590, 1834, 2010, 2156, 2162, 2173, 2212, 2273,
        2288, 2374, 2599, 2804, 3157, 4081, 4255, 4604, 5017, 5116, 5467,
        5472, 5888, 6363, 6536, 6579, 6892, 7708]))

In [85]:
j = 0
rating_vs = 3
k = 1
laplacian = None

df_j = df_sparse[:,j]
#Find set of users who rated item j rating
users = (df_j == rating_vs).nonzero()[0]
num_users = len(users)
#Of those users, how many rated item k the same as the original user?
df_k = df_sparse[users,k]
nz = df_k.nonzero()
nzk = df_k[nz[0]].toarray()
unq,cts = np.unique(nzk,return_counts=True)
ct = []

d = dict(zip(unq, cts))
d = {k:v for k,v in d.items() if k!=0}
ct.append(d)
#This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
#which that particular rating occurred for that particular film
ct_df = pd.DataFrame(ct)
numerator_cols = ct_df.columns
ct_df['rowsum'] = ct_df.sum(axis=1)

if laplacian is not None:
    ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
    ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * len(numerator_cols)
(ct_df[rating_vs].values/ct_df['rowsum'].values)[0]


0.69230769230769229

In [66]:
ct_df[rating_vs]/ct_df['rowsum']

0    0.692308
dtype: float64

In [69]:
??calculate_conditional

In [87]:
calculate_conditional(df_sparse,3,3,0,1)

0.69230769230769229

In [56]:
ct_df

Unnamed: 0,rating,mid,prob
0,0.5,0,0.003137
1,0.5,1,0.006957
2,0.5,2,0.026866
3,0.5,3,0.038095
4,0.5,4,0.012500
5,0.5,5,0.007143
6,0.5,6,0.045902
7,0.5,7,0.061538
8,0.5,8,0.028571
9,0.5,9,0.006154


In [45]:
rating_dict

{0: 670,
 0.5: 0,
 1.0: 0,
 1.5: 0,
 2.0: 0,
 2.5: 0,
 3.0: 0,
 3.5: 0,
 4.0: 0,
 4.5: 0,
 5.0: 1}

In [44]:
ct_df[ct_df['mid'] == 0]

Unnamed: 0,rating,mid,prob
0,0,0,0.62489
9066,1,0,0.001177
18132,2,0,0.007061
27198,3,0,0.00559
36264,4,0,0.0203
45330,5,0,0.007061
54396,6,0,0.061489
63462,7,0,0.03501
72528,8,0,0.114445
81594,9,0,0.029126


In [12]:
np.unique(df_sparse.toarray(),return_counts=True,axis=1)

(array([[ 0.,  0.,  0., ...,  4.,  4.,  4.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  4.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64))