In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csc_matrix

In [2]:
def load_movielens(mean_adjust = 'user'):
    '''Loads movielens set into a Dataframe. Adjusted rating is user by default'''
    path = 'ml-latest-small\\'
    df = pd.read_csv(path + 'ratings.csv')
    movie_xref = pd.read_csv(path + 'movies.csv')
    users = df.groupby('userId').mean()['rating'].reset_index()
    users['user_mean_rating'] = users['rating']
    users['uid'] = users.index
    users.drop('rating',inplace=True,axis=1)
    df = df.merge(users,left_on='userId',right_on='userId')
    movies = df.groupby('movieId').mean()['rating'].reset_index()
    movies['movie_mean_rating'] = movies['rating']
    movies = movies.merge(movie_xref,left_on='movieId',right_on='movieId')
    movies.drop('rating',inplace=True,axis=1)
    movies['mid'] = movies.index
    df = df.merge(movies,left_on='movieId',right_on='movieId')
    if mean_adjust == 'user':
        df['adjusted_rating'] = df['rating'] - df['user_mean_rating']
    elif mean_adjust == 'movie':
        df['adjusted_rating'] = df['rating'] - df['movie_mean_rating']
    return df

# Naive Bayes

In [3]:
df = load_movielens()
users_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
df_sparse = csc_matrix((ratings_list,(users_list,movies_list)))

In [113]:
def calculate_priors(df):
    m_ct = df.groupby('mid').count()['uid'].reset_index()
    mr_ct = df.groupby(['mid','rating']).count()['uid'].reset_index()
    total_ct = m_ct.merge(mr_ct,left_on='mid',right_on='mid')
    total_ct['prob'] = total_ct['uid_y']/total_ct['uid_x']
    return total_ct[['mid','rating','prob']]


#TODO - Need to calculate the laplacian smoothed priors with ALL values present, not just items which were rated
#Probably need to calculate the number of unique ratings that can be used (used_ratings)
#then make a len(used_ratings) x num_cols zero filled matrix and put the unique counts in there
def calculate_priors_sparse(df_sparse,laplacian = None):
    '''laplacian should be a floating point'''
    '''This takes a sparse matrix which has as columns the set of items to be rated'''
    '''The rows are each one user.'''
    '''Nonzero entries are ratings of items they have rated'''
    
    num_cols = df_sparse.shape[1]
    ct = []
    for c in range(num_cols):
        #counts the number of unique ratings (per column) and how many times that rating occurred
        unq,cts = np.unique(df_sparse[:,c].toarray(),return_counts=True)
        d = dict(zip(unq, cts))
        d = {k:v for k,v in d.items() if k!=0}
        ct.append(d)
    #This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
    #which that particular rating occurred for that particular film
    ct_df = pd.DataFrame(ct)
    numerator_cols = ct_df.columns
    ct_df['rowsum'] = ct_df.sum(axis=1)
    
    if laplacian is not None:
        ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
        ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * len(numerator_cols)
    ct_df = ct_df.apply(lambda x: x/x.rowsum,axis=1)
    ct_df = ct_df.drop('rowsum',axis=1)
    ct_df = ct_df.unstack().reset_index()
    ct_df.columns = ['rating','mid','prob']
    return ct_df

#Will need to fix this similarly to the above
def calculate_conditional(df,rating_vs,rating_k,j,k,laplacian = None):
    '''
    df is the sparse dataframe containing the data to be trained/rated
    rating_vs is the rating that we're comparing against for the conditional - 
        P(r_uk | r_uj = v_s), what's the likelihood that we observed rating r_uk for item k (user u) given that
        user u gave item j the rating v_s
    '''
    df_j = df[:,j]
    #Find set of users who rated item j rating
    users = (df_j == rating_vs).nonzero()[0]
    num_users = len(users)
    #Of those users, how many rated item k the same as the original user?
    df_k = df_sparse[users,k]
    nz = df_k.nonzero()
    nzk = df_k[nz[0]].toarray()
    unq,cts = np.unique(nzk,return_counts=True)
    ct = []

    d = dict(zip(unq, cts))
    d = {k:v for k,v in d.items() if k!=0}
    ct.append(d)
    #This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
    #which that particular rating occurred for that particular film
    ct_df = pd.DataFrame(ct)
    numerator_cols = ct_df.columns
    ct_df['rowsum'] = ct_df.sum(axis=1)

    if laplacian is not None:
        ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
        ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * num_users
    try:
        retval = (ct_df[rating_vs].values/ct_df['rowsum'].values)[0]
    except:
        retval = 1/num_users
    return retval
    
def calculate_rating(df,user,item,laplacian = None):
    '''
    df: sparse array (csc?) containing the data to be predicted
    user is index of user to look at
    item is item to be rated
    
    for all items that the user rated, we need to see, for each rating possible to use,
    what the likelihood is that user rated item, that rating. In order to do so, we use the formula:
    
    P(r_uj = v_s | Observed ratings of user u) is proportional to P(r_uj = v_s) * PI(P(r_uk | r_uj = v_s) for k in I_u)
    Where I_u is the set of items that user u has rated
    P(r_uj = vs) is called the prior probability, and we'll need to calculate that for each item first
    P(r_uk | r_uj = v_s) is called the conditional probability.
    '''
    
    #priors contains the prior probabilities for all items
    #for a given item, it's the list of all ratings and what ratio, 
    #for each rating for that item, what was the likelihood it occurred
    priors = calculate_priors_sparse(df,laplacian = laplacian)
    vals = list(set(df.data))
    num = 0
    denom = 0
    user_items = df[user,:].nonzero()[1]
    for i in range(len(vals)):
        vs = vals[i]
        try:
            prior = priors.loc[(priors['mid']==item)&(priors['rating']==vs),'prob'].values[0]
        except:
            prior = 0
            continue
        mult = 1
        for k in user_items:
            user_rating = df[user,k]
            conditional = calculate_conditional(df,vs,user_rating,item,k,laplacian = laplacian)
            mult = mult * conditional 
            if np.isnan(conditional or np.isnan(prior)):
                print('vs: %0.3f rating: %0.3f item: %i k: %i')
        num += vs*prior*mult
        denom += denom + prior * mult
    return num/denom

In [100]:
test = [
    [1,-1,1,-1,1,-1],
    [1,1,0,-1,-1,-1],
    [0,1,1,-1,-1,0],
    [-1,-1,-1,1,1,1],
    [-1,0,-1,1,1,1]
]
test_array = np.array(test)
test_sparse = csc_matrix(test_array)

In [None]:
p = calculate_priors_sparse(df_sparse,laplacian=0.8)

In [111]:
r = calculate_rating(df_sparse,2,1,laplacian = 0.8)

ZeroDivisionError: division by zero

In [85]:
j = 0
rating_vs = 3
k = 1
laplacian = None

df_j = df_sparse[:,j]
#Find set of users who rated item j rating
users = (df_j == rating_vs).nonzero()[0]
num_users = len(users)
#Of those users, how many rated item k the same as the original user?
df_k = df_sparse[users,k]
nz = df_k.nonzero()
nzk = df_k[nz[0]].toarray()
unq,cts = np.unique(nzk,return_counts=True)
ct = []

d = dict(zip(unq, cts))
d = {k:v for k,v in d.items() if k!=0}
ct.append(d)
#This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
#which that particular rating occurred for that particular film
ct_df = pd.DataFrame(ct)
numerator_cols = ct_df.columns
ct_df['rowsum'] = ct_df.sum(axis=1)

if laplacian is not None:
    ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
    ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * len(numerator_cols)
(ct_df[rating_vs].values/ct_df['rowsum'].values)[0]


0.69230769230769229

In [66]:
ct_df[rating_vs]/ct_df['rowsum']

0    0.692308
dtype: float64

In [69]:
??calculate_conditional

In [87]:
calculate_conditional(df_sparse,3,3,0,1)

0.69230769230769229