In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csc_matrix, csr_matrix

In [2]:
def load_movielens(mean_adjust = 'user'):
    '''Loads movielens set into a Dataframe. Adjusted rating is user by default'''
    path = 'ml-latest-small\\'
    df = pd.read_csv(path + 'ratings.csv')
    movie_xref = pd.read_csv(path + 'movies.csv')
    users = df.groupby('userId').mean()['rating'].reset_index()
    users['user_mean_rating'] = users['rating']
    users['uid'] = users.index
    users.drop('rating',inplace=True,axis=1)
    df = df.merge(users,left_on='userId',right_on='userId')
    movies = df.groupby('movieId').mean()['rating'].reset_index()
    movies['movie_mean_rating'] = movies['rating']
    movies = movies.merge(movie_xref,left_on='movieId',right_on='movieId')
    movies.drop('rating',inplace=True,axis=1)
    movies['mid'] = movies.index
    df = df.merge(movies,left_on='movieId',right_on='movieId')
    if mean_adjust == 'user':
        df['adjusted_rating'] = df['rating'] - df['user_mean_rating']
    elif mean_adjust == 'movie':
        df['adjusted_rating'] = df['rating'] - df['movie_mean_rating']
    return df

# Naive Bayes

In [5]:
df = load_movielens()
users_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
df_sparse = csr_matrix((ratings_list,(users_list,movies_list)))

In [8]:
def calculate_priors(df):
    m_ct = df.groupby('mid').count()['uid'].reset_index()
    mr_ct = df.groupby(['mid','rating']).count()['uid'].reset_index()
    total_ct = m_ct.merge(mr_ct,left_on='mid',right_on='mid')
    total_ct['prob'] = total_ct['uid_y']/total_ct['uid_x']
    return total_ct[['mid','rating','prob']]

def calculate_priors_sparse(df_sparse,laplacian = None):
    '''laplacian should be a floating point'''
    '''This takes a sparse matrix which has as columns the set of items to be rated'''
    '''The rows are each one user.'''
    '''Nonzero entries are ratings of items they have rated'''
    
    num_cols = df_sparse.shape[1]
    all_ratings = list(set(df_sparse.data))
    all_ratings = all_ratings
    all_ratings = sorted(all_ratings)
    num_ratings = len(all_ratings)
    ct_df = np.zeros((num_cols,num_ratings))

    for c in range(num_cols):
        #counts the number of unique ratings (per column) and how many times that rating occurred
        unq,cts = np.unique(df_sparse[:,c].toarray(),return_counts=True)
        d = dict(zip(unq, cts))
        d = {k:v for k,v in d.items() if k > 0}
        rating_dict = {rating:0 for rating in all_ratings}
        for k,v in d.items():
            rating_dict[k] = v
        #d = {k:v for k,v in d.items() if k!=0}
        ct_df[c] = [i[1] for i in sorted(rating_dict.items(),key=lambda x: x[0])]
    #This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
    #which that particular rating occurred for that particular film
    ct_df = pd.DataFrame(ct_df)
    ct_df.columns = all_ratings
    numerator_cols = ct_df.columns
    ct_df['rowsum'] = ct_df.sum(axis=1)

    if laplacian is not None:
        ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
        ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * len(numerator_cols)
    ct_df = ct_df.apply(lambda x: x/x.rowsum,axis=1)
    ct_df = ct_df.drop('rowsum',axis=1)
    ct_df = ct_df.unstack().reset_index()
    ct_df.columns = ['rating','mid','prob']
    return ct_df

#Will need to fix this similarly to the above
def calculate_conditional(df,rating_vs,rating_k,j,k,laplacian = None):
    '''
    df is the sparse dataframe containing the data to be trained/rated
    rating_vs is the rating that we're comparing against for the conditional - 
        P(r_uk | r_uj = v_s), what's the likelihood that we observed rating r_uk for item k (user u) given that
        user u gave item j the rating v_s
    '''
    total_users = df.shape[0]
    df_j = df[:,j]
    #Find set of users who rated item j rating
    users = (df_j == rating_vs).nonzero()[0]
    num_users = len(users)
    #Of those users, how many rated item k the same as the original user?
    df_k = df_sparse[users,k]
    nz = df_k.nonzero()
    nzk = df_k[nz[0]].toarray()
    unq,cts = np.unique(nzk,return_counts=True)
    ct = []

    d = dict(zip(unq, cts))
    d = {k:v for k,v in d.items() if k!=0}
    ct.append(d)
    #This dataframe now has rows which are films, columns which are ratings, and each entry has the number of times
    #which that particular rating occurred for that particular film
    ct_df = pd.DataFrame(ct)
    numerator_cols = ct_df.columns
    ct_df['rowsum'] = ct_df.sum(axis=1)

    if laplacian is not None:
        ct_df[numerator_cols] = ct_df[numerator_cols].fillna(0) + laplacian
        ct_df['rowsum'] = ct_df['rowsum'].fillna(0) + laplacian * total_users
    try:
        retval = (ct_df[rating_vs].values/ct_df['rowsum'].values)[0]
    except:
        retval = 1/total_users
    return retval
    
def calculate_rating(df,user,item,laplacian = None):
    '''
    df: sparse array (csc?) containing the data to be predicted
    user is index of user to look at
    item is item to be rated
    
    for all items that the user rated, we need to see, for each rating possible to use,
    what the likelihood is that user rated item, that rating. In order to do so, we use the formula:
    
    P(r_uj = v_s | Observed ratings of user u) is proportional to P(r_uj = v_s) * PI(P(r_uk | r_uj = v_s) for k in I_u)
    Where I_u is the set of items that user u has rated
    P(r_uj = vs) is called the prior probability, and we'll need to calculate that for each item first
    P(r_uk | r_uj = v_s) is called the conditional probability.
    '''
    
    #priors contains the prior probabilities for all items
    #for a given item, it's the list of all ratings and what ratio, 
    #for each rating for that item, what was the likelihood it occurred
    priors = calculate_priors_sparse(df,laplacian = laplacian)
    vals = list(set(df.data))
    num = 0
    denom = 0
    user_items = df[user,:].nonzero()[1]
    for i in range(len(vals)):
        vs = vals[i]
        try:
            prior = priors.loc[(priors['mid']==item)&(priors['rating']==vs),'prob'].values[0]
        except:
            prior = 0
            continue
        mult = 1
        for k in user_items:
            user_rating = df[user,k]
            conditional = calculate_conditional(df,vs,user_rating,item,k,laplacian = laplacian)
            mult = mult * conditional 
            if np.isnan(conditional or np.isnan(prior)):
                print('vs: %0.3f rating: %0.3f item: %i k: %i')
        num += vs*prior*mult
        denom += prior * mult
    return num/denom

def print_user_movies(df,user):
    movies = df.loc[df['uid']==user,'title']
    print(movies)

In [100]:
test = [
    [1,-1,1,-1,1,-1],
    [1,1,0,-1,-1,-1],
    [0,1,1,-1,-1,0],
    [-1,-1,-1,1,1,1],
    [-1,0,-1,1,1,1]
]
test_array = np.array(test)
test_sparse = csc_matrix(test_array)

In [134]:
p = calculate_priors_sparse(df_sparse,laplacian=0.8)

In [135]:
r = calculate_rating(df_sparse,2,1,laplacian = 0.8)
print(r)

3.98558709162


In [136]:
r = calculate_rating(df_sparse,2,321,laplacian=0.8)
print(r)

4.00000000244


In [137]:
r = calculate_rating(df_sparse,2,595,laplacian=0.8)
print(r)

2.99619678703


In [20]:
r = calculate_rating(df_sparse,580,1,laplacian=0.8)
print(r)

3.66251781122


# Latent Factor Models

In [79]:
def construct_factors(df_sparse,num_factors):
    '''Constructs the num_factors user factors and item factors for the sparse matrix'''
    m = df_sparse.shape[0]
    n = df_sparse.shape[1]
    U = np.random.rand(m,num_factors)
    V = np.random.rand(n,num_factors)
    return U,V

def error(df_sparse,U,V):
    '''Calculates the matrix portion of the error term'''
    d = df_sparse.nonzero()
    rows = d[0].reshape(1,-1).flatten()
    cols = d[1].reshape(1,-1).flatten()
    e = df_sparse[d] - np.matmul(U,V.T)[d]
    e = np.asarray(e).flatten()
    e = csc_matrix((e,(rows,cols)))
    return e

def frobenius_norm(df_sparse,U,V):
    e = error(df_sparse,U,V)
    e = e.power(2)
    return e.sum()

def cost_function(df_sparse,U,V,l):
    '''Calculates the cost function to minimize, l is the regularization term'''
    e = frobenius_norm(df_sparse,U,V)
    usum = np.square(U).sum()
    vsum = np.square(V).sum()
    J = 0.5 * e + (l/2)*usum + (l/2)*vsum
    return J

def update_factors(df_sparse,U,V,l,alpha):
    '''alpha is learning rate, l is regularization parameter'''
    nz = df_sparse.nonzero()
    E = error(df_sparse,U,V)
    U_temp = U*(1-alpha * l) + alpha * (E * V)
    V_temp = V*(1-alpha * l) + alpha * (E.T * U)
    return U_temp,V_temp

def fit(df_sparse,num_factors,learning_rate,regularization_rate):
    U,V = construct_factors(df_sparse,num_factors)
    J = cost_function(df_sparse,U,V,regularization_rate)
    prev = J
    iter_ctr = 0
    while True:
        iter_ctr += 1
        U,V = update_factors(df_sparse,U,V,regularization_rate,learning_rate)
        J = cost_function(df_sparse,U,V,regularization_rate)
        pct_change = J/prev-1
        if abs(pct_change) < 0.001 or iter_ctr > 1000:
            break
        prev = J
    return U,V
    

In [89]:
U,V = fit(df_sparse,10,0.0001,0.8)
pred_rat = np.matmul(U,V.T)

In [81]:
e = error(df_sparse,U,V)

In [82]:
print(df_sparse[0,30])
print(e[0,30])
print(pred_rat[0,30])

2.5
0.0479521599001
2.4520478401


In [83]:
cost_function(df_sparse,U,V,l)

54999.428332574898

In [84]:
e = error(df_sparse,U,V)