In [1]:
import numpy as np
from numpy.random import choice
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csc_matrix, csr_matrix, lil_matrix

def load_movielens(mean_adjust = 'user'):
    '''Loads movielens set into a Dataframe. Adjusted rating is user by default'''
    path = 'ml-latest-small\\'
    df = pd.read_csv(path + 'ratings.csv')
    movie_xref = pd.read_csv(path + 'movies.csv')
    users = df.groupby('userId').mean()['rating'].reset_index()
    users['user_mean_rating'] = users['rating']
    users['uid'] = users.index
    users.drop('rating',inplace=True,axis=1)
    df = df.merge(users,left_on='userId',right_on='userId')
    movies = df.groupby('movieId').mean()['rating'].reset_index()
    movies['movie_mean_rating'] = movies['rating']
    movies = movies.merge(movie_xref,left_on='movieId',right_on='movieId')
    movies.drop('rating',inplace=True,axis=1)
    movies['mid'] = movies.index
    df = df.merge(movies,left_on='movieId',right_on='movieId')
    if mean_adjust == 'user':
        df['adjusted_rating'] = df['rating'] - df['user_mean_rating']
    elif mean_adjust == 'movie':
        df['adjusted_rating'] = df['rating'] - df['movie_mean_rating']
    return df

def test_sparse(sparse_matrix):
    n = sparse_matrix.nnz
    s = sparse_matrix.shape
    return 1-n/(s[0]*s[1])

def train_test_split(sparse_matrix,pct):
    '''Splits a sparse matrix into two sets - a train set and a test set'''
    nz_coord = list(zip(*sparse_matrix.nonzero()))
    nz_coord_array = np.array(nz_coord)
    num_rand = int(np.floor(len(nz_coord)*(pct/100.0)))
    n = len(nz_coord)
    c = choice(n,size=num_rand,replace=False)
    rows = nz_coord_array[c][:,0]
    cols = nz_coord_array[c][:,1]
    vals = np.array(sparse_matrix[rows,cols]).flatten()
    train = sparse_matrix.copy()
    test = lil_matrix(train.shape)
    test[rows,cols] = vals
    test = test.tocsr()
    train[rows,cols] = 0
    train.eliminate_zeros()
    return train,test

df = load_movielens()
users_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
df_sparse = csr_matrix((ratings_list,(users_list,movies_list)))
train,test = train_test_split(df_sparse,20)


In [2]:
movie_xref = df[['mid','title','genres']]
movie_xref = movie_xref.drop_duplicates()

# Bagging

In [3]:
import implicit

In [4]:
def bootstrap(sparse_matrix):
    '''
        Row-wise bootstrapping of the sparse matrix
        Returns the sampled rows as after CF we'll need to average out ratings over any 
        duplicated users
    '''
    rows = sparse_matrix.shape[0]
    rows_to_sample = np.random.choice(rows,rows)
    bootstrapped_matrix = sparse_matrix[rows_to_sample,:]
    return bootstrapped_matrix, rows_to_sample

In [55]:
user_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
ratings_list_implicit = [1 for i in ratings_list]
new_sparse = csr_matrix((ratings_list,(user_list,movies_list)))
#1 or 0 each value
new_sparse = new_sparse.sign()
train,test = train_test_split(new_sparse,20)

In [56]:
bs_train, bs_rows = bootstrap(train)

In [57]:
bs_train

<671x9066 sparse matrix of type '<class 'numpy.float64'>'
	with 78819 stored elements in Compressed Sparse Row format>

In [58]:
def user_rated_movies(df,sparse_matrix,uid,cluster=False):
    '''uid can be cluster or user id'''
    mid = sparse_matrix[uid].nonzero()[1]
    if cluster:
        user_string = 'cluster'
    else:
        user_string = 'uid'    
    df_rows = df.loc[(df[user_string]==uid)&(df['mid'].isin(train[uid].nonzero()[1])),:]
    gb = df_rows.groupby('title').agg(['count','mean'])['rating']
    gb = gb.sort_values(by='mean',ascending=False)
    print(gb[:10])
    
def train_model(df,train):
    factors = 20
    regularization = 0.1
    iterations = 50
    model = implicit.als.AlternatingLeastSquares(factors=factors,regularization=regularization,iterations=iterations)

    # train the model on a sparse matrix of item/user/confidence weights
    model.fit(train.T)
    return model

def bootstrap_to_user_item_matrix(bootstrap_model, bootstrap_rows):
    '''
    Each bootstrapped fit model will have some duplicate rows in it
    This function will recreate a user-item matrix so that there are no duplicates
    '''
    new_pred_matrix = np.zeros((m.user_factors.shape[0],m.item_factors.shape[0]))
    user_f = bootstrap_model.user_factors
    item_f = bootstrap_model.item_factors
    pred_matrix = item_f.dot(user_f.T).T
    for i in range(new_pred_matrix.shape[0]):
        bs_dups = np.where(bootstrap_rows==i)[0]
        if len(bs_dups) > 0:
            model_pred = pred_matrix[bs_dups].mean(axis=0)
            new_pred_matrix[i] = model_pred
    return new_pred_matrix
        

def bag_recommendations(df, bootstraps_models, bootstraps_rows,shape):
    '''
        df is original dataframe
        bootstraps is list of bootstrapped matrices
        bootstraps_rows is list of rows sampled for each bootstrap
    '''
    bagged_model = np.zeros(shape)
    user_ct_for_avg = np.zeros(shape[0])
    for i in range(len(bootstraps_models)):
        bs_model = bootstraps_models[i]
        bs_rows = bootstraps_rows[i]
        pred_matrix = bootstrap_to_user_item_matrix(bs_model,bs_rows)
        bagged_model = np.add(bagged_model,pred_matrix)
        users_for_this_model = np.unique(pred_matrix.nonzero()[0])
        user_ct_for_avg[users_for_this_model] += 1
    return bagged_model/user_ct_for_avg[:,None]
    
def recommend_movies_bootstrap(df,prediction_matrix,uid):
    '''Predict what a user (or cluster) would like based on trained model'''
    top_10 = prediction_matrix[uid].argsort()[-10:][::-1]
    movie_xref = df[['mid','title','genres']]
    movie_xref = movie_xref.drop_duplicates()
    print(movie_xref[movie_xref['mid'].isin(top_10)])

In [59]:
#Need to figure out the text from Aggarwal:

##For each training data set, an item rating can be predicted
##for a user only if that user is represented at least once in the matrix. In such a case,
##the predicted rating from that ensemble component is the average rating of that item
##over the duplicate occurrences of that user. 

In [60]:
new_pred_matrix.shape

NameError: name 'new_pred_matrix' is not defined

In [61]:
m = train_model(df,bs_train)



In [62]:
np.where(bs_rows==9000)[0]

array([], dtype=int64)

In [63]:
user_f = m.user_factors
item_f = m.item_factors
pred_matrix = item_f.dot(user_f.T).T

In [64]:
b = bootstrap_to_user_item_matrix(m,bs_rows)

In [65]:
user_ct_for_avg = np.zeros(671)

In [66]:
users_for_this_model = np.unique(b.nonzero()[0])

In [67]:
user_ct_for_avg[users_for_this_model] += 1

In [68]:
models = []
rows = []
for i in range(5):
    bs_train, bs_rows = bootstrap(train)
    m = train_model(df,bs_train)
    models.append(m)
    rows.append(bs_rows)



In [107]:
pred_matrix.shape

(671, 9066)

In [69]:
shape = (671, 9066)
pred = bag_recommendations(df,models,rows,shape)



In [70]:
pred.shape

(671, 9066)

In [71]:
user_rated_movies(df,new_sparse,32)

                                 count  mean
title                                       
Bully (2001)                         1   5.0
Office Space (1999)                  1   5.0
Roger & Me (1989)                    1   5.0
Caddyshack II (1988)                 1   5.0
American Movie (1999)                1   5.0
Shining, The (1980)                  1   5.0
Uncle Buck (1989)                    1   5.0
Punch-Drunk Love (2002)              1   5.0
Sex, Lies, and Videotape (1989)      1   5.0
Stand by Me (1986)                   1   4.0


In [72]:
top_10 = pred[2].argsort()[-10:][::-1]

In [73]:
recommend_movies_bootstrap(df,pred,32)

        mid                                title  \
11460  2288               American Beauty (1999)   
11680  2374                    Fight Club (1999)   
21849  1728                   Beetlejuice (1988)   
23412  2340      Ferris Bueller's Day Off (1986)   
23564  2398      Who Framed Roger Rabbit? (1988)   
25794  1486  There's Something About Mary (1998)   
26330  2164                  American Pie (1999)   
26441  2407          Being John Malkovich (1999)   
27007  3156              Meet the Parents (2000)   
56352  3228    O Brother, Where Art Thou? (2000)   

                                                  genres  
11460                                      Drama|Romance  
11680                        Action|Crime|Drama|Thriller  
21849                                     Comedy|Fantasy  
23412                                             Comedy  
23564  Adventure|Animation|Children|Comedy|Crime|Fant...  
25794                                     Comedy|Romance  
26330         

In [74]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating,uid,movie_mean_rating,title,genres,mid,adjusted_rating
0,1,31,2.5,1260759144,2.55,0,3.178571,Dangerous Minds (1995),Drama,30,-0.05
1,7,31,3.0,851868750,3.465909,6,3.178571,Dangerous Minds (1995),Drama,30,-0.465909
2,31,31,4.0,1273541953,4.166667,30,3.178571,Dangerous Minds (1995),Drama,30,-0.166667
3,32,31,4.0,834828440,3.666667,31,3.178571,Dangerous Minds (1995),Drama,30,0.333333
4,36,31,3.0,847057202,3.615385,35,3.178571,Dangerous Minds (1995),Drama,30,-0.615385


# Test Bagging

In [89]:
from sklearn import metrics
def auc_score(predictions, test):
    '''
    This simple function will output the area under the curve using sklearn's metrics. 
    
    parameters:
    
    - predictions: your prediction output
    
    - test: the actual target result you are comparing to
    
    returns:
    
    - AUC (area under the Receiver Operating Characterisic curve)
    '''
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)


def calc_mean_auc(training_set, altered_users, predictions, test_set):
    '''
    This function will calculate the mean AUC by user for any user that had their user-item matrix altered. 
    
    parameters:
    
    training_set - The training set resulting from make_train, where a certain percentage of the original
    user/item interactions are reset to zero to hide them from the model 
    
    predictions - The matrix of your predicted ratings for each user/item pair as output from the implicit MF.
    
    altered_users - The indices of the users where at least one user/item pair was altered from make_train function
    
    test_set - The test set constucted earlier from make_train function
    
    
    
    returns:
    
    The mean AUC (area under the Receiver Operator Characteristic curve) of the test set only on user-item interactions
    there were originally zero to test ranking ability in addition to the most popular items as a benchmark.
    '''
    
    
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
    for user in altered_users: # Iterate through each user that had an item altered
        training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
        zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
        # Get the predicted values based on our user/item vectors
        pred=predictions[user,zero_inds].reshape(-1)
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        pred_auc = auc_score(pred, actual)
        pop_auc = auc_score(pop, actual)
        if np.isnan(pred_auc):
            pred_auc = 0
        if np.isnan(pop_auc):
            pop_auc = 0
        store_auc.append(pred_auc) # Calculate AUC for the given user and store
        popularity_auc.append(pop_auc) # Calculate AUC using most popular and score
    # End users iteration
    
    print(float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc)))
    return store_auc, popularity_auc

In [24]:
altered_users = np.unique(test.nonzero()[0])

In [29]:
training_row = bs_train[0,:].toarray().reshape(-1)
zero_inds = np.where(training_row==0)

In [43]:
pred[0,zero_inds].reshape(-1).shape

(9044,)

In [45]:
test[0,:].toarray()[0,zero_inds].reshape(-1).shape

(9044,)

In [27]:
pred[0].toarray()

array([-0.02106492,  0.00569918,  0.02313504, ...,  0.        ,
       -0.00095914, -0.0027684 ])

In [84]:
pred = np.nan_to_num(pred)

In [90]:
s,p = calc_mean_auc(bs_train,altered_users,pred,test)



0.896 0.912


In [92]:
m = train_model(df,train)
user_f = m.user_factors
item_f = m.item_factors
pred_matrix = item_f.dot(user_f.T).T
s,p = calc_mean_auc(train,altered_users,pred_matrix,test)



0.907 0.92
