In [10]:
import numpy as np
from numpy.random import choice
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csc_matrix, csr_matrix, lil_matrix

def load_movielens(mean_adjust = 'user'):
    '''Loads movielens set into a Dataframe. Adjusted rating is user by default'''
    path = 'ml-latest-small\\'
    df = pd.read_csv(path + 'ratings.csv')
    movie_xref = pd.read_csv(path + 'movies.csv')
    users = df.groupby('userId').mean()['rating'].reset_index()
    users['user_mean_rating'] = users['rating']
    users['uid'] = users.index
    users.drop('rating',inplace=True,axis=1)
    df = df.merge(users,left_on='userId',right_on='userId')
    movies = df.groupby('movieId').mean()['rating'].reset_index()
    movies['movie_mean_rating'] = movies['rating']
    movies = movies.merge(movie_xref,left_on='movieId',right_on='movieId')
    movies.drop('rating',inplace=True,axis=1)
    movies['mid'] = movies.index
    df = df.merge(movies,left_on='movieId',right_on='movieId')
    if mean_adjust == 'user':
        df['adjusted_rating'] = df['rating'] - df['user_mean_rating']
    elif mean_adjust == 'movie':
        df['adjusted_rating'] = df['rating'] - df['movie_mean_rating']
    return df

def test_sparse(sparse_matrix):
    n = sparse_matrix.nnz
    s = sparse_matrix.shape
    return 1-n/(s[0]*s[1])

def train_test_split(sparse_matrix,pct):
    '''Splits a sparse matrix into two sets - a train set and a test set'''
    nz_coord = list(zip(*sparse_matrix.nonzero()))
    nz_coord_array = np.array(nz_coord)
    num_rand = int(np.floor(len(nz_coord)*(pct/100.0)))
    n = len(nz_coord)
    c = choice(n,size=num_rand,replace=False)
    rows = nz_coord_array[c][:,0]
    cols = nz_coord_array[c][:,1]
    vals = np.array(sparse_matrix[rows,cols]).flatten()
    train = sparse_matrix.copy()
    test = lil_matrix(train.shape)
    test[rows,cols] = vals
    test = test.tocsr()
    train[rows,cols] = 0
    train.eliminate_zeros()
    return train,test

df = load_movielens()
users_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
df_sparse = csr_matrix((ratings_list,(users_list,movies_list)))
train,test = train_test_split(df_sparse,20)


In [148]:
movie_xref = df[['mid','title','genres']]
movie_xref = movie_xref.drop_duplicates()

# Bagging

In [8]:
import implicit

In [9]:
def bootstrap(sparse_matrix):
    '''
        Row-wise bootstrapping of the sparse matrix
        Returns the sampled rows as after CF we'll need to average out ratings over any 
        duplicated users
    '''
    rows = sparse_matrix.shape[0]
    rows_to_sample = np.random.choice(rows,rows)
    bootstrapped_matrix = sparse_matrix[rows_to_sample,:]
    return bootstrapped_matrix, rows_to_sample

In [24]:
user_list = df['uid']
movies_list = df['mid']
ratings_list = df['rating']
ratings_list_implicit = [1 for i in ratings_list]
new_sparse = csr_matrix((ratings_list,(user_list,movies_list)))
#1 or 0 each value
#new_sparse = new_sparse.sign()
train,test = train_test_split(new_sparse,20)

In [25]:
bs_train, bs_rows = bootstrap(train)

In [26]:
bs_train

<671x9066 sparse matrix of type '<class 'numpy.float64'>'
	with 79215 stored elements in Compressed Sparse Row format>

In [155]:
def user_rated_movies(df,sparse_matrix,uid,cluster=False):
    '''uid can be cluster or user id'''
    mid = sparse_matrix[uid].nonzero()[1]
    if cluster:
        user_string = 'cluster'
    else:
        user_string = 'uid'    
    df_rows = df.loc[(df[user_string]==uid)&(df['mid'].isin(train[uid].nonzero()[1])),:]
    gb = df_rows.groupby('title').agg(['count','mean'])['rating']
    gb = gb.sort_values(by='mean',ascending=False)
    print(gb[:10])
    
def train_model(df,train):
    factors = 20
    regularization = 0.1
    iterations = 50
    model = implicit.als.AlternatingLeastSquares(factors=factors,regularization=regularization,iterations=iterations)

    # train the model on a sparse matrix of item/user/confidence weights
    model.fit(train.T)
    return model

def bootstrap_to_user_item_matrix(bootstrap_model, bootstrap_rows):
    '''
    Each bootstrapped fit model will have some duplicate rows in it
    This function will recreate a user-item matrix so that there are no duplicates
    '''
    new_pred_matrix = np.zeros((m.user_factors.shape[0],m.item_factors.shape[0]))
    user_f = bootstrap_model.user_factors
    item_f = bootstrap_model.item_factors
    pred_matrix = item_f.dot(user_f.T).T
    for i in range(new_pred_matrix.shape[0]):
        bs_dups = np.where(bootstrap_rows==i)[0]
        if len(bs_dups) > 0:
            model_pred = pred_matrix[bs_dups].mean(axis=0)
            new_pred_matrix[i] = model_pred
    return new_pred_matrix
        

def bag_recommendations(df, bootstraps_models, bootstraps_rows,shape):
    '''
        df is original dataframe
        bootstraps is list of bootstrapped matrices
        bootstraps_rows is list of rows sampled for each bootstrap
    '''
    bagged_model = np.zeros(shape)
    user_ct_for_avg = np.zeros(shape[0])
    for i in range(len(bootstraps_models)):
        bs_model = bootstraps_models[i]
        bs_rows = bootstraps_rows[i]
        pred_matrix = bootstrap_to_user_item_matrix(bs_model,bs_rows)
        bagged_model = np.add(bagged_model,pred_matrix)
        users_for_this_model = np.unique(pred_matrix.nonzero()[0])
        user_ct_for_avg[users_for_this_model] += 1
    return bagged_model/user_ct_for_avg[:,None]
    
def recommend_movies_bootstrap(df,prediction_matrix,uid):
    '''Predict what a user (or cluster) would like based on trained model'''
    top_10 = prediction_matrix[uid].argsort()[-10:][::-1]
    movie_xref = df[['mid','title','genres']]
    movie_xref = movie_xref.drop_duplicates()
    print(movie_xref[movie_xref['mid'].isin(top_10)])

In [38]:
#Need to figure out the text from Aggarwal:

##For each training data set, an item rating can be predicted
##for a user only if that user is represented at least once in the matrix. In such a case,
##the predicted rating from that ensemble component is the average rating of that item
##over the duplicate occurrences of that user. 

In [68]:
new_pred_matrix.shape

(671, 9066)

In [33]:
m = train_model(df,bs_train)



In [72]:
np.where(bs_rows==9000)[0]

array([], dtype=int64)

In [49]:
user_f = m.user_factors
item_f = m.item_factors
pred_matrix = item_f.dot(user_f.T).T

In [83]:
b = bootstrap_to_user_item_matrix(m,bs_rows)

In [98]:
user_ct_for_avg = np.zeros(671)

In [99]:
users_for_this_model = np.unique(b.nonzero()[0])

In [102]:
user_ct_for_avg[users_for_this_model] += 1

In [105]:
models = []
rows = []
for i in range(5):
    bs_train, bs_rows = bootstrap(train)
    m = train_model(df,bs_train)
    models.append(m)
    rows.append(bs_rows)



In [107]:
pred_matrix.shape

(671, 9066)

In [122]:
shape = (671, 9066)
pred = bag_recommendations(df,models,rows,shape)

In [123]:
pred.shape

(671, 9066)

In [162]:
user_rated_movies(df,new_sparse,32)

                                 count  mean
title                                       
Best in Show (2000)                  1   5.0
American Movie (1999)                1   5.0
Roger & Me (1989)                    1   5.0
Poltergeist (1982)                   1   5.0
Caddyshack II (1988)                 1   5.0
Bully (2001)                         1   5.0
Sex, Lies, and Videotape (1989)      1   5.0
Shining, The (1980)                  1   5.0
Novocaine (2001)                     1   4.0
Pollock (2000)                       1   4.0


In [154]:
top_10 = pred[2].argsort()[-10:][::-1]

array([ 266,  525,  321,  284,  472, 2288, 2374, 2062,  232,  100], dtype=int64)

In [163]:
recommend_movies_bootstrap(df,pred,32)

        mid                                         title  \
11680  2374                             Fight Club (1999)   
18163  1013                            Stand by Me (1986)   
18804  1039                               Heathers (1989)   
20439  1530                    Breakfast Club, The (1985)   
22637  2147  Austin Powers: The Spy Who Shagged Me (1999)   
26224  2004                           Office Space (1999)   
26330  2164                           American Pie (1999)   
51194  1905                               Rushmore (1998)   
51727  2082                               Election (1999)   
58001  3860                  Royal Tenenbaums, The (2001)   

                            genres  
11680  Action|Crime|Drama|Thriller  
18163              Adventure|Drama  
18804                       Comedy  
20439                 Comedy|Drama  
22637      Action|Adventure|Comedy  
26224                 Comedy|Crime  
26330               Comedy|Romance  
51194                 Comedy|Dra

In [139]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating,uid,movie_mean_rating,title,genres,mid,adjusted_rating
0,1,31,2.5,1260759144,2.55,0,3.178571,Dangerous Minds (1995),Drama,30,-0.05
1,7,31,3.0,851868750,3.465909,6,3.178571,Dangerous Minds (1995),Drama,30,-0.465909
2,31,31,4.0,1273541953,4.166667,30,3.178571,Dangerous Minds (1995),Drama,30,-0.166667
3,32,31,4.0,834828440,3.666667,31,3.178571,Dangerous Minds (1995),Drama,30,0.333333
4,36,31,3.0,847057202,3.615385,35,3.178571,Dangerous Minds (1995),Drama,30,-0.615385
