In [12]:
import os

# Dataset locations are injected through environment variables; os.getenv
# yields None for any variable that is not set.
topic_file, users_file, ratings_file = (
    os.getenv(env_name)
    for env_name in ("TOPIC_PATH", "USERS_PATH", "RATINGS_PATH")
)

# Example of hard-coded paths that can be used instead (books dataset):
#topic_file = "/data/books_data/books.csv"
#users_file = "/data/books_data/users.csv"
#ratings_file = "/data/books_data/rating.csv"

In [13]:
import pandas as pd
import numpy as np
# Item catalogue; later cells read its ID and NAME columns.
topics = pd.read_csv(topic_file)

In [14]:
# Rating events; later cells use the UserID, MovieID and Rating columns.
ratings = pd.read_csv(ratings_file)
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [15]:
# User profiles; joined to the ratings on UserID in a later cell.
users = pd.read_csv(users_file)
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [16]:
# Attach every rating to its user profile (shared UserID column), then to the
# catalogue entry it refers to (MovieID on the left, ID on the right). Both
# joins use the default inner merge, so rows without a match are dropped.
user_rating = users.merge(ratings, on='UserID')
user_rating_mv = user_rating.merge(topics, left_on="MovieID", right_on="ID")

In [17]:
# Preview the merged user / rating / catalogue frame.
user_rating_mv.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,ID,NAME,CATEGORY
0,1,F,1,10,48067,1193,5,978300760,1193,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,1193,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,1193,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,1193,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,1193,One Flew Over the Cuckoo's Nest (1975),Drama


In [7]:
# Dense UserID x MovieID matrix of ratings; user/item pairs without a rating
# are filled with 0 instead of NaN.
final_df_matrix = pd.pivot(
    user_rating_mv,
    index='UserID',
    columns='MovieID',
    values='Rating',
).fillna(0)

In [8]:
# Per-user (row-wise) mean over the whole matrix, zero-filled cells included,
# then subtract it from every entry of that user's row.
user_ratings_mean = final_df_matrix.values.mean(axis=1)
ratings_demeaned = final_df_matrix.values - user_ratings_mean[:, np.newaxis]

In [11]:
# Sparsity = 1 - (number of rating rows) / (distinct users * distinct items),
# rounded to three decimals.
n_users = user_rating_mv['UserID'].nunique()
n_movies = user_rating_mv['MovieID'].nunique()
sparsity = round(1.0 - user_rating_mv.shape[0] / float(n_users * n_movies), 3)
# Use the percent presentation type instead of str(sparsity * 100): the
# multiplication can surface binary floating-point noise in the printed value
# (e.g. 0.07 * 100 -> 7.000000000000001).
print('The sparsity level of dataset is {:.1%}'.format(sparsity))

The sparsity level of dataset is 95.5%


In [10]:
from scipy.sparse.linalg import svds

# Truncated SVD of the de-meaned rating matrix, keeping k=50 singular triplets.
U, sigma, Vt = svds(ratings_demeaned, k=50)  # Number of singular values and vectors to compute

In [13]:
# Expand the 1-D vector of singular values into a diagonal matrix for the
# reconstruction product. The ndim guard keeps this cell idempotent: np.diag
# applied to an already-diagonal 2-D matrix extracts the diagonal again, so an
# unguarded re-run would silently turn sigma back into a vector.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [14]:
# Low-rank reconstruction U . sigma . Vt, with each user's mean rating added
# back onto that user's row.
all_user_predicted_ratings = (U @ sigma @ Vt) + user_ratings_mean[:, np.newaxis]

In [15]:
# Wrap the predictions in a DataFrame whose columns are the pivot's MovieID
# columns. No index is passed, so rows are labelled 0..n-1 by position; the
# recommendation functions defined later look a user up with
# preds.iloc[userID - 1] and rely on that positional labelling.
preds = pd.DataFrame(all_user_predicted_ratings, columns = final_df_matrix.columns)

preds.head()

MovieID,1,3,4,5,7,8,9,10,11,14,...,17367,17369,17370,17371,17372,17373,17374,17378,17379,17384
0,-0.00618,0.204838,0.063988,0.207591,0.000822,0.051894,-0.129811,0.259161,0.008568,1.052304,...,-0.002802,0.01331,0.145207,-0.005154,0.11584,0.099585,-0.009554,-0.006798,-0.010075,-0.006153
1,0.02695,-0.056118,0.254517,0.197771,-0.003082,0.083399,0.167836,0.468777,0.005286,0.951788,...,-0.007681,-0.039402,-0.005458,-0.001975,-0.104908,-0.091194,0.005701,0.038644,-0.050957,0.012427
2,0.026844,0.133013,0.230149,-0.035236,0.003061,0.236228,0.107544,0.255164,-0.000162,0.642016,...,0.093736,0.067307,0.055685,0.000213,0.131037,-0.052523,0.013288,0.030375,0.016879,0.021292
3,0.002599,0.016706,-0.09701,0.016843,-0.001963,0.038649,0.007276,0.038762,0.007354,0.066394,...,-0.014459,-0.004221,-0.00023,-0.000263,0.023349,-0.002979,-0.004273,-0.004031,-0.001149,0.007001
4,0.006748,0.080554,0.151829,0.266858,-0.003909,0.128621,0.05823,-0.028054,0.005481,0.329677,...,0.016014,-0.011357,0.001018,0.001688,0.016706,-0.002336,0.001286,-0.001304,-0.006112,-0.002839


In [16]:
from mosaicml import *
from mosaicml.constants import MLModelFlavours
import numpy
import pandas as pd

@scoring_func
def score(model, request):
    """Return already-rated and recommended item names for one user.

    Reads ``user_id`` and ``num_rec`` from ``request.json["payload"]`` and
    uses the module-level ``topics``, ``ratings`` and ``preds`` frames.

    Args:
        model: Not referenced by this function.
        request: Object whose ``json`` attribute holds ``{"payload": {...}}``
            with the keys ``"user_id"`` and ``"num_rec"``.

    Returns:
        dict with two lists of ``NAME`` values: ``"watched"`` (up to
        ``num_rec`` items the user has rated, highest rating first) and
        ``"recommended"`` (up to ``num_rec`` items the user has not rated,
        highest predicted rating first).

    Raises:
        ValueError: if ``num_rec`` is negative (a negative slice bound would
            otherwise drop rows from the end instead of limiting the result).
        IndexError: if ``user_id`` does not map to a row of ``preds``. This
            includes ``user_id <= 0``, which previously wrapped around through
            negative positional indexing and returned another user's row.
    """
    payload = request.json["payload"]
    print(payload)

    # JSON clients may send numbers as strings; normalise before doing
    # arithmetic on them or comparing them with the UserID column.
    num_recommendations = int(payload["num_rec"])
    userID = int(payload["user_id"])
    if num_recommendations < 0:
        raise ValueError('num_rec must be non-negative, got {0}'.format(num_recommendations))

    # preds is addressed by position and user IDs are assumed to start at 1,
    # so user k is expected in row k - 1.
    user_row_number = userID - 1
    if not 0 <= user_row_number < len(preds):
        raise IndexError('user_id {0} is outside the range 1..{1} covered by preds'
                         .format(userID, len(preds)))

    # One row of predictions as a two-column frame (MovieID, Predictions).
    # Naming the Series explicitly avoids depending on the row label of preds.
    user_predictions = preds.iloc[user_row_number].rename('Predictions').reset_index()

    # The user's own ratings, enriched with catalogue info, best rated first.
    user_full = (
        ratings[ratings.UserID == userID]
        .merge(topics, how='left', left_on='MovieID', right_on='ID')
        .sort_values(['Rating'], ascending=False)
    )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating items the user has not rated yet.
    # The final column slice drops the trailing Predictions column.
    unseen = topics[~topics['ID'].isin(user_full['MovieID'])]
    recommendations = (
        unseen.merge(user_predictions, how='left', left_on='ID', right_on='MovieID')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations, :-1]
    )

    return {
        "watched": user_full.head(num_recommendations).NAME.to_list(),
        "recommended": recommendations.NAME.to_list(),
    }

def recommend_movies(userID, num_recommendations=10):
    """Return a user's top-rated items and the best unrated recommendations.

    Uses the module-level ``topics``, ``ratings`` and ``preds`` frames.

    Args:
        userID: Integer user ID; IDs are assumed to start at 1 and to map to
            row ``userID - 1`` of ``preds``.
        num_recommendations: Maximum number of rows in each returned frame
            (default 10, the value that used to be hard-coded).

    Returns:
        Tuple ``(rated, recommendations)``: ``rated`` holds the user's ratings
        merged with catalogue info, highest rating first; ``recommendations``
        holds catalogue rows the user has not rated, highest predicted rating
        first, without the prediction column.

    Raises:
        ValueError: if ``num_recommendations`` is negative.
        IndexError: if ``userID`` does not map to a row of ``preds``. This
            includes ``userID <= 0``, which previously wrapped around through
            negative positional indexing and returned another user's row.
    """
    if num_recommendations < 0:
        raise ValueError('num_recommendations must be non-negative, got {0}'
                         .format(num_recommendations))

    user_row_number = userID - 1  # User ID starts at 1, not 0
    if not 0 <= user_row_number < len(preds):
        raise IndexError('userID {0} is outside the range 1..{1} covered by preds'
                         .format(userID, len(preds)))

    # One row of predictions as a two-column frame (MovieID, Predictions).
    # Naming the Series explicitly avoids depending on the row label of preds.
    user_predictions = preds.iloc[user_row_number].rename('Predictions').reset_index()

    # The user's own ratings, enriched with catalogue info, best rated first.
    user_full = (
        ratings[ratings.UserID == userID]
        .merge(topics, how='left', left_on='MovieID', right_on='ID')
        .sort_values(['Rating'], ascending=False)
    )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating items the user has not rated yet.
    # The final column slice drops the trailing Predictions column.
    unseen = topics[~topics['ID'].isin(user_full['MovieID'])]
    recommendations = (
        unseen.merge(user_predictions, how='left', left_on='ID', right_on='MovieID')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations, :-1]
    )

    return user_full.head(num_recommendations), recommendations



In [18]:
# Run the recommender locally for user 192 and keep both returned frames.
user_already_rated, for_recommend = recommend_movies(192)

User 192 has already rated 21 movies.
Recommending highest 10 predicted ratings movies not already rated.


In [19]:
# Register the scoring function. The first argument (the model object) is
# None; score() works from the module-level frames instead.
# NOTE(review): flavour is set to sklearn although no sklearn model is passed — confirm this is what register_model expects.
modl_ob = register_model(None, score, name="User_Recommendation", 
                    description="Recommend items for an user based on his historical ratings.",
                    flavour=MLModelFlavours.sklearn,  pretty_output=False)

VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

In [61]:
# Deploy the last entry of the registered model's "versions" collection
# (presumably the newest version — TODO confirm the ordering).
model_id = modl_ob["id"]
version_id = modl_ob["versions"][len(modl_ob["versions"])-1]["id"]
# NOTE(review): the third argument is a hard-coded UUID whose meaning is not visible here — confirm what it identifies and consider moving it to configuration.
deploy_model(model_id, version_id, "f219837e-9d30-4aec-9937-4c0e925fbf57")

Unnamed: 0,ID,NAME,CATEGORY,MovieID
2287,2424,You've Got Mail (1998),Comedy|Romance,2424.0
319,339,While You Were Sleeping (1995),Comedy|Romance,339.0
553,587,Ghost (1990),Comedy|Romance|Thriller,587.0
1661,1777,"Wedding Singer, The (1998)",Comedy|Romance,1777.0
667,708,"Truth About Cats & Dogs, The (1996)",Comedy|Romance,708.0
2186,2321,Pleasantville (1998),Comedy,2321.0
1791,1923,There's Something About Mary (1998),Comedy,1923.0
1239,1307,When Harry Met Sally... (1989),Comedy|Romance,1307.0
3106,3255,"League of Their Own, A (1992)",Comedy|Drama,3255.0
479,509,"Piano, The (1993)",Drama|Romance,509.0
