In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.decomposition import NMF
# use surprise package with much simpler interface
import surprise

from surprise import NMF
from surprise.reader import Reader
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from collections import defaultdict

In [5]:
# Share of the ratings used for training, and the seed that makes the sample repeatable.
TRAIN_FRACTION = 0.02
SAMPLE_SEED = 200

# Each record in ratings.dat is `user_id::movie_id::rating::rating_timestamp`.
# The two-character separator is why the python parsing engine is requested.
ratings = pd.read_csv('data/ratings.dat', sep='::',
                      names=['user_id', 'movie_id', 'rating', 'rating_timestamp'],
                      engine="python")
# The timestamp is not used by any later cell.
ratings = ratings.drop(columns=["rating_timestamp"])

# Train on a small subset only; larger subsets take very long to train on this workstation.
# Ratings are shifted up by one to match the (1, 11) rating_scale given to the
# Surprise Reader below, so every estimate produced later is on that shifted scale.
train = (
    ratings
    .sample(frac=TRAIN_FRACTION, random_state=SAMPLE_SEED)
    .assign(rating=lambda sampled: sampled['rating'] + 1)
)
train.describe()

Unnamed: 0,user_id,movie_id,rating
count,17769.0,17769.0,17769.0
mean,34889.802296,2177458.0,8.295402
std,20010.236408,2010533.0,1.87273
min,10.0,10.0,1.0
25%,17964.0,770828.0,7.0
50%,34979.0,1703957.0,8.0
75%,51842.0,2872718.0,10.0
max,69319.0,12724620.0,11.0


In [6]:
# Use the Surprise library to perform NMF.
N_FACTORS = 2
# Seeds the random initialisation of the factors so a re-run gives the same estimates.
NMF_SEED = 200

# rating_scale matches the shifted ratings stored in `train` (1 to 11).
reader = Reader(rating_scale=(1, 11))
# load_from_df expects exactly three columns in the order user, item, rating.
data = Dataset.load_from_df(train[["user_id", "movie_id", "rating"]], reader=reader)

print("Loaded dataset into surprise")
# No hold-out split: every sampled rating is used for fitting.
trainset = data.build_full_trainset()

# Every (user, item) pair that has no rating in the trainset. The size grows with
# users x items, which is the main memory and time cost of this cell.
testset = trainset.build_anti_testset()

algo_NMF = NMF(n_factors=N_FACTORS, random_state=NMF_SEED)
algo_NMF.fit(trainset)


# Predict ratings for all pairs (u, i) that are NOT in the training set.
# The estimates are on the shifted 1-11 scale used for `train`.
predictions = algo_NMF.test(testset)

Loaded dataset into surprise


In [15]:
def get_top_n(predictions, userId, movies_df, ratings_df, n = 10):
    '''Return one user's rating history and their top-N predicted movies.

    Also prints how many movies the user has already rated.

    Args:
        predictions: iterable of 5-item records unpacked as
            (user id, movie id, true rating, estimated rating, details),
            e.g. the list returned by a Surprise algorithm's ``test``.
        userId: id of the user to report on.
        movies_df: DataFrame with a ``movie_id`` column; its other columns
            (titles, genres, ...) are merged onto both results.
        ratings_df: DataFrame with ``user_id``, ``movie_id`` and ``rating``
            columns holding the known ratings.
        n: number of predictions to keep (default 10).

    Returns:
        Tuple ``(hist_usr, pred_usr)``:
        hist_usr -- every row of ``ratings_df`` for the user, highest rating
            first, left-merged with ``movies_df`` (not truncated to ``n``).
        pred_usr -- at most ``n`` rows with columns ``user_id``, ``movie_id``,
            ``rating_pred`` (highest estimate first), left-merged with
            ``movies_df``.
    '''
    # 1. Keep only the requested user's predictions. Grouping and sorting the
    #    predictions of every user just to discard all but one is wasted work
    #    on a large prediction list.
    user_preds = [(uid, iid, est)
                  for uid, iid, true_r, est, _ in predictions
                  if uid == userId]

    # 2. Highest estimate first. The sort is stable, so ties keep the order in
    #    which they appear in `predictions`.
    user_preds.sort(key = lambda pred: pred[2], reverse = True)
    top_preds = user_preds[:n]

    # 3. Tell how many movies the user has already rated.
    user_data = ratings_df[ratings_df.user_id == userId]
    print('User {0} has already rated {1} movies.'.format(userId, user_data.shape[0]))

    # 4. Top-N predictions with the movie metadata merged on.
    preds_df = pd.DataFrame(top_preds, columns=["user_id", "movie_id", "rating_pred"])
    pred_usr = preds_df.merge(movies_df, how = 'left', on = 'movie_id')

    # 5. Full rating history, best rated first, with the same metadata, so the
    #    recommendations can be compared against what the user actually liked.
    hist_usr = (user_data
                .sort_values("rating", ascending = False)
                .merge(movies_df, how = 'left', on = 'movie_id'))

    return hist_usr, pred_usr

In [39]:
# Build the top-n recommendation table for two hand-picked users.
movies = pd.read_csv(
    'data/movies.dat',
    sep='::',
    names=['movie_id','movie_title_year','genres'],
    engine="python",
)

# two users the author chose for their number of ratings
pred_users=[17694,34979]
preds=[]

for user_id in pred_users:
    # positional order: predictions, userId, movies_df, ratings_df
    hist_NMF_1, pred_NMF_1 = get_top_n(predictions, user_id, movies, ratings)
    preds.append(pred_NMF_1)

User 17694 has already rated 5 movies.
User 34979 has already rated 86 movies.


In [48]:
# Predicted top movies for the second selected user (index 1 of `preds`).
# Only the last expression of a cell is rendered, so the summary statistics
# call that used to precede it produced nothing and has been dropped.
preds[1].head(20)


Unnamed: 0,user_id,movie_id,rating_pred,movie_title_year,genres
0,34979,2345112,8.07452,Parkland (2013),Drama|History|Mystery|Thriller
1,34979,5434972,6.75036,Fastball (2016),Documentary|Sport
2,34979,317219,6.748894,Cars (2006),Animation|Comedy|Family|Fantasy|Sport
3,34979,111495,6.184564,Trois couleurs: Rouge (1994),Drama|Mystery|Romance
4,34979,1800741,5.828828,Step Up Revolution (2012),Drama|Music|Romance
5,34979,116922,5.759917,Lost Highway (1997),Mystery|Thriller
6,34979,2914386,5.720653,Tu seras un homme (2013),Comedy|Drama
7,34979,87884,5.705186,"Paris, Texas (1984)",Drama
8,34979,2094877,5.694312,Les saveurs du Palais (2012),Biography|Comedy
9,34979,145681,5.675238,The Bone Collector (1999),Crime|Drama|Mystery|Thriller


In [49]:
# display the movies recommended for the second selected user (index 1 of `preds`)
second_user_preds = preds[1]
second_user_preds.head(10)

Unnamed: 0,user_id,movie_id,rating_pred,movie_title_year,genres
0,34979,2345112,8.07452,Parkland (2013),Drama|History|Mystery|Thriller
1,34979,5434972,6.75036,Fastball (2016),Documentary|Sport
2,34979,317219,6.748894,Cars (2006),Animation|Comedy|Family|Fantasy|Sport
3,34979,111495,6.184564,Trois couleurs: Rouge (1994),Drama|Mystery|Romance
4,34979,1800741,5.828828,Step Up Revolution (2012),Drama|Music|Romance
5,34979,116922,5.759917,Lost Highway (1997),Mystery|Thriller
6,34979,2914386,5.720653,Tu seras un homme (2013),Comedy|Drama
7,34979,87884,5.705186,"Paris, Texas (1984)",Drama
8,34979,2094877,5.694312,Les saveurs du Palais (2012),Biography|Comedy
9,34979,145681,5.675238,The Bone Collector (1999),Crime|Drama|Mystery|Thriller


In [43]:

# Write one row per user in the style requested in the assignment:
#   Userid content_id1:value1 content_id3:value2
#
# The rows are collected in a list and turned into a DataFrame once, so every
# user in `preds` ends up in the file. Each user id is paired with its own
# prediction frame instead of relying on a loop variable left over from an
# earlier cell.
# NOTE(review): rating_pred is on the shifted 1-11 scale the model was trained
# on (training ratings were incremented by 1) -- confirm whether the assignment
# expects the original scale.
records = []
for uid, pred_one_user in zip(pred_users, preds):
    pairs = zip(pred_one_user["movie_id"], pred_one_user["rating_pred"])
    records.append({
        "user_id": uid,
        # space-separated `movie_id:rating_pred` items, no leading separator
        "predictions": " ".join("{0}:{1}".format(movie_id, rating_pred)
                                for movie_id, rating_pred in pairs),
    })

df = pd.DataFrame(records, columns=["user_id", "predictions"])

df.to_csv('charts/prediction_all'+'.csv',index=False)

In [44]:
# Closing remarks on the approach and its limitations, emitted as cell output.
closing_notes = '''
I used surprise package which provides a convenience wrapper on sklearn to implement the recommendation engine

I did hit limitations on training data size so had to limit the training dataset size to only 2% of the total ratings corpus

This does limit the number of ratings we saw for a given user, which result in very little data and limited 
recommendations. The result will be much higher quality if we can have the model train on larger subset (or full ratings set)

'''
print(closing_notes)


I used surprise package which provides a convenience wrapper on sklearn to implement the recommendation engine

I did hit limitations on training data size so had to limit the training dataset size to only 2% of the total ratings corpus

This does limit the number of ratings we saw for a given user, which result in very little data and limited 
recommendations. The result will be much higher quality if we can have the model train on larger subset (or full ratings set)


