In [2]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
from surprise import accuracy

In [6]:
# Parse the '::'-delimited ratings file; every line carries
# user, item, rating and timestamp fields in that order.
reader = Reader(
    line_format='user item rating timestamp',
    sep='::',
)
ratings = Dataset.load_from_file('ratings.dat', reader=reader)

# Partition the ratings into 5 folds for cross validation.
ratings.split(n_folds=5)

# SVD model, constructed with its default arguments.
algo = SVD()

In [4]:
# Evaluate the algorithm on the ratings dataset, measuring RMSE and MAE,
# then print the resulting performance summary.
performance = evaluate(
    algo,
    ratings,
    measures=['RMSE', 'MAE'],
)
print_perf(performance)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8707
MAE:  0.6835
------------
Fold 2
RMSE: 0.8722
MAE:  0.6846
------------
Fold 3
RMSE: 0.8735
MAE:  0.6853
------------
Fold 4
RMSE: 0.8756
MAE:  0.6873
------------
Fold 5
RMSE: 0.8766
MAE:  0.6889
------------
------------
Mean RMSE: 0.8737
Mean MAE : 0.6859
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.8707  0.8722  0.8735  0.8756  0.8766  0.8737  
MAE     0.6835  0.6846  0.6853  0.6873  0.6889  0.6859  


In [7]:
# Fit the model on every available rating (no hold-out split is made here).
trainset = ratings.build_full_trainset()
algo.train(trainset)

# NOTE(review): Trainset.build_testset() is documented to return the ratings
# that are already in `trainset`, so the predictions and the metrics below are
# in-sample (training error), not a held-out test score. Confirm this is the
# intent before comparing the numbers with cross-validated RMSE/MAE.
testset = trainset.build_testset()
predictions = algo.test(testset)

# Accuracy of the predictions; verbose=True makes each call print its value.
RMSE_TEST = accuracy.rmse(predictions, verbose=True)
MAE_TEST = accuracy.mae(predictions, verbose=True)

# One metric per line, so the second line does not start with a stray space.
print("RMSE_TEST : ", RMSE_TEST)
print("MAE_TEST : ", MAE_TEST)

RMSE: 0.6713
MAE:  0.5298
RMSE_TEST :  0.671278092715 
 MAE_TEST :  0.529800747632


In [8]:
# define a function to get the top-N recommendations
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best estimates.

    Args:
        predictions(list of Prediction objects): 5-field records of
            (user id, item id, true rating, estimated rating, details),
            as produced by the test method of an algorithm.
        n(int): Maximum number of recommendations kept per user.
            Default is 10.

    Returns:
        A dict keyed by (raw) user id whose values are lists of
        (raw item id, rating estimation) tuples, ordered from the highest
        estimate to the lowest and holding at most n entries.
    '''

    # Collect every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's pairs by estimate (descending) and keep the first n.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [9]:
# Rank every user's predictions and keep the 10 best per user.
top_n = get_top_n(predictions, n=10)

# One row per user: the user id plus that user's list of
# (item id, estimated rating) tuples. reset_index() turns the user-id index
# into a regular column; both columns are then given their final names.
top_n_series = pd.Series(top_n)
top_n_df = top_n_series.reset_index()
top_n_df.columns = ['user_id', 'top 10 recommendations']

top_n_df.head(5)

Unnamed: 0,user_id,top 10 recommendations
0,1,"[(527, 4.83015697612), (2028, 4.78377094079), ..."
1,10,"[(2762, 5), (1584, 4.97431743318), (2797, 4.95..."
2,100,"[(919, 4.03605918684), (527, 4.01204988152), (..."
3,1000,"[(318, 4.84127839622), (527, 4.76589701566), (..."
4,1001,"[(2858, 5), (608, 5), (3822, 5), (3160, 5), (1..."


In [11]:
#Reading movie file
all_tags = ['unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Each line is expected to look like "<id>::<title>::<tag>|<tag>|...".
# Every movie becomes one record holding its id, its title and a 0/1 flag per
# entry of all_tags. The id is kept as the raw string read from the file.
movie_rows = []
# The handle gets its own name so it does not overwrite the `reader` object
# used for the ratings file; the with-block closes the file on exit.
with open('movies.dat', "r", encoding='latin-1') as movie_file:
    for line in movie_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        vals = line.split("::")
        tags = vals[2].strip().split("|")
        row = {"movie_id": vals[0], "movie_title": vals[1]}
        row.update((tag, int(tag in tags)) for tag in all_tags)
        movie_rows.append(row)

# Explicit column list fixes the column order: id, title, then the tag flags.
movies = pd.DataFrame(movie_rows, columns=["movie_id", "movie_title"] + all_tags)

movies.head(5)

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# get the recommended items for each user
# Flatten the per-user lists of (movie id, estimated rating) tuples into one
# record per (user, movie) pair. Iterating over the frame itself means the
# number of users and the length of each list are taken from the data rather
# than hard-coded.
recommendation_rows = [
    {'user_id': user_id, 'movie_id': movie_id, 'rating': rating}
    for user_id, user_recs in zip(top_n_df['user_id'],
                                  top_n_df['top 10 recommendations'])
    for movie_id, rating in user_recs
]
# A list (not a set) of column names keeps the column order deterministic.
recommendations = pd.DataFrame(recommendation_rows,
                               columns=['user_id', 'movie_id', 'rating'])

# Single-column views under the names this cell has always defined, in case
# other cells still refer to them.
user_df = recommendations[['user_id']]
movie_df = recommendations[['movie_id']]
rating_df = recommendations[['rating']]

# Attach the movie title and tag flags. The ids are converted to numbers only
# after the merge, so the join still compares the ids in the form they have in
# both input frames.
recommendations_all = pd.merge(recommendations, movies, how='left', on=['movie_id'])
recommendations_all['movie_id'] = pd.to_numeric(recommendations_all['movie_id'])
recommendations_all['user_id'] = pd.to_numeric(recommendations_all['user_id'])

recommendations_all.head(10)

Unnamed: 0,user_id,movie_id,rating,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,527,4.83016,Schindler's List (1993),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,2028,4.78377,Saving Private Ryan (1998),0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,1207,4.67167,To Kill a Mockingbird (1962),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,150,4.6497,Apollo 13 (1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,4.58052,Toy Story (1995),0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5,1,2804,4.51208,"Christmas Story, A (1983)",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,1,1035,4.46609,"Sound of Music, The (1965)",0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,1,2762,4.45216,"Sixth Sense, The (1999)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,1,1961,4.41578,Rain Man (1988),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,3105,4.41403,Awakenings (1990),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Show the last 10 rows of the merged recommendations table.
recommendations_all.tail(10)

Unnamed: 0,user_id,movie_id,rating,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
60390,999,1721,4.86977,Titanic (1997),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
60391,999,318,4.75632,"Shawshank Redemption, The (1994)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60392,999,1101,4.71379,Top Gun (1986),0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
60393,999,110,4.64471,Braveheart (1995),0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
60394,999,733,4.63292,"Rock, The (1996)",0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
60395,999,1036,4.61314,Die Hard (1988),0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
60396,999,3578,4.60802,Gladiator (2000),0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60397,999,2028,4.59497,Saving Private Ryan (1998),0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
60398,999,593,4.52835,"Silence of the Lambs, The (1991)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
60399,999,1704,4.51992,Good Will Hunting (1997),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# get the precision and recall
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Compute precision@k and recall@k separately for every user.

    Args:
        predictions: Iterable of 5-field records
            (user id, item id, true rating, estimated rating, details).
        k(int): How many of a user's highest-estimated items count as
            the recommendation list. Default is 10.
        threshold(float): A rating (true or estimated) at or above this
            value counts as relevant / recommended. Default is 3.5.

    Returns:
        A tuple (precisions, recalls) of dicts keyed by user id. When a
        metric's denominator is zero, that metric is reported as 1.
    '''

    # Gather (estimate, true rating) pairs under the user they belong to.
    ratings_by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        ratings_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():

        # Highest estimates first; the first k form the recommendation list.
        pairs.sort(key=lambda pair: pair[0], reverse=True)
        head = pairs[:k]

        # Relevant items over the user's whole list.
        n_rel = sum((actual >= threshold) for (_, actual) in pairs)

        # Items in the top k whose estimate reaches the threshold.
        n_rec_k = sum((guess >= threshold) for (guess, _) in head)

        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(((actual >= threshold) and (guess >= threshold))
                              for (guess, actual) in head)

        # Precision@K: share of recommended items that are relevant.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 1

        # Recall@K: share of relevant items that are recommended.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 1

    return precisions, recalls

# Score the predictions over each user's top 10, using 4 as the threshold.
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)

# Average the per-user values over all users and report them as percentages.
print("Precision : ", sum(precisions.values()) / len(precisions) * 100, "%")
print("Recall : ", sum(recalls.values()) / len(recalls) * 100, "%")

Precision :  98.6891162094 %
Recall :  19.2766344161 %
