In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV

from collections import defaultdict, Counter
import recmetrics

In [6]:
raw_movies = pd.read_csv('C:/Users/saleh/Desktop/SM/Data/1M/ml-latest-small/movies.csv')
raw_ratings = pd.read_csv('C:/Users/saleh/Desktop/SM/Data/1M/ml-latest-small/ratings.csv')

In [3]:
def process_data(movies,ratings, movie_threshold):
    '''
    Function for processing the movies and rating data set. 
    It removes duplicates (if any), and outputs the final ratings dataframe.
    It removes any movies with less than the threshold number of ratings.
    '''

    raw_movies = movies
    
    if raw_movies.movieId.nunique()  != raw_movies.title.nunique():
        print('There are some duplicate titles')
        print('\n')
        print('Removing the following duplicate titles \n')
        v = raw_movies.title.value_counts()
        for i,movie in enumerate(list(raw_movies[raw_movies.title.isin(v.index[v.gt(1)])].title.unique())):
            print('{0}. {1}'.format(i+1,movie))
         
    else:
        print('No duplicate titles')
        processed_movies = raw_movies
        
        
    # check if any user has rated the same movie more than once
    print('\n')
    if ratings.duplicated(subset=['userId','movieId']).any():
        print('There are duplicate ratings for same movies by some users')
    else:
        print('There are no duplicate ratings')
        
    # Remove duplicate movie Ids from ratings data
    s = raw_movies.title.value_counts()
    dup_movies = list(raw_movies[raw_movies.title.isin(s.index[s.gt(1)])].movieId.unique())
    ratings1 = ratings.drop(ratings[ratings.movieId.isin(dup_movies)].index)
    
    # Filter data for movies with atleast 10 ratings

    c = Counter(ratings1.movieId)
    relevant_items = [k for k, count in c.items() if count >= movie_threshold]

    print(np.shape(relevant_items))
    filtered_ratings = ratings1.loc[ratings1.movieId.isin(relevant_items),:]
    
    print('\n')
    print('Total Movies = {}'.format(movies.movieId.nunique()))
    print('Filtered Movies = {}'.format(filtered_ratings.movieId.nunique()))
    
    
    return filtered_ratings[['userId','movieId','rating']]

In [7]:
ratings = process_data(raw_movies,raw_ratings,10)
ratings.head()

There are some duplicate titles


Removing the following duplicate titles 

1. Emma (1996)
2. Saturn 3 (1980)
3. Confessions of a Dangerous Mind (2002)
4. Eros (2004)
5. War of the Worlds (2005)


There are no duplicate ratings
(2266,)


Total Movies = 9742
Filtered Movies = 2266


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
ratings.shape

(81021, 3)

In [9]:
# Convert data into a surprise dataframe from pandas
reader = Reader(rating_scale=(-1, 5))
data = Dataset.load_from_df(ratings, reader)

In [8]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), NormalPredictor(), KNNWithZScore(), KNNWithMeans(), BaselineOnly(), SlopeOne(), NMF(),CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=4)
    print('\n {} Done! \n'.format(algorithm))
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)


 <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001F724205780> Done! 


 <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001F724205B70> Done! 

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.

 <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x000001F724205908> Done! 

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.

 <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001F724205978> Done! 

Estimating biases using als...
Estimat

In [9]:
benchmark
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.851633,6.674611,0.276563
BaselineOnly,0.852446,0.187035,0.198443
KNNWithZScore,0.855247,0.451507,4.098663
KNNWithMeans,0.85623,0.37908,3.60533
SlopeOne,0.857689,1.912584,8.994021
NMF,0.881182,7.401268,0.256774
CoClustering,0.901045,3.108642,0.309699
NormalPredictor,1.392487,0.171441,0.240363


In [14]:
model = SVD(n_factors= 100,n_epochs= 20,lr_all= 0.005,reg_all= 0.02)
results = cross_validate(model, data, measures=['RMSE'], cv=4, verbose=False)

Using ALS


In [22]:
#results.keys()
print(np.mean(results['test_rmse']))
print(np.mean(results['fit_time']))
print(np.mean(results['test_time']))

0.8524706135102402
7.311672270298004
0.29663753509521484


In [32]:
svd_options = {'n_factors': [50,100,200],
               'n_epochs': [10,20,30],
               'lr_all': [0.0025,0.005,0.0075],
               'reg_all': [0.01,0.02,0.03]
               }

gs = GridSearchCV(SVD, param_grid=svd_options, measures=["rmse"], cv=4)

gs.fit(data)

gs_results = pd.DataFrame(gs.cv_results)


In [35]:
#gs_results.sort_values('rank_test_rmse')

gs.best_params["rmse"]

{'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.0075, 'reg_all': 0.03}

In [None]:

#print('Using ALS')
#bsl_options = {'method': 'als',
#               'n_epochs': 10,
#               'reg_u': 15,
#               'reg_i': 10
#               }
#model = BaselineOnly(bsl_options=bsl_options)
#cross_validate(model, data, measures=['RMSE'], cv=4, verbose=False)

In [10]:
trainset, testset = train_test_split(data, test_size=0.25)
model = SVD(lr_all=0.0075, reg_all=0.03)
predictions = model.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.8474


0.8473619546735387

In [11]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['userid', 'itemid', 'rating', 'prediction', 'details'])
df['user_ratings'] = df.userid.apply(get_Iu)
df['item_ratings'] = df.itemid.apply(get_Ui)
df['error'] = abs(df.prediction - df.rating)

In [12]:
df.head()
df.groupby('userid').min().head()

Unnamed: 0_level_0,itemid,rating,prediction,user_ratings,item_ratings,error
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,3.0,3.321723,153,8,0.00589
2,318,3.0,3.212689,20,23,0.279111
3,527,0.5,1.504479,19,8,1.004479
4,162,1.0,2.910559,138,6,0.164515
5,36,2.0,3.153441,34,13,0.153441


In [23]:
df.sort_values('error',ascending=False).head(10)

Unnamed: 0,userid,itemid,rating,prediction,details,user_ratings,item_ratings,error
11352,573,44199,0.5,4.593563,{'was_impossible': False},204,32,4.093563
9964,258,122886,0.5,4.526511,{'was_impossible': False},18,28,4.026511
10725,573,1127,0.5,4.485909,{'was_impossible': False},204,48,3.985909
459,393,27611,0.5,4.479954,{'was_impossible': False},82,12,3.979954
13119,580,1203,0.5,4.330564,{'was_impossible': False},281,46,3.830564
6482,594,253,0.5,4.310163,{'was_impossible': False},116,84,3.810163
9409,393,778,0.5,4.308561,{'was_impossible': False},82,82,3.808561
12844,105,4027,0.5,4.307591,{'was_impossible': False},356,66,3.807591
4528,258,87232,0.5,4.263088,{'was_impossible': False},18,31,3.763088
10293,543,59900,0.5,4.209045,{'was_impossible': False},48,9,3.709045


In [None]:
ratings[ratings['movieId'] == 2488]['rating'].describe()

In [None]:
ratings.loc[ratings['movieId'] == 2488]['rating'].hist()
plt.xlabel('rating')
plt.ylabel('Number of ratings')
plt.title('Number of ratings Movie 2488 has received')
plt.show();

In [None]:
ratings.loc[ratings['userId'] == 594]['rating'].hist()
plt.xlabel('rating')
plt.ylabel('Number of ratings')
plt.title('Number of ratings user 594 has given')
plt.show();

In [13]:
def all_recos(trainset,model):
    '''Return the full list of user-item pairs not present in the training data along with the predicted ratings.
    Args:
        trainset generated from surprise data.
        Model, which will be used for predictions.
    Returns:
    A list of recommendations [userid,itemid,prediction]
    '''
    recos = trainset.build_anti_testset()
    
    final_recos = []
    
    for uid,iid,_ in recos:
        row = [uid,iid,model.predict(uid,iid)[3]]
        final_recos.append(row)
    
    return final_recos

In [14]:


#Function for getting top n predictions for each user from a set of predictions
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    
    predictions.sort(key=lambda x: x[0], reverse=False)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, est in predictions:
        top_n[uid].append((iid, est))
    
    top_n_list = []

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n_list.append(user_ratings[:n])
        top_n[uid] = user_ratings[:n]

    return top_n_list, top_n

In [15]:
#Function for getting a sorted dictionary of top rated items from a user's testset
def create_testset_dict(testset):
    '''
    Return the items rated by a given user within the testset, sorted in descending order.
    
    :params: testset: A testset generated via Surprise
    : returns testset_dict: a collection of users, items which they rated and the given rating
    '''
    testset_dict = defaultdict(list)
    
    testset.sort(key=lambda x: x[0], reverse=False)
    
    for row in testset:
        uid, iid, gt_rating = row
        testset_dict[uid].append((iid, gt_rating))
        
    testset_list = []
    for uid, user_ratings in testset_dict.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        testset_list.append(user_ratings)
        testset_dict[uid] = user_ratings
        
    return testset_list, testset_dict

#Function for getting all items from surprise trainset
def all_trainset_items(trainset):
    '''
    Function for creating a list of all trainset items
    params: trainset (surprise trainset object)
    :returns all_items (list)
    '''
    all_items = [trainset.to_raw_iid(item) for item in trainset.all_items()]
    return all_items


#Function for getting all items from predictions
def all_pred_items(recos):
    '''
    Function for creating a list of all trainset items
    params: trainset (surprise trainset object)
    :returns all_items (list)
    '''
    all_pred_items = list(set([iid for uid,iid,_ in recos]))
    return all_pred_items


In [16]:


def build_full_testset(surprise_data):
    '''
    Function for creating a full testset object from surprise data
    params: surprise data (surprise dataset)
    :returns full_test (a full testet)
    '''
    raw_ratings = surprise_data.raw_ratings
    full_test = surprise_data.construct_testset(raw_ratings)
    return full_test

def get_mark(predictions, actuals, k):
    '''
    Function for calculating mean average recall @ k. 
    params: predictions (predicted item list), actuals (full item list), k (int)
    :returns mean_ark (float)
    '''
    mean_ark = recmetrics.mark(actual=actuals, predicted=predictions, k=k)
    return mean_ark

#Function for getting coverage
def get_coverage_score(predicted, catalog):
    '''
    Function for calculating coverage score. 
    params: predicted_items (predicted item list), trainset_items (full)
    :returns cov (float)
    '''
    
    
    predicted_flattened = [p[0] for sublist in predicted for p in sublist]
    unique_predictions = len(set(predicted_flattened))
    coverage = round(unique_predictions/(len(catalog)* 1.0)*100,2)
    return coverage

#Function for getting personalization score
def get_personalization_score(predicted_items):
    '''
    Function for calculating personalization score.
    params: predicted_items (predicted item list)
    :returns pers (float)
    '''
    
    pred_list = []

    for sublist in predicted_items:
        pred_list.append([i[0] for i in sublist])
    
    pers = recmetrics.personalization(pred_list)
    return pers

#Function for getting intra-list similarity score
def get_intra_list_similarity(predicted_items,feature_df):
    '''
    Function for calculating intra-list similarity score
    params: predicted_items (predicted item list)
    :returns intra-list similarity score (float)
    '''
    
    pred_list = []

    for sublist in predicted_items:
        pred_list.append([i[0] for i in sublist])
    
    pers = recmetrics.intra_list_similarity(pred_list,feature_df)
    return pers

In [17]:

def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]
    
    score = 0.0
    num_hits = 0.0
    for i,p in enumerate(predicted):
        m, r = p
        if m in list(m for m,r in actual) and m not in list(m for m,r in predicted[:i]):
            #print('Yes')
            num_hits += 1.0
            score += num_hits / (i+1.0)
    return score / min(len(actual),k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
    #return [apk(a,p,k) for a,p in zip(actual, predicted)]

In [18]:
#Extract year from movie title

def get_item_feature_df(raw_movies):
    if 'genres' not in raw_movies.columns:
        raw_movies = pd.read_csv('C:/Users/saleh/Desktop/SM/Data/1M/ml-latest-small/movies.csv')

    
    # Perform one-hot encoding for genres and then drop the "no genres listed column"
    d = pd.get_dummies(raw_movies.genres)

    cols = [c for c in d.columns if '|' not in c]


    for genre in cols:
        raw_movies[genre] = raw_movies['genres'].str.contains(genre).astype(int)

    processed_movies = raw_movies.drop(['title','genres'], axis=1)
    processed_movies.set_index('movieId', inplace=True)
    
    return processed_movies

In [19]:
full_recommendations = all_recos(trainset,model)

In [20]:
#Creating Evaluation dict

top_n_list, top_n_dict = get_top_n(full_recommendations, 10)
testset_list, testset_dict = create_testset_dict(testset)
feature_df = get_item_feature_df(raw_movies)
trainset_items = all_trainset_items(trainset)
processed_movies = get_item_feature_df(raw_movies)

  from ipykernel import kernelapp as app


In [21]:
#from ml_metrics import average_precision
#average_precision.mapk(actual=testset_list, predicted=top_n_list)

In [22]:
# Get Model Scores

print('MAP@k: {}'.format(mapk(testset_list, top_n_list,5)))
print('Coverage Score: {}%'.format(get_coverage_score(top_n_list, trainset_items)))
print('Personalization Score: {}%'.format(round(get_personalization_score(top_n_list)*100,2)))
print('Intra-list Similarity Score: {}'.format(round(get_intra_list_similarity(top_n_list,processed_movies),2)))

MAP@k: 0.04397814207650273
Coverage Score: 15.58%
Personalization Score: 88.85%
Intra-list Similarity Score: 0.31
