## Content Based Filetering

#### In this tutorial, we will implement the most native approaches for computing Content-based Recommendation

In [3]:
import pandas as pd
import numpy as np
from sklearn import datasets
import sklearn
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [17]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
import math

class ContentBasedFiltering:
    def __init__(self,n_users,n_items,data_dimension,utility_train):
        self.n_users = n_users # number of users
        self.n_items = n_items # number of items
        self.dimension = data_dimension # vector dimension
        self.u_train = utility_train# utility dataframe 
        
    def get_items_by_user(self,rate_matrix, user_id):
        """
        # rate matrix: utility matrix
        # get list of items rated by this user_id and corresponding rating
        in each line of u_train, we have infor: user_id, item_id, rating (scores), time_stamp
        we care about the first three values
        return (item_ids, scores) rated by user user_id
        """
        ## note: user_id starts from 1
        y = rate_matrix[:,0] # all users
        # item indices rated by user_id
        ids = np.where(y == user_id)[0] 
        item_ids = rate_matrix[ids, 1]  # index starts from 0 
        scores = rate_matrix[ids, 2]
        return (item_ids, scores)
    
    def fit(self,X_features):
        ## fit the model to the dataset
        ## X_features: item profiles matrix (maybe via tfidf)
        ### infer the user profiles via fitting ridge model for each user
        d = self.dimension # data dimension
        n_users = self.n_users
        W = np.zeros((d, n_users))
        b = np.zeros((1, n_users))
        for n in range(1,n_users):    
            # get items and ratings for each user
            ids, scores = self.get_items_by_user(self.u_train.values, n)
            ids = ids -1
            # build a Ridge model for each user
            clf = Ridge(alpha=0.01, fit_intercept  = True)
            ### features of all items user rated
            Xhat = X_features[ids, :]
            ### fitting
            clf.fit(Xhat, scores) 
            W[:, n] = clf.coef_
            b[0, n] = clf.intercept_
        self.W = W
        self.b = b
        self.Yhat = X_features.dot(self.W) + self.b
        print(self.Yhat.shape)
        self.tfidf = X_features
        print("Fitted the model to data successfully. Found W and b")
        
    
    def predict(self,uid,items,dataset):
        ### recommend top 10 suitable product for user
        ## items: dataframe to get movie name
        ## dataset: the dataset to perform recommendation on for this user, 
        ### e.g: in this dataset, user gt is : ... and predicted value for these items :
        
        ids, scores_truth = self.get_items_by_user(dataset.values, uid)
        ## inside this dataset user voted for ids movie
        ## build a prediction for uid
        print(ids)
        new_ids = ids-1
        scores_preds = self.Yhat[new_ids,uid]
        print(scores_preds)
        idx = np.argsort(scores_preds)[-5:] #idx in the movie id list
        idx = ids[idx]
        print(idx)
        title = items.values[idx-1,1]
        print("recommend these content to this user id", title)
        
    def compare(self,uid,dataset):
        ids, scores = self.get_items_by_user(dataset.values, uid)
        ids = ids
        ## scores are gt scores in dataset
        print(ids)
        ids = ids-1
        print(scores)
        preds = self.Yhat[ids,uid]
        print(preds)
        
    def evaluate(self, dataset):
        se = 0
        cnt = 0
        se_2 = 0 # for average method
        cnt_2 = 0 # for average method
        for n in range(1,self.n_users):
            ids, scores_truth = self.get_items_by_user(dataset, n)
            scores_pred = self.Yhat[ids, n]
            e = scores_truth - scores_pred 
            se += (e*e).sum(axis = 0)
            cnt += e.size 
            ## What if we want to implement user profiles as the average of all rated item profiles?
            candidate_items = self.tfidf[ids,:]
            scores_mean_pred = np.mean(candidate_items)
            e_2 = scores_truth - scores_mean_pred 
            se_2 += (e_2*e_2).sum(axis = 0)
            cnt_2 += e_2.size 
        return math.sqrt(se/cnt),math.sqrt(se_2/cnt_2)
    
    

#### 1. The dataset

MovieLens 100K movie ratings. Stable benchmark dataset. 100,000 ratings from 1000 users on 1700 movies. Released 4/1998.

README.txt


ml-100k.zip (size: 5 MB, checksum)


Index of unzipped files


Permalink: https://grouplens.org/datasets/movielens/100k/

In [None]:
# download the dataset
import urllib.request
urllib.request.urlretrieve("https://files.grouplens.org/datasets/movielens/ml-100k.zip", "ml-100k.zip")
# unzip the file to directory "data"
import zipfile
with zipfile.ZipFile("ml-100k.zip", 'r') as zip_ref:
    zip_ref.extractall("data")

In [4]:
# important paths
PATH_TO_DATA = "data/ml-100k/"
SUBPATH_ITEM = "u.item"
SUBPATH_TRAINING = "ua.base" # utility matrix for training
SUBPATH_TEST = "ua.test" # utility matrix for training

#### 2. Insights of Data and Utility Matrix

In [5]:
# read the utility matrix:
cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
u_train = pd.read_csv(PATH_TO_DATA+SUBPATH_TRAINING, sep='\t', names=cols)
u_test = pd.read_csv(PATH_TO_DATA+SUBPATH_TEST, sep='\t',names=cols)

In [None]:
nb_train = u_train.shape[0]
nb_test = u_test.shape [0]

## basic stats: print number of train samples, of test samples



## rating stats: how many unique rating values and 
### plot a pie chart of rating value



## user stats: how many unsers, 
### on avarage each user vote how many times, 
### print top 10 user who voted the most


count_group = u_train.groupby("user_id").rating.count()

print("number of unique users: ",len(u_train.user_id.unique()))
print("on average each user voted X number of items, X=",np.mean(count_group))
count_group = count_group.reset_index(name='count').sort_values(['count'], ascending=False).head(10)
print("top 10 user id who voted the most: ")
print(count_group)

In [12]:
utility_train = u_train.pivot(index='movie_id', columns='user_id', values='rating')
print("plotting first 10 user's rating for first 10 movies")
print(utility_train.iloc[:10,:10])
nb_nans = (np.count_nonzero(np.isnan(utility_train.iloc[:10,:10])))
## calculate spasity as % of nans elements
print("current sparsity = ",nb_nans/100)
## calculate sparsity for the whole matrix
nb_nans = (np.count_nonzero(np.isnan(utility_train)))
print("Training set sparsity = ",nb_nans/(utility_train.shape[0]*utility_train.shape[1]))



plotting first 10 user's rating for first 10 movies
user_id    1    2   3   4   5    6    7    8    9    10
movie_id                                               
1         5.0  4.0 NaN NaN NaN  4.0  NaN  NaN  NaN  4.0
2         3.0  NaN NaN NaN NaN  NaN  NaN  NaN  NaN  NaN
3         4.0  NaN NaN NaN NaN  NaN  NaN  NaN  NaN  NaN
4         3.0  NaN NaN NaN NaN  NaN  5.0  NaN  NaN  4.0
5         3.0  NaN NaN NaN NaN  NaN  NaN  NaN  NaN  NaN
6         5.0  NaN NaN NaN NaN  NaN  NaN  NaN  NaN  NaN
7         4.0  NaN NaN NaN NaN  2.0  5.0  3.0  4.0  NaN
8         1.0  NaN NaN NaN NaN  4.0  5.0  NaN  NaN  NaN
9         5.0  NaN NaN NaN NaN  4.0  5.0  NaN  NaN  4.0
10        3.0  2.0 NaN NaN NaN  NaN  4.0  NaN  NaN  NaN
current sparsity =  0.74
Training set sparsity =  0.9428306317224663


#### 3. Preparing the Data

In [11]:
## define the features of items
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv(PATH_TO_DATA+SUBPATH_ITEM, sep='|', names=i_cols, encoding='latin-1')

n_items = items.shape[0]
print('Number of items:', n_items)

## calculate item profiles are based on last 19 columns

## building features using TF-IDF



Number of items: 1682


#### 4. Training and Prediction

In [16]:
### config parameter goes here

In [17]:

## creating a model
content_based = ContentBasedFiltering(n_users,n_items,d,u_train)

In [18]:
## fiting the model

Fitted the model to data successfully. Found W and b


In [19]:
### evaluate the model

(1.3665692836571879, 3.6903116575007284)

In [20]:
### compare the rating for a user on the test set

[ 20  33  61 117 155 160 171 189 202 265]
[4 4 4 3 2 4 5 3 5 4]
[3.50180262 2.4780471  3.86366841 2.78285588 4.16874794 3.84481683
 4.48294226 3.76639494 2.68684402 4.57244238]


In [21]:
### recommend items for a user

[ 20  33  61 117 155 160 171 189 202 265]
[3.50180262 2.4780471  3.86366841 2.78285588 4.16874794 3.84481683
 4.48294226 3.76639494 2.68684402 4.57244238]
[160  61 155 171 265]
recommend these content to this user id ['Glengarry Glen Ross (1992)' 'Three Colors: White (1994)'
 'Dirty Dancing (1987)' 'Delicatessen (1991)'
 'Hunt for Red October, The (1990)']


In [15]:
### top items by rating of user uid
uid = 1
top_rate_by_user = u_test[u_test.user_id==uid].sort_values(by=['rating'],ascending=False).reset_index()
print(top_rate_by_user.head(5))

   index  user_id  movie_id  rating  unix_timestamp
0      6        1       171       5       889751711
1      8        1       202       5       875072442
2      0        1        20       4       887431883
3      1        1        33       4       878542699
4      2        1        61       4       878542420


In [16]:
### TITLE of top items by rating of user uid

items.values[top_rate_by_user.movie_id-1,1][:5]

array(['Delicatessen (1991)', 'Groundhog Day (1993)',
       'Angels and Insects (1995)', 'Desperado (1995)',
       'Three Colors: White (1994)'], dtype=object)