In [27]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Data

In [10]:
#Reading user file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./data/ml-100k/u.user', sep ='|', names=u_cols, encoding='latin-1')

n_users = users.shape[0]
print("Number of users:", n_users)
users.head() #uncomment this to see some few examples

Number of users: 943


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [11]:
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('./data/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('./data/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

print("Number of training rates:", rate_train.shape[0])
print("Number of test rates:", rate_test.shape[0])

Number of training rates: 90570
Number of test rates: 9430


### Build item profiles

In [12]:
# Reading items file:
i_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDB URL', 'unknown', 'Action', 'Adventure',
         'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('./data/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

n_items = items.shape[0]
print("Number of items:", n_items)

Number of items: 1682


In [35]:
# Vì ta đang dựa trên thể loại phim để xây dựng profile, ta sẽ chỉ quan tâm tới 19 giá trị nhị phân ở cuối mỗi hàng
X0 = items.as_matrix()
X_train_counts = X0[:, -19:]

In [26]:
# Xây dựng feature vector cho mỗi item
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()

In [28]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1
    # while index in python starts from 0
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

### Find model for user

In [29]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

In [30]:
d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha=0.01, fit_intercept = True)
    Xhat = tfidf[ids, :]
    
    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [31]:
# predicted scores
Yhat = tfidf.dot(W) + b

In [32]:
# Ví dụ user có id là 10
n = 10
np.set_printoptions(precision=2) # 2 digits after.
ids, scores = get_items_rated_by_user(rate_test, n)
Yhat[n, ids]
print('Rated movies ids: ', ids)
print('True ratings:     ', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids:  [ 37 109 110 226 424 557 722 724 731 739]
True ratings:      [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.18 3.13 3.42 3.09 3.35 5.2  4.01 3.35 3.42 3.72]


### Evaluate model using Root Mean Squared Error (RMSE)

In [34]:
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for i in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred
        se += (e*e).sum(axis = 0)
        cnt += e.size
    return np.sqrt(se/cnt)

print("RMSE for training:", evaluate(Yhat, rate_train, W, b))
print("RMSE for test    :", evaluate(Yhat, rate_test, W, b))

RMSE for training: 0.8676014628392172
RMSE for test    : 0.8386423938638297


## Thảo luận:
#### * Content-based Recommendation Systems là phương pháp đơn giản nhất trong các hệ thống Recommendation Systems. 
#### * Phương pháp này xây dựng mô hình cho mỗi user không phụ thuộc vào các users khác.
#### * Việc xây dựng mô hình cho mỗi users được coi là bài toán Regression hoặc Classification