In [2]:
import os, time, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Define path of dataset 100K
data_dir1 = "./Data/ml-100k"
ratings_train_path1 = os.path.join(data_dir1, "ua.base")
ratings_test_path1 = os.path.join(data_dir1, "ua.test")
items_file_path1 = os.path.join(data_dir1, "u.item")
users_file_path1 = os.path.join(data_dir1, "u.user")

In [4]:
# Read users file
u_cols1 =  ['userId', 'age', 'sex', 'occupation', 'zip_code']
users1 = pd.read_csv(users_file_path1, sep='|', names=u_cols1, encoding='latin-1')

n_users1 = users1.shape[0]
print("Number of users:", n_users1)

Number of users: 943


In [5]:
# Read ratings file
r_cols1 = ['userId', 'movieId', 'rating', 'unix_timestamp']

ratings_train1 = pd.read_csv(ratings_train_path1, sep='\t', names=r_cols1, encoding='latin-1')
ratings_test1 = pd.read_csv(ratings_test_path1, sep='\t', names=r_cols1, encoding='latin-1')

print("Training data size : ", ratings_train1.shape[0])
print("Test data size     : ", ratings_test1.shape[0])

Training data size :  90570
Test data size     :  9430


In [6]:
#Reading items file:
i_cols1 = ['movieId', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items1 = pd.read_csv(items_file_path1, sep='|', names=i_cols1, encoding='latin-1')

n_items1 = items1.shape[0]
print("Number of items:", n_items1)

Number of items: 1682


In [3]:
def convert_genres_names(genres_names):
    genres_names = genres_names.replace("(no genres listed)", "Unknown")
    genres_names = genres_names.replace('|', ' ')
    genres_names = genres_names.replace('-', '')
    return genres_names

In [4]:
# Define path of dataset 1M
data_dir2 = "./Data/ml-1m"
ratings_file_path2 = os.path.join(data_dir2, "ratings.dat")
items_file_path2 = os.path.join(data_dir2, "movies.dat")
# ratings_path2 = os.path.join(data_dir2, "ratings.dat")
ratings_train_path2 = os.path.join(data_dir2, "ratings_train.csv")
ratings_test_path2 = os.path.join(data_dir2, "ratings_test.csv")

In [5]:
# Reading items file:
i_cols2 = ["movieId", "title", "genres"]
items2 = pd.read_csv(items_file_path2, encoding="latin-1", sep="::", names=i_cols2)

num_items2 = items2.shape[0]
print("Number of items (movies) : ", num_items2)
items2["genres"] = items2["genres"].apply(convert_genres_names)
# print(items.info())
print("Example items:")
items2.head(3)

Number of items (movies) :  3883
Example items:


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance


In [6]:
ratings_train2 = pd.read_csv(ratings_train_path2, encoding="utf-8")
training_size2 = ratings_train2.shape[0]
print("Training data size : ", training_size2)
ratings_train2.head()

Training data size :  800167


Unnamed: 0,userId,movieId,rating,unix_timestamp
0,1926,34,4,974692704
1,3173,1073,5,968791878
2,5175,2762,5,961861063
3,4835,3754,2,962895467
4,4626,2071,5,964114511


In [7]:
ratings_test2 = pd.read_csv(ratings_test_path2, encoding="utf-8")
test_size2 = ratings_test2.shape[0]
print("Test data size : ", test_size2)
ratings_test2.head()

Test data size :  200042


Unnamed: 0,userId,movieId,rating,unix_timestamp
0,5972,593,5,956952291
1,5952,2401,4,957145342
2,4933,1805,2,1011684938
3,2181,587,2,975634724
4,2513,1641,5,974072036


In [7]:
class ContentBased():
    def __init__(self, items, ratings, dataset_name, sim_func=cosine_similarity):
        self.items = items
        self.ratings = ratings
        self.dataset_name = dataset_name
        self.sim_func = sim_func
        self.map_items_sim = {}
        self.algo_name = "Content Based"
    
    def fit(self):
        # Build features matrix
        if self.dataset_name == "ml-100k":
            self.sparse_sim_matrix = sparse.csr_matrix(self.items.iloc[:, 5:].values)
            self.sparse_sim_matrix = cosine_similarity(self.sparse_sim_matrix, self.sparse_sim_matrix)
            
        elif self.dataset_name == "ml-1m":
            count_vectorizer = CountVectorizer()
            self.sparse_sim_matrix = count_vectorizer.fit_transform(self.items.genres.values)
            self.sparse_sim_matrix = cosine_similarity(self.sparse_sim_matrix, self.sparse_sim_matrix)
            print("Vocabulary : ", count_vectorizer.vocabulary_)
            print("Number genres : ", self.sparse_sim_matrix.shape[1])
            
#             # Build map
            self.map_item_id_to_idx = {row.movieId: index for index, row in self.items.iterrows()}
        
        print("Shape of sparse features matrix : ", self.sparse_sim_matrix.shape)
        print("{} fit done".format(self.algo_name))
        
    def calc_similarity(self, item_id1, item_id2):
        if self.dataset_name == "ml-100k":
            item_id1 -= 1
            item_id2 -= 1
            sim = self.sparse_sim_matrix[item_id1, item_id2]
            
        elif self.dataset_name == "ml-1m":
                
#             if item_id1 > item_id2:
#                 item_id1, item_id2 = item_id2, item_id1

#             sim = self.map_items_sim.get((item_id1, item_id2))
#             if sim is None:
#                 features1 = self.sparse_sim_matrix[self.map_item_id_to_idx.get(item_id1)].reshape(1, -1)    # Reshape to 2D array
#                 features2 = self.sparse_sim_matrix[self.map_item_id_to_idx.get(item_id2)].reshape(1, -1)
#                 sim = self.sim_func(features1, features2)
#                 self.map_items_sim.update({(item_id1, item_id2): sim})
            sim = self.sparse_sim_matrix[self.map_item_id_to_idx.get(item_id1), self.map_item_id_to_idx.get(item_id2)]

        return sim
    
    def get_rated_items(self, user_id):
        filtered_records = self.ratings[self.ratings["userId"] == user_id]
        item_ids = filtered_records["movieId"].values        # ndarray type
        ratings = filtered_records["rating"].values
        return item_ids, ratings
    
    # Predict rating of given user_id for item_id
    def predict_rating(self, user_id, item_id, pct_in_batch=None):
        
        rated_item_ids, ratings = self.get_rated_items(user_id)
        similarity = np.empty_like(ratings, dtype=np.float32)
        for i in range(len(rated_item_ids)):
            similarity[i] = self.calc_similarity(item_id, rated_item_ids[i])
        pred_rating = np.sum(ratings * (similarity / (similarity.sum() + 1e-16)))
        if pct_in_batch is None:
            print("UserID : {:8d}, MovieID : {:8d}, Number users rated movies : {:5d}, Predict rating : {:.4f}".format(user_id, item_id, len(rated_item_ids), pred_rating))
        else:
            print("{:6.2f}%  UserID : {:8d}, MovieID : {:8d}, Number users rated movies : {:5d}, Predict rating : {:.4f}".format(pct_in_batch, user_id, item_id, len(rated_item_ids), pred_rating))

        return pred_rating

    def predict_rating_batch(self, ids):
        # ids is list of (user_id, movie_id) tuple
        batch_size = len(ids)
        print("Predicting {} ratings ...".format(batch_size))
        pred_ratings = []
        for i in range(batch_size):
            user_id = ids[i][0]
            item_id = ids[i][1]
            pct_in_batch = (i + 1) / batch_size * 100
            pred_ratings.append(self.predict_rating(user_id, item_id, pct_in_batch))
        return np.array(pred_ratings)

    def get_rmse(self, true_ratings, pred_ratings):
        return math.sqrt(mean_squared_error(true_ratings, pred_ratings))

    def evaluate_model(self, test_data):
        start_time = time.time()

        ids = list(zip(list(test_data.userId.values), list(test_data.movieId.values)))
        pred_ratings = self.predict_rating_batch(ids)
        predicted_data = test_data.copy()
        predicted_data["Predict rating"] = pred_ratings
        true_ratings = test_data.rating.values
        rmse_error = self.get_rmse(true_ratings, pred_ratings)

        finish_time = time.time()
        exec_time = finish_time - start_time

        print("\nSize of evaluate data : ", test_data.shape[0])
        print("Time : {} seconds".format(exec_time))
        print("RMSE : ", rmse_error)

        return predicted_data, rmse_error, exec_time

In [9]:
rs1 = ContentBased(items1, ratings_train1, "ml-100k")
rs1.fit()

Shape of sparse features matrix :  (1682, 1682)
Content Based fit done


In [None]:
predicted_data1, rmse_test_error1, exec_time1 = rs1.evaluate_model(ratings_test1)

In [11]:
print("Execute time : {:.4f} seconds".format(exec_time1))
print("RMSE on test data : {:.6f}".format(rmse_test_error1))
predicted_data1.head(10)

Execute time : 22.3253 seconds
RMSE on test data : 1.096987


Unnamed: 0,userId,movieId,rating,unix_timestamp,Predict rating
0,1,20,4,887431883,3.954298
1,1,33,4,878542699,3.553789
2,1,61,4,878542420,3.975465
3,1,117,3,874965739,3.307714
4,1,155,2,878542201,3.672359
5,1,160,4,875072547,3.975465
6,1,171,5,889751711,3.621108
7,1,189,3,888732928,3.47609
8,1,202,5,875072442,3.592361
9,1,265,4,878542441,3.438173


In [12]:
# Save predict rating
output_path1 = "./Output/Predict_Data-{}_Model-{}.csv".format(rs1.dataset_name, rs1.algo_name).replace(" ", "")
predicted_data1.to_csv(output_path1, index=False)

In [9]:
rs2 = ContentBased(items2, ratings_train2, "ml-1m")
rs2.fit()

Vocabulary :  {'horror': 10, 'musical': 11, 'romance': 13, 'scifi': 14, 'comedy': 4, 'fantasy': 8, 'mystery': 12, 'war': 16, 'documentary': 6, 'crime': 5, 'adventure': 1, 'filmnoir': 9, 'action': 0, 'animation': 2, 'western': 17, 'drama': 7, 'thriller': 15, 'children': 3}
Number genres :  3883
Shape of sparse features matrix :  (3883, 3883)
Content Based fit done


In [None]:
predicted_data2, rmse_test_error2, exec_time2 = rs2.evaluate_model(ratings_test2[:1000])

In [13]:
print("Execute time : {:.4f} seconds".format(exec_time2))
print("RMSE on test data : {:.6f}".format(rmse_test_error2))
predicted_data2.head(10)

Execute time : 5.5095 seconds
RMSE on test data : 1.024142


Unnamed: 0,userId,movieId,rating,unix_timestamp,Predict rating
0,5972,593,5,956952291,3.73764
1,5952,2401,4,957145342,4.287001
2,4933,1805,2,1011684938,3.625283
3,2181,587,2,975634724,3.384232
4,2513,1641,5,974072036,4.066898
5,2146,2028,5,974623190,3.353536
6,5111,1363,2,962335163,3.6471
7,801,1208,5,975400546,3.846577
8,929,2028,5,975190707,3.719731
9,3152,1073,3,1019015734,3.138401
