In [2]:
import os, time, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [3]:
# MovieLens 100K: dataset folder and the four files read from it
data_dir1 = "./Data/ml-100k"
ratings_train_path1, ratings_test_path1, items_file_path1, users_file_path1 = [
    os.path.join(data_dir1, file_name)
    for file_name in ("ua.base", "ua.test", "u.item", "u.user")
]

In [4]:
# Load the user table ('|'-separated; column names are supplied explicitly)
u_cols1 = ['userId', 'age', 'sex', 'occupation', 'zip_code']
users1 = pd.read_csv(
    users_file_path1,
    names=u_cols1,
    sep='|',
    encoding='latin-1',
)

# One row per user, so the row count is the number of users
n_users1 = len(users1)
print("Number of users:", n_users1)

Number of users: 943


In [5]:
# Load the rating records of the train and test splits (tab-separated)
r_cols1 = ['userId', 'movieId', 'rating', 'unix_timestamp']

ratings_train1, ratings_test1 = [
    pd.read_csv(split_path, sep='\t', names=r_cols1, encoding='latin-1')
    for split_path in (ratings_train_path1, ratings_test_path1)
]

print("Training data size : ", len(ratings_train1))
print("Test data size     : ", len(ratings_test1))

Training data size :  90570
Test data size     :  9430


In [6]:
# Load the item (movie) table: five descriptive columns followed by one
# column per genre.
item_info_cols1 = ['movieId', 'movie title', 'release date', 'video release date', 'IMDb URL']
genre_flag_cols1 = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols1 = item_info_cols1 + genre_flag_cols1

items1 = pd.read_csv(
    items_file_path1,
    names=i_cols1,
    sep='|',
    encoding='latin-1',
)

n_items1 = len(items1)
print("Number of items:", n_items1)

Number of items: 1682


In [4]:
def convert_genres_names(genres_names):
    """Normalise a '|'-separated genre string into space-separated tokens.

    The substitutions are applied in order: the placeholder
    "(no genres listed)" becomes "Unknown", every '|' becomes a space, and
    every '-' is removed (so "Sci-Fi" turns into "SciFi").

    Args:
        genres_names: genre string such as "Comedy|Romance".

    Returns:
        The converted string.
    """
    substitutions = (
        ("(no genres listed)", "Unknown"),
        ('|', ' '),
        ('-', ''),
    )
    for old_text, new_text in substitutions:
        genres_names = genres_names.replace(old_text, new_text)
    return genres_names

In [5]:
# MovieLens 1M: dataset folder, the raw .dat files, and the pre-split
# train/test CSV files
data_dir2 = "./Data/ml-1m"
ratings_file_path2, items_file_path2, ratings_train_path2, ratings_test_path2 = [
    os.path.join(data_dir2, file_name)
    for file_name in ("ratings.dat", "movies.dat", "ratings_train.csv", "ratings_test.csv")
]

In [6]:
# Reading items file:
i_cols2 = ["movieId", "title", "genres"]
# "::" is a multi-character separator, which only pandas' python parser
# supports. Naming the engine explicitly avoids the ParserWarning that pandas
# emits when it has to fall back from the C engine on its own.
items2 = pd.read_csv(
    items_file_path2,
    encoding="latin-1",
    sep="::",
    names=i_cols2,
    engine="python",
)

num_items2 = items2.shape[0]
print("Number of items (movies) : ", num_items2)
# Turn "A|B-C" style genre strings into space-separated tokens
items2["genres"] = items2["genres"].apply(convert_genres_names)
# print(items.info())
print("Example items:")
items2.head(3)

Number of items (movies) :  3883
Example items:


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance


In [16]:
# # Read ratings file
# r_cols2 = ["userId", "movieId", "rating", "unix_timestamp"]
# ratings2 = pd.read_csv(ratings_file_path2, encoding="utf-8", sep="::", names=r_cols2)

# print("Info training data :"); print(ratings2.info())
# print("Example training data : \n")
# ratings2.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Info training data :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
userId            1000209 non-null int64
movieId           1000209 non-null int64
rating            1000209 non-null int64
unix_timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB
None
Example training data : 



Unnamed: 0,userId,movieId,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [17]:
# from sklearn.model_selection import train_test_split
# # Split train test
# ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=7)
# training_size = ratings_train.shape[0]
# test_size = ratings_test.shape[0]
# dataset_size = ratings.shape[0]
# print("Training data size : {}, pct = {:.2f}%%".format(training_size, training_size / dataset_size * 100))
# print("Test data size     : {}, pct = {:.2f}%%".format(test_size, test_size / dataset_size * 100))
# print("Dataset size       : {}".format(dataset_size))

  return f(*args, **kwds)


Training data size : 800167, pct = 80.00%%
Test data size     : 200042, pct = 20.00%%
Dataset size       : 1000209


In [18]:
# ratings_train.to_csv(ratings_train_path, index=False)
# ratings_test.to_csv(ratings_test_path, index=False)

In [7]:
# Load the pre-split ml-1m training ratings (CSV, column names come from the file)
ratings_train2 = pd.read_csv(ratings_train_path2, encoding="utf-8")
training_size2 = len(ratings_train2)
print("Training data size : ", training_size2)
ratings_train2.head()

Training data size :  800167


Unnamed: 0,userId,movieId,rating,unix_timestamp
0,1926,34,4,974692704
1,3173,1073,5,968791878
2,5175,2762,5,961861063
3,4835,3754,2,962895467
4,4626,2071,5,964114511


In [8]:
# Load the pre-split ml-1m test ratings (CSV, column names come from the file)
ratings_test2 = pd.read_csv(ratings_test_path2, encoding="utf-8")
test_size2 = len(ratings_test2)
print("Test data size : ", test_size2)
ratings_test2.head()

Test data size :  200042


Unnamed: 0,userId,movieId,rating,unix_timestamp
0,5972,593,5,956952291
1,5952,2401,4,957145342
2,4933,1805,2,1011684938
3,2181,587,2,975634724
4,2513,1641,5,974072036


In [25]:
# ratings_train_path = os.path.join(data_dir, "ratings_train.csv")
# ratings_test_path = os.path.join(data_dir, "ratings_test.csv")
# ratings_train.to_csv(ratings_train_path, encoding="utf-8", index=False)
# ratings_test.to_csv(ratings_test_path, encoding="utf-8", index=False)

In [15]:
# tags = pd.read_csv(tags_file_path, encoding="utf-8")
# num_tags = tags.shape[0]
# print("Num tags : ", num_tags)
# tags.head()

Num tags :  1128


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [16]:
# genome_scores = pd.read_csv(genome_scores_file_path, encoding="utf-8")
# print("Number records : ", genome_scores.shape[0])
# genome_scores.head()

Number records :  11709768


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [7]:
class CollaborativeFiltering():
    """Memory-based collaborative filtering on (user, item, rating) records.

    In user-user mode (``uuCF=True``) a rating is predicted as the
    similarity-weighted average of the ratings that other users gave to the
    target item. In item-item mode (``uuCF=False``) the first two columns of
    the training data are swapped at construction time, so the very same code
    runs with the roles of users and items exchanged. Because of that swap,
    the internal names ``*user*`` / ``*item*`` refer to the row / column entity
    of the utility matrix, not necessarily to real users / items.

    Attributes set by ``fit()``:
        ratings_train: copy of the training records with ids remapped to 0..n-1.
        map_old_user_id_to_new / map_new_user_id_to_old: row-entity id maps.
        map_old_item_id_to_new / map_new_item_id_to_old: column-entity id maps.
        num_users / num_items: number of distinct row / column entities.
        sparse_sim_matrix: CSR utility matrix (rows x columns) of ratings.
            Despite its name it holds ratings, not similarities.
    """

    def __init__(self, ratings_train, dataset_name, k_nearest_neighbors = 10, sim_func=None, uuCF=True):
        """Store the configuration and a private copy of the training data.

        Args:
            ratings_train: 2-D array whose first three columns are
                (userId, itemId, rating).
            dataset_name: free-form label, only stored on the instance.
            k_nearest_neighbors: stored on the instance.
                NOTE(review): the prediction code does not use it; every user
                who rated the item contributes to the weighted average.
            sim_func: callable ``sim_func(row_a, row_b)`` receiving two 1 x n
                sparse rows and returning their similarity (scalar or 1x1
                array). ``None`` (the default) selects the module-level
                ``cosine_similarity``.
            uuCF: True for user-user CF, False for item-item CF.
        """
        self.uuCF = uuCF    # uuCF is True if user-user CF, is False if item-item
        ratings_train = np.asarray(ratings_train)
        # Keep a private, never-modified copy of the input so that fit() can be
        # called repeatedly and the caller's array is left untouched.
        if uuCF:
            self._raw_ratings_train = ratings_train.copy()
        else:
            # Fancy indexing already returns a new array.
            self._raw_ratings_train = ratings_train[:, [1, 0, 2]]
        # Until fit() runs this is the raw (not yet remapped) data.
        self.ratings_train = self._raw_ratings_train
        self.dataset_name = dataset_name
        self.k_nearest_neighbors = k_nearest_neighbors
        # Resolve the default lazily instead of binding it in the signature.
        self.sim_func = cosine_similarity if sim_func is None else sim_func
        self.map_users_sim = {}    # cache: (smaller id, larger id) -> similarity
        self.algo_name = "User-User CF" if uuCF else "Item-Item CF"

    def build_map(self, ndarray):
        """Map arbitrary ids onto the contiguous range 0..n-1.

        Args:
            ndarray: 1-D array of ids.

        Returns:
            (new_ids, map_old_id_to_new, map_new_id_to_old) where ``new_ids``
            is the input expressed with the new ids; new ids follow the sorted
            order of the distinct old ids.
        """
        unique_ids, new_ids = np.unique(np.asarray(ndarray).ravel(), return_inverse=True)
        old_ids = unique_ids.tolist()
        map_old_id_to_new = {old_id: new_id for new_id, old_id in enumerate(old_ids)}
        map_new_id_to_old = dict(enumerate(old_ids))
        return new_ids.reshape(-1), map_old_id_to_new, map_new_id_to_old

    def fit(self):
        """Build the id maps and the sparse utility matrix.

        Always starts from the private raw copy of the training data, so the
        method can be called more than once with the same result.
        """
        raw = self._raw_ratings_train

        new_user_ids, self.map_old_user_id_to_new, self.map_new_user_id_to_old = self.build_map(raw[:, 0])
        new_item_ids, self.map_old_item_id_to_new, self.map_new_item_id_to_old = self.build_map(raw[:, 1])

        remapped = raw.copy()
        remapped[:, 0] = new_user_ids
        remapped[:, 1] = new_item_ids
        self.ratings_train = remapped
        # Similarities cached by an earlier fit() would be stale.
        self.map_users_sim = {}

        self.num_users = len(self.map_old_user_id_to_new)
        self.num_items = len(self.map_old_item_id_to_new)
        # In item-item mode the rows of the matrix are items, so label accordingly.
        row_label, col_label = ("users", "items") if self.uuCF else ("items", "users")
        print("Num {} distinct : ".format(row_label), self.num_users)
        print("Num {} distinct : ".format(col_label), self.num_items)

        self.sparse_sim_matrix = sparse.coo_matrix(
            (remapped[:, 2], (new_user_ids, new_item_ids)),
            shape=(self.num_users, self.num_items)).tocsr()
        print("{} fit done".format(self.algo_name))

    def calc_similarity(self, user_id1, user_id2):
        """Return ``sim_func`` applied to the two rows, cached per unordered pair.

        Args:
            user_id1, user_id2: new (0-based) row ids.

        Returns:
            Whatever ``sim_func`` returned for the pair (unchanged).
        """
        if user_id1 > user_id2:
            user_id1, user_id2 = user_id2, user_id1
        key = (user_id1, user_id2)
        sim = self.map_users_sim.get(key)
        if sim is None:
            sim = self.sim_func(self.sparse_sim_matrix[user_id1, :],
                                self.sparse_sim_matrix[user_id2, :])
            self.map_users_sim[key] = sim
        return sim

    @staticmethod
    def _as_scalar(sim):
        """Collapse a similarity result (scalar, 1x1 array or sparse) to a float."""
        if sparse.issparse(sim):
            sim = sim.toarray()
        return float(np.asarray(sim).reshape(-1)[0])

    @staticmethod
    def _observed_mean(matrix):
        """Mean of the stored entries of a sparse matrix, ignoring implicit zeros."""
        data = matrix.data
        return float(data.mean()) if data.size else float("nan")

    def get_user_rated_item(self, item_id):
        """Return the row entities that rated ``item_id`` and their ratings.

        Args:
            item_id: new (0-based) column id.

        Returns:
            (rated_users, ratings): two aligned 1-D arrays in training order;
            both are empty when nobody rated the item.
        """
        mask = self.ratings_train[:, 1] == item_id
        return self.ratings_train[mask, 0], self.ratings_train[mask, 2]

    # Predict rating of given user_id for movie_id
    def predict_rating(self, user_id, item_id, pct_in_batch=None, verbose=True):
        """Predict the rating of ``user_id`` for ``item_id`` (original ids).

        When both ids are known from training, the prediction is the
        similarity-weighted average of the neighbours' ratings; if all the
        similarities sum to (almost) zero the plain mean of those ratings is
        used instead. For unknown ids the prediction falls back to the mean of
        the observed ratings of the known entity, or of all training ratings.

        Args:
            user_id: original user id.
            item_id: original item id.
            pct_in_batch: optional progress percentage shown in the log line.
            verbose: print one log line for this prediction (default True).

        Returns:
            The predicted rating as a float.
        """
        # Item-item mode works on the transposed problem.
        row_key, col_key = (user_id, item_id) if self.uuCF else (item_id, user_id)

        # Convert to new id
        new_user_id = self.map_old_user_id_to_new.get(row_key)
        new_item_id = self.map_old_item_id_to_new.get(col_key)

        num_rated = 0
        if new_user_id is not None and new_item_id is not None:
            rated_users, ratings = self.get_user_rated_item(new_item_id)
            num_rated = len(rated_users)
            similarity = np.fromiter(
                (self._as_scalar(self.calc_similarity(new_user_id, int(other)))
                 for other in rated_users),
                dtype=np.float32, count=num_rated)
            total = similarity.sum()
            if abs(total) > 1e-12:
                pred_rating = np.sum(ratings * (similarity / (total + 1e-16)))
            else:
                # No usable similarity signal: a weighted average would
                # collapse to 0, which is outside the rating scale.
                pred_rating = np.mean(ratings)
        elif new_user_id is not None:
            # Mean of the ratings observed for the known row entity
            pred_rating = self._observed_mean(self.sparse_sim_matrix[new_user_id, :])
        elif new_item_id is not None:
            # Mean of the ratings observed for the known column entity
            pred_rating = self._observed_mean(self.sparse_sim_matrix[:, new_item_id])
        else:
            # Mean of all ratings in the training data
            pred_rating = self._observed_mean(self.sparse_sim_matrix)
        pred_rating = float(pred_rating)

        if verbose:
            # Log with the ids exactly as the caller passed them.
            if pct_in_batch is None:
                print("UserID : {:8d}, MovieID : {:8d}, Number rated : {:5d}, Predict rating : {:.4f}".format(user_id, item_id, num_rated, pred_rating))
            else:
                print("{:6.2f}%  UserID : {:8d}, MovieID : {:8d}, Number rated : {:5d}, Predict rating : {:.4f}".format(pct_in_batch, user_id, item_id, num_rated, pred_rating))

        return pred_rating

    def predict_rating_batch(self, ids, verbose=True):
        """Predict a rating for every (user_id, movie_id) pair in ``ids``.

        Args:
            ids: sequence of (user_id, movie_id) tuples.
            verbose: forwarded to ``predict_rating`` (one log line per pair).

        Returns:
            1-D ndarray of predictions, aligned with ``ids``.
        """
        batch_size = len(ids)
        print("Predicting {} ratings ...".format(batch_size))
        pred_ratings = [
            self.predict_rating(pair[0], pair[1], (position + 1) / batch_size * 100, verbose)
            for position, pair in enumerate(ids)
        ]
        return np.array(pred_ratings)

    def get_rmse(self, true_ratings, pred_ratings):
        """Root mean squared error between two equally long rating sequences.

        Raises:
            ValueError: if the inputs are empty or differ in shape.
        """
        true_ratings = np.asarray(true_ratings, dtype=float)
        pred_ratings = np.asarray(pred_ratings, dtype=float)
        if true_ratings.shape != pred_ratings.shape:
            raise ValueError("true_ratings and pred_ratings must have the same shape")
        if true_ratings.size == 0:
            raise ValueError("cannot compute RMSE of empty inputs")
        return float(np.sqrt(np.mean((true_ratings - pred_ratings) ** 2)))

    def evaluate_model(self, test_data, verbose=True):
        """Predict every row of ``test_data`` and report RMSE and run time.

        Args:
            test_data: DataFrame with columns ``userId``, ``movieId``, ``rating``.
            verbose: forwarded to the per-prediction logging.

        Returns:
            (predicted_data, rmse_error, exec_time): a copy of ``test_data``
            with an extra "Predict rating" column, the RMSE, and the elapsed
            time in seconds.
        """
        start_time = time.perf_counter()

        ids = list(zip(test_data["userId"].tolist(), test_data["movieId"].tolist()))
        pred_ratings = self.predict_rating_batch(ids, verbose)
        predicted_data = test_data.copy()
        predicted_data["Predict rating"] = pred_ratings
        rmse_error = self.get_rmse(test_data["rating"].values, pred_ratings)

        exec_time = time.perf_counter() - start_time

        print("\nSize of evaluate data : ", test_data.shape[0])
        print("Time : {} seconds".format(exec_time))
        print("RMSE : ", rmse_error)

        return predicted_data, rmse_error, exec_time
    

In [8]:
# Item-item CF on ml-100k. Only the first three columns of the training frame
# (userId, movieId, rating) are passed, as an ndarray; uuCF=False selects
# item-item mode.
rs1 = CollaborativeFiltering(ratings_train1.iloc[:, :3].values, "ml-100k", uuCF=False)
# Build the id maps and the sparse utility matrix
rs1.fit()

Num users distinct :  1680
Num items distinct :  943
Item-Item CF fit done


In [None]:
# Predict every (userId, movieId) pair of the ml-100k test split. Returns the
# test frame with an added "Predict rating" column, the RMSE, and the elapsed
# time in seconds.
predicted_data1, rmse_test_error1, exec_time1 = rs1.evaluate_model(ratings_test1)

In [10]:
# Summary of the ml-100k evaluation, followed by a preview of the predictions
print(
    "Execute time : {:.4f} seconds".format(exec_time1),
    "RMSE on test data : {:.6f}".format(rmse_test_error1),
    sep="\n",
)
predicted_data1.head(10)

Execute time : 574.3775 seconds
RMSE on test data : 1.020564


Unnamed: 0,userId,movieId,rating,unix_timestamp,Predict rating
0,1,20,4,887431883,3.92855
1,1,33,4,878542699,3.697746
2,1,61,4,878542420,3.932885
3,1,117,3,874965739,3.709511
4,1,155,2,878542201,3.642885
5,1,160,4,875072547,3.80023
6,1,171,5,889751711,3.906974
7,1,189,3,888732928,3.838289
8,1,202,5,875072442,3.743903
9,1,265,4,878542441,3.723265


In [11]:
# Save predict rating
# Spaces are stripped from the whole path so the algorithm name
# ("Item-Item CF") yields a file name without blanks.
output_path1 = "./Output/Predict_Data-{}_Model-{}.csv".format(rs1.dataset_name, rs1.algo_name).replace(" ", "")
# to_csv does not create missing folders; make sure ./Output exists first.
os.makedirs(os.path.dirname(output_path1), exist_ok=True)
predicted_data1.to_csv(output_path1, index=False)

In [10]:
# Init instance
# Item-item CF on ml-1m: the first three columns of the training frame are
# passed as an ndarray, and uuCF=False selects item-item mode.
rs2 = CollaborativeFiltering(ratings_train2.iloc[:, :3].values, "ml-1m", uuCF=False)
# Build the id maps and the sparse utility matrix
rs2.fit()

Num users distinct :  3674
Num items distinct :  6040
Item-Item CF fit done


In [23]:
# Shape of the sparse rating matrix built by fit(). rs2 was created with
# uuCF=False, which swaps the first two training columns, so the rows are
# movies and the columns are users.
rs2.sparse_sim_matrix.shape

(3674, 6040)

In [25]:
# ids = [(5972, 593), (5952, 2401), (4933, 1805)]
# pred_ratings = rs.predict_rating_batch(ids).tolist()
# print("==================================================================================================")
# for (user_id, movie_id), pred_rating in zip(ids, pred_ratings):
#     true_rating = ratings_test[(ratings_test.userId == user_id) & (ratings_test.movieId == movie_id)].rating.values[0]
#     print("UserID : {:8d}, MovieID : {:8d} True rating : {:.1f}, Predict rating : {:.4f}".format(user_id, movie_id, true_rating, pred_rating))

Predicting 3 ratings ...
 33.33%  UserID :      593, MovieID :     5972, Number rated :   350, Predict rating : 3.8119
 66.67%  UserID :     2401, MovieID :     5952, Number rated :   107, Predict rating : 4.2706
100.00%  UserID :     1805, MovieID :     4933, Number rated :   247, Predict rating : 3.6016
UserID :     5972, MovieID :      593 True rating : 5.0, Predict rating : 3.8119
UserID :     5952, MovieID :     2401 True rating : 4.0, Predict rating : 4.2706
UserID :     4933, MovieID :     1805 True rating : 2.0, Predict rating : 3.6016


In [None]:
# Evaluate on the first 1000 rows of the ml-1m test split only (.iloc[:1000]),
# so the RMSE and run time returned here describe that subset, not the whole
# test set.
predicted_data2, rmse_test_error2, exec_time2 = rs2.evaluate_model(ratings_test2.iloc[:1000])

In [11]:
# Summary of the ml-1m evaluation, followed by a preview of the predictions
print(
    "Execute time : {:.4f} seconds".format(exec_time2),
    "RMSE on test data : {:.6f}".format(rmse_test_error2),
    sep="\n",
)
predicted_data2.head(10)

Execute time : 1051.3008 seconds
RMSE on test data : 1.009764


Unnamed: 0,userId,movieId,rating,unix_timestamp,Predict rating
0,5972,593,5,956952291,4.361448
1,5952,2401,4,957145342,3.749669
2,4933,1805,2,1011684938,3.410706
3,2181,587,2,975634724,3.431049
4,2513,1641,5,974072036,3.872057
5,2146,2028,5,974623190,4.380173
6,5111,1363,2,962335163,3.025216
7,801,1208,5,975400546,4.245515
8,929,2028,5,975190707,4.352185
9,3152,1073,3,1019015734,3.881589
