In [1]:
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
!mv ml-1m/ data/

--2021-06-23 20:00:15--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2021-06-23 20:00:17 (3.72 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [2]:
import os
import gc

import pandas as pd
import numpy as np

from scipy.linalg import sqrtm
from sklearn.metrics import mean_squared_error as mse

from tqdm import tqdm

In [3]:
ratings_list = [i.strip().split("::") for i in open('data/ratings.dat', 'r').readlines()]
data = pd.DataFrame(ratings_list, columns = ['user_id', 'movie_id', 'rating', 'timestamp'], dtype = int)
data['timestamp'] = data['timestamp'].apply(int)
data['rating'] = data['rating'].apply(float)
del ratings_list
gc.collect()

0

In [4]:
data.shape

(1000209, 4)

In [5]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5.0,978300760
1,1,661,3.0,978302109
2,1,914,3.0,978301968
3,1,3408,4.0,978300275
4,1,2355,5.0,978824291


In [6]:
print('Unique users:', data['user_id'].nunique())
print('Unique movies:', data['movie_id'].nunique())

Unique users: 6040
Unique movies: 3706


In [7]:
def split_df(data, test_size=0.2):
    data['rank_latest'] = data.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)

    data = data.merge(data.groupby(['user_id'])['timestamp'].count(), on='user_id')
    data.columns = ['user_id', 'movie_id', 'rating', 'timestamp', 'rank_latest', 'review_count']

    data['test_samples_count'] = data['review_count'] - np.floor(data['review_count'] * test_size)

    train_ratings = data[data['rank_latest'] <= data['test_samples_count']]
    test_ratings = data[data['rank_latest'] > data['test_samples_count']]

    train_ratings = train_ratings[['user_id', 'movie_id', 'rating']]
    test_ratings = test_ratings[['user_id', 'movie_id', 'rating']]
    return train_ratings, test_ratings

In [8]:
train, test = split_df(data, 0.15)

In [9]:
def create_utility_matrix(data):
    userList = data.iloc[:, 0].tolist()
    itemList = data.iloc[:, 1].tolist()
    valueList = data.iloc[:, 2].tolist()

    users = data['user_id'].unique()
    items = data['movie_id'].unique()

    users_index = {users[i]: i for i in range(len(users))}
    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}

    for i in range(0,len(data)):
        pd_dict[itemList[i]][users_index[userList[i]]] = valueList[i]
    X = pd.DataFrame(pd_dict)
    X.index = users
        
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    return X, users_index, items_index

In [10]:
def svd(train, k):
    util_mat = np.array(train)
    mask = np.isnan(util_mat)
    masked_arr = np.ma.masked_array(util_mat, mask)

    item_means = np.mean(masked_arr, axis=0)
    util_mat = masked_arr.filled(item_means)
    x = np.tile(item_means, (util_mat.shape[0],1))
    util_mat = util_mat - x

    U, s, V = np.linalg.svd(util_mat, full_matrices=False)
    s = np.diag(s)

    s = s[0:k,0:k]
    U = U[:,0:k]
    V = V[0:k,:]

    s_root = sqrtm(s)
    Usk = np.dot(U,s_root)
    skV = np.dot(s_root,V)

    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    return UsV

In [11]:
no_of_features = [8, 10, 12, 14, 17]
util_mat, users_index, items_index = create_utility_matrix(train)
for f in tqdm(no_of_features): 
    svdout = svd(util_mat, k=f)
    pred = []
    for _, row in test.iterrows():
        user = row['user_id']
        item = row['movie_id']
        u_index = users_index[user]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)

100%|██████████| 5/5 [06:35<00:00, 79.04s/it]


In [12]:
np.sqrt(mse(test.rating, pred))

0.9328478752246858