In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy

import os

### Get and explore data
Dataset - https://www.kaggle.com/grouplens/movielens-20m-dataset

In [None]:
# Relative directory that holds the ratings CSV
DATA_PATH = "../data/100K"
ratings_csv = os.path.join(DATA_PATH, 'rating.csv')
df = pd.read_csv(ratings_csv)
df.head()

In [None]:
# userId / movieId keep the dtype they were read with; no cast to str is applied.
# Distinct user and movie IDs, in order of first appearance in df.
users = pd.unique(df['userId'])
movies = pd.unique(df['movieId'])
print("Number of users", len(users))
print("Number of movies", len(movies))
print(df.head())

In [None]:
# Bar chart of how many rows carry each rating value
axes = sns.countplot(data=df, x='rating', palette="viridis")

In [None]:
# Global mean rating over every row of df: a single scalar,
# not a per-movie average.
mean_rating = df['rating'].mean()
mean_rating

### Train and Test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 

In [None]:
# Per-user chronological hold-out: for every user the most recent ratings go
# to `test` and the earlier ones to `train` (this is not a random split).

# Fraction of each user's ratings reserved for the test set
test_per = 0.2

# The per-user pieces are collected in lists and concatenated once at the end.
# Calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, and seeding it with an empty frame can leave object-dtype columns.
train_parts = []
test_parts = []

# sort=False keeps users in order of first appearance, i.e. the same order as
# iterating over df['userId'].unique().
for _, user_ratings in df.groupby('userId', sort=False):
    n = len(user_ratings)
    test_size = int(test_per * n)

    # Oldest first, so the tail of the frame holds the user's latest ratings
    user_ratings = user_ratings.sort_values('timestamp')

    # The last test_size + 1 rows are held out, so every user contributes at
    # least one test rating even when int(test_per * n) rounds down to 0.
    split_at = n - 1 - test_size
    train_parts.append(user_ratings.iloc[:split_at])
    test_parts.append(user_ratings.iloc[split_at:])

train = pd.concat(train_parts, ignore_index=True)
test = pd.concat(test_parts, ignore_index=True)


### Create Utility Matrix 2.0

The create_X() function outputs a sparse matrix X with four mapper dictionaries:

user_mapper: maps user id to user index
item_mapper: maps movie id to movie index
user_inv_mapper: maps user index to user id
item_inv_mapper: maps movie index to movie id
We need these dictionaries because they map which row/column of the utility matrix corresponds to which user/movie id.

Our X matrix is item-user (rows are movies, columns are users) and is a scipy.sparse.csr_matrix, which stores the data sparsely.

In [None]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Builds a sparse rating matrix from a ratings frame, plus ID/index lookups.

    Args:
        df: frame with "userId", "movieId" and "rating" columns

    Output: tuple (X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper)
        X: csr_matrix of shape (number of movies, number of users); rows are
            movies, columns are users, values are ratings
        user_mapper / item_mapper: id -> index
        user_inverse_mapper / item_inverse_mapper: index -> id
    """
    num_users = df['userId'].nunique(dropna=False)
    num_movies = df['movieId'].nunique(dropna=False)

    # np.unique returns the distinct ids sorted, so indices follow id order
    user_ids = np.unique(df["userId"])
    movie_ids = np.unique(df["movieId"])

    # Forward (id -> index) and inverse (index -> id) lookups, built together
    user_mapper, user_inverse_mapper = {}, {}
    for position, user_id in zip(range(num_users), user_ids):
        user_mapper[user_id] = position
        user_inverse_mapper[position] = user_id

    item_mapper, item_inverse_mapper = {}, {}
    for position, movie_id in zip(range(num_movies), movie_ids):
        item_mapper[movie_id] = position
        item_inverse_mapper[position] = movie_id

    # One (row, column, value) triple per rating row: row = movie, column = user
    rows = [item_mapper[movie_id] for movie_id in df['movieId']]
    cols = [user_mapper[user_id] for user_id in df['userId']]
    values = df["rating"].to_list()

    X = csr_matrix((values, (rows, cols)), shape=(num_movies, num_users))

    return X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper
  
# Build the rating matrix and the id <-> index lookups from the training split
(
    utility_matrix,
    user_mapper,
    item_mapper,
    user_inverse_mapper,
    item_inverse_mapper,
) = create_X(train)

### Item-item Recommendations with k-Nearest Neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
def find_similar_movies(movie_id, X, item_mapper, item_inv_mapper, movie_inv_mapper, k, metric='cosine'):
    """
    Finds the k nearest neighbours of a given movie id.

    Args:
        movie_id: id of the movie of interest
        X: utility matrix with one row per movie and one column per user
            (the orientation create_X returns)
        item_mapper: maps movie id to its row index in X
        item_inv_mapper: not used; accepted so existing calls keep working
        movie_inv_mapper: maps a row index in X back to the movie id
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations

    Output: list of at most k similar movie ids, nearest first, never
        containing movie_id itself

    Raises:
        KeyError: movie_id is not a key of item_mapper
    """
    # X is used as given: its rows are already movie vectors. Transposing it
    # would make the rows users, so the movie index would select a user.
    movie_ind = item_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, np.ndarray):
        movie_vec = movie_vec.reshape(1, -1)

    # Ask for k+1 because the query movie is itself among the fitted rows;
    # never ask for more neighbours than there are rows.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)

    # Drop the query movie by index rather than by position: with tied
    # distances it is not guaranteed to come back first.
    neighbour_ids = [
        movie_inv_mapper[ind]
        for ind in neighbour.ravel().tolist()
        if ind != movie_ind
    ]
    return neighbour_ids[:k]

neighbour_ids = find_similar_movies(
    movie_id=3,
    X=utility_matrix,
    item_mapper=item_mapper,
    item_inv_mapper=item_inverse_mapper,
    movie_inv_mapper=item_inverse_mapper,
    k=10,
)
print(neighbour_ids)

In [None]:
# TODO: implement top-k predictor
from sklearn.metrics import top_k_accuracy_score

### SVD Computation
decomposes a matrix into constituent arrays of feature vectors corresponding to each row and each column

In [None]:
from scipy.linalg import sqrtm

def svd(train, k):
    """
    Mean-centres a sparse rating matrix column by column and returns its
    truncated SVD.

    Cells with no stored value in `train` (and stored NaNs) are treated as
    missing: they are left out of the column means and are exactly zero in
    the centred matrix that gets decomposed. A dense copy of `train` is made.

    Args:
        train: scipy sparse rating matrix
        k: number of leading singular values/vectors to keep

    Output: (U, S, V, item_average)
        U: array (rows of train, at most k), left singular vectors
        S: array (at most k,), singular values, largest first
        V: array (at most k, columns of train), right singular vectors as rows
        item_average: array shaped like train; every row holds the column means
    """
    utilMat = np.asarray(train.toarray(), dtype=float)

    # toarray() writes 0 (never NaN) where nothing is stored, so the set of
    # observed cells is taken from the sparse structure, not from the values.
    stored = train.tocoo()
    observed = np.zeros(utilMat.shape, dtype=bool)
    observed[stored.row, stored.col] = True
    observed &= ~np.isnan(utilMat)

    # Column means over observed cells only; a column with none gets mean 0
    counts = observed.sum(axis=0)
    totals = np.where(observed, utilMat, 0.0).sum(axis=0)
    item_means = np.divide(
        totals, counts, out=np.zeros(utilMat.shape[1]), where=counts > 0
    )
    item_average = np.tile(item_means, (utilMat.shape[0], 1))

    # Remove the column average from observed cells; missing cells become 0,
    # which is what "filled with the average, then centred" amounts to.
    utilMat = np.where(observed, utilMat - item_average, 0.0)

    U, S, V = np.linalg.svd(utilMat, full_matrices=False)

    # Keep only the k most significant features
    S = S[0:k]
    U = U[:, 0:k]
    V = V[0:k, :]
    print('SVD done')
    return U, S, V, item_average

# Dense SVD of the training matrix, truncated to 20 features
U, S, V, item_average = svd(train=utility_matrix, k=20)

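The utility matrix is a movie-by-user rating matrix: each stored entry is the rating a user gave a movie, and pairs without a rating are not stored.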

In [None]:
# Imported explicitly: a bare `import scipy` does not guarantee that the
# `scipy.sparse.linalg` submodule has been loaded.
from scipy.sparse.linalg import svds

# Rank-15 truncated SVD computed directly on the sparse matrix.
# For utility_matrix of shape (movies, users): P is (movies, 15), S is (15,)
# and Q is (15, users). S rebinds the name returned by the svd() call above.
P, S, Q = svds(utility_matrix, k=15)

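P - movie-feature matrix. Dimension: number of movies by number of features.
S - singular values, one per feature.
Q - feature-user matrix. Dimension: number of features by number of users.

(`utility_matrix` has one row per movie and one column per user, so the left factor P describes movies and the right factor Q describes users.)

Prediction rule for user $u$ and movie $i$
$$
\widetilde{r}_{ui} = \sum_{f}{p_{i f} \cdot \sigma_{f} \cdot q_{f u}}
$$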

In [None]:
test.head(3)

In [None]:
from collections import defaultdict
# Lookup tables that answer -1 for an id that is missing from the training
# mappers. (Indexing a defaultdict with a missing key also stores that key.)
def _unknown_index():
    return -1

item_mapper_default = defaultdict(_unknown_index, item_mapper)
user_mapper_default = defaultdict(_unknown_index, user_mapper)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error


def predict_pair(user_id, item_id, P, S, Q):
    """
    Predicts one rating from the factors of a truncated SVD, P · diag(S) · Q.

    Args:
        user_id: raw user id, looked up in user_mapper_default
        item_id: raw movie id, looked up in item_mapper_default
        P: array (items, features); rows are addressed by item index
        S: array (features,); singular values
        Q: array (features, users); columns are addressed by user index

    Output: the predicted rating, or None when the user or the item has no
        index in the mappers
    """
    # .get() instead of [] so an unknown id is not inserted into the defaultdict
    item = item_mapper_default.get(item_id, -1)
    user = user_mapper_default.get(user_id, -1)
    if item == -1 or user == -1:
        return None

    # sum over f of P[item, f] * S[f] * Q[f, user]: each singular value scales
    # its own feature. Taking the dot product first and multiplying by S
    # afterwards would give dot(P, Q) * sum(S) instead.
    return (P[item, :] * S * Q[:, user]).sum()

def predict(data, P, S, Q):
    """
    Scores every (userId, movieId) row of `data` with predict_pair.

    Output: numpy array with one entry per row of `data`, in row order;
        an entry is None where predict_pair returned None
    """
    predictions = []
    for u_id, i_id in zip(data['userId'], data['movieId']):
        # TODO: move  P, S, Q to internal data
        predictions.append(predict_pair(u_id, i_id, P, S, Q))
    return np.array(predictions)

# Predictions for the held-out ratings; show the first 20
pred = predict(test, P, S, Q)
pred[:20]

In [None]:
# Drop test rows for which no prediction exists: predict() yields None when
# the user or the movie has no index in the training mappers.
has_prediction = np.array([p is not None for p in pred], dtype=bool)
not_present_in_train = np.where(~has_prediction)[0]

# Work on a re-indexed copy so that `test` itself is left untouched
test_rating = test['rating'].reset_index(drop=True)
valid_test_rating = test_rating[has_prediction].astype(float)
# pred is an object array whenever it contains None; cast the kept part to float
valid_prediction = pred[has_prediction].astype(float)

# Clip predictions
MIN_RATING = 0
MAX_RATING = 5
valid_prediction = np.clip(valid_prediction, MIN_RATING, MAX_RATING)
display(valid_prediction[:20])

mae = mean_absolute_error(valid_test_rating, valid_prediction)
# mean_squared_error returns the MSE; RMSE is its square root
rmse = np.sqrt(mean_squared_error(valid_test_rating, valid_prediction))
print(f'Test MAE: {mae:.2f}')
print(f'Test RMSE: {rmse:.2f}')