# User-based CF
* Dataset: MovieLens 100K Dataset
* Set NaN as 0
* 10% as testing data

## performance
* MSE_error= 6.941
* If the NaN cases are ignored, user-based CF achieves a better MSE (~1.9)
* I set NaN as 0 here so that the result can be compared with model-based CF (matrix factorization)

In [None]:
import numpy as np
import pandas as pd
from numba import jit
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the ratings table from the CSV file into a DataFrame
ratings_csv_path = '../input/movielens-dataset/ratings.csv'
rating = pd.read_csv(ratings_csv_path)

In [None]:
# Hold out 10% of the (user, movie, rating) triples for testing;
# the fixed seed keeps the split reproducible between runs.
rating_triples = rating[['userId', 'movieId', 'rating']]
train, test = train_test_split(
    rating_triples,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Build the inputs for the user-user similarity computation.
# user x movie table of training ratings; unrated cells are NaN
rating_new = train.pivot(index='userId', columns='movieId', values='rating')
# same table with the unrated cells replaced by 0
rating_zero = rating_new.fillna(0)
# user ids that occur in the training split (the row labels of the table)
uniqueId = rating_new.index.to_numpy()
# square matrix addressed as [userId - 1, userId - 1], hence sized by the largest id
largest_uid = max(uniqueId)
similarity = np.zeros((largest_uid, largest_uid))

@jit(nopython=True, parallel=True)
def cal_similarity(uniqueId, rating_zero, similarity):
    """Fill ``similarity`` in place with pairwise cosine similarities of users.

    Args:
        uniqueId: 1-D integer array of user ids. ``uniqueId[i]`` is the id of
            the user whose ratings are stored in row ``i`` of ``rating_zero``.
        rating_zero: 2-D array of ratings, one row per entry of ``uniqueId``,
            with missing ratings stored as 0.
        similarity: 2-D output array. The similarity of users ``u`` and ``v``
            is written to ``similarity[u - 1][v - 1]``, so it needs at least
            ``max(uniqueId)`` rows and columns. Diagonal entries and entries
            of ids absent from ``uniqueId`` are left untouched.
    """
    n_users = uniqueId.shape[0]

    # Each row norm is needed for every pair, so compute it only once.
    norms = np.empty(n_users)
    for i in range(n_users):
        norms[i] = np.sqrt((rating_zero[i] ** 2).sum())

    for i in range(n_users):
        # Rows are addressed by position (i, j), not by ``uid - 1``: the two
        # only coincide when the ids are exactly 1..n without gaps.
        row_i = rating_zero[i]
        out_i = uniqueId[i] - 1
        # Cosine similarity is symmetric, so visit each unordered pair once.
        for j in range(i + 1, n_users):
            out_j = uniqueId[j] - 1
            denom = norms[i] * norms[j]
            if denom == 0.0:
                # A user without any non-zero rating has no defined cosine.
                value = 0.0
            else:
                value = (row_i * rating_zero[j]).sum() / denom
            similarity[out_i][out_j] = value
            similarity[out_j][out_i] = value
    
# Hand the ratings to the compiled routine as a plain NumPy array;
# the result is written into the preallocated `similarity` matrix.
rating_matrix = rating_zero.to_numpy()
cal_similarity(uniqueId, rating_matrix, similarity)

In [None]:
# Calculate average rating for each user
# Only the training split is used: averaging over the full `rating` table
# would leak the held-out test ratings into the predictions.
r = train[['userId', 'rating']].groupby('userId').mean()
# Keep one row per user of the full dataset so later lookups by user id still
# succeed; a user with no training rating falls back to the global training mean.
all_user_ids = pd.Index(sorted(rating['userId'].unique()), name='userId')
r = r.reindex(all_user_ids).fillna(train['rating'].mean())

In [None]:
# Predict
# Each test rating is estimated as the user's mean rating plus the
# similarity-weighted mean deviation of the most similar users' ratings for
# that movie. A neighbour that has not rated the movie counts as rating 0
# (the "NaN as 0" convention), and non-positive predictions are clamped to 0.
neighborhood_size = 2
# Not used by this evaluation; kept so that code expecting the name still runs.
useless_data_idx = []
y_true = test['rating'].to_list()
y_pred = []

# userId -> [(neighbour userId, similarity), ...]; the neighbourhood depends
# only on the user, so it is computed once per user instead of once per row.
neighbor_cache = {}

for valid_uid, valid_movie in zip(test['userId'].astype(int),
                                  test['movieId'].astype(int)):
    if valid_uid not in neighbor_cache:
        if 0 < valid_uid <= similarity.shape[0]:
            sim_row = similarity[valid_uid - 1]
            # One argsort yields both the neighbour positions and their scores.
            top_idx = np.argsort(sim_row)[-neighborhood_size:]
            neighbor_cache[valid_uid] = [
                (int(idx) + 1, float(sim_row[idx])) for idx in top_idx
            ]
        else:
            # The user has no row in the similarity matrix: no neighbours.
            neighbor_cache[valid_uid] = []

    a = 0.0  # similarity-weighted sum of neighbour deviations
    b = 0.0  # sum of the similarities used as weights
    for neighbor_uid, ns in neighbor_cache[valid_uid]:
        if ns == 0:
            # Adds nothing to either sum; the slot may also belong to an id
            # that has no training data at all.
            continue
        try:
            rvi = rating_zero.at[neighbor_uid, valid_movie]
        except KeyError:
            # The movie (or the neighbour) is absent from the training table.
            rvi = 0
        a += ns * (rvi - r.at[neighbor_uid, 'rating'])
        b += ns

    user_mean = float(r.at[valid_uid, 'rating'])
    # Without any usable neighbour the best available estimate is the user's mean.
    cur_rating = user_mean + a / b if b != 0 else user_mean
    # Plain floats keep y_pred homogeneous for mean_squared_error.
    y_pred.append(float(cur_rating) if cur_rating > 0 else 0.0)

print("test_mse_error=", mean_squared_error(y_true, y_pred))