In [132]:
import os 
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import logging
logging.basicConfig(level="INFO")

sys.path.append(os.path.abspath(os.curdir))
data_dir = os.path.join(os.path.abspath(os.curdir), "ml-latest-small")

In [3]:
# read data
link_file = "links.csv"
link = pd.read_csv(os.path.join(data_dir, link_file))
display(link.head(3))

movie_file = "movies.csv"
movie = pd.read_csv(os.path.join(data_dir, movie_file))
movie["genres"] = movie["genres"].apply(lambda x: x.split("|"))
display(movie.head(3))

rating_file = "ratings.csv"
rating = pd.read_csv(os.path.join(data_dir, rating_file))
display(rating.head(3))

tag_file = "tags.csv"
tag = pd.read_csv(os.path.join(data_dir, tag_file))
display(tag.head(3))


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


## Collaborative filtering

Collaborative filtering uses similarities between users and items simultaneously to provide recommendations. This allows for serendipitous recommendations. The collaborative filtering models can recommend an item to user A based on the interest of a similar user B. The embedding can be learned automatically, without relying on hand-engineering of features. 

https://developers.google.com/machine-learning/recommendation/collaborative/basics

In [112]:
from surprise import Trainset
from surprise import Dataset
from surprise.reader import Reader
from surprise.similarities import pearson
from surprise.accuracy import rmse, mae
from sklearn.model_selection import train_test_split
from surprise.model_selection import split

In [74]:
# create training testing set for use-based ratings
def train_test_time(rating_df, col = "userId", ratio = 0.8):
    """
    This function is used for creating training and testing set from the rating matrix
    For each user, a 80-20 split is performed
    The split is based on the time stamps, used for testing 
    80% old ratings are used for training
    """

    train_rating, test_rating = pd.DataFrame(columns=rating_df.columns), pd.DataFrame(columns=rating_df.columns)

    for _, df in rating_df.copy().groupby(col):
        
        df = df.sort_values(by = "timestamp")
        if df.shape[0] >= 5:
            train_shape = int(ratio * df.shape[0])
            train_rating = train_rating.append(df.iloc[:train_shape,:], ignore_index=True)
            test_rating = test_rating.append(df.iloc[train_shape:,:], ignore_index=True)
        else:
            train_rating = train_rating.append(df, ignore_index=True)
            
    train_rating.reset_index(drop=True, inplace=True)
    test_rating.reset_index(drop=True, inplace=True)

    return train_rating, test_rating

def train_test_random(rating_df, col = "userId", ratio = 0.8):
    """
    This function is used for creating training and testing set from the rating matrix
    For each user, a 80-20 split is performed
    The split is based on the random split, used for cross-validation
    """
    train_rating, test_rating = pd.DataFrame(columns=rating_df.columns), pd.DataFrame(columns=rating_df.columns)

    for _, df in rating_df.copy().groupby(col):
        train, test = train_test_split(df, train_size = ratio)
        train_rating = train_rating.append(train)
        test_rating = test_rating.append(test)
    train_rating.reset_index(drop=True, inplace=True)
    test_rating.reset_index(drop=True, inplace=True)
    return train_rating, test_rating




In [84]:
# crate surprise data 
train, test = train_test_time(rating)
trainset = Dataset.load_from_df(train.copy()[["userId", "movieId", "rating"]], reader = Reader(line_format = "user item rating"))
trainset = trainset.build_full_trainset()
testset = Dataset.load_from_df(test.copy()[["userId", "movieId", "rating"]], reader = Reader(line_format = "user item rating"))
testset = testset.build_full_trainset()

## KNN-Based Collaborative Filtering with Pearson Correlation 

For user $u$ and $v$, the pearson's correlation is calcualted as 
$$ \rho(u, v) = \frac{\sum_{k \in I_u \land I_v} (r_{ku} - \mu_u)(r_{kv} - \mu_v)}{\sqrt{\sum_{k \in I_u \land I_v} (r_{ku} - \mu_u)^2} \sqrt{\sum_{k \in I_u \land I_v} (r_{kv} - \mu_v^2}} $$
The closest neighbors are queried by the Peason's correlations, and the unknown rating of user $u$ on j $\hat r_{uj}$ is calculated as 

$$\hat r_{uj} = \mu_u + \frac{\sum_{v\in P_u} \rho(u,v)(r_{vj} - \mu_v)}{\sum_{v\in P_u} \rho(u,v)}$$

The second half of the equation is take the weighted average of the mean-centered ratings, and weights are similarities, which are Pearson's correlations herein

In [7]:
from surprise.prediction_algorithms import knns

In [95]:
sim_options = {"name": "pearson_baseline",
"user_based": True,
"shrinkage": 0}

# base knn model
base_knn = knns.KNNBaseline(sim_options=sim_options)
# train_score
base_knn.fit(trainset)
train_pred = base_knn.test(trainset.build_testset())
logging.info("Mean squared error is {}, mean absolute error is {}".format(rmse(train_pred), mae(train_pred)))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.4380
MAE:  0.3078


In [133]:
# perform cross-validatin on the number of neighbors 
cross_val_set = Dataset.load_from_df(train.copy()[["userId", "movieId", "rating"]], reader = Reader(line_format = "user item rating"))
mae_score, rmse_score = {}, {}
for n_neighbors in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    kfold = split.KFold(n_splits=5, shuffle=True)

    mae_hist, rmse_hist = [], []
    for tr, val in kfold.split(cross_val_set):
        model = knns.KNNBaseline(k = n_neighbors,sim_options=sim_options, verbose=1)
        model.fit(tr)
        val_pred = model.test(val)
        mae_hist.append(mae(val_pred, verbose=False))
        rmse_hist.append(rmse(val_pred, verbose=False))
    mae_score[n_neighbors] = np.mean(mae_hist)
    rmse_score[n_neighbors] = np.mean(rmse_hist)
    logging.info("KFold cross-validation for {}: rmse score= {}, mae score = {}".format(n_neighbors, 
    mae_score[n_neighbors], 
    rmse_score[n_neighbors]))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline si

## Matrix Factorization