## Matrix factorization based reccomendation systems

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import random

import os
import warnings

from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model
warnings.filterwarnings('ignore')
%matplotlib inline

In [12]:
# To je za osebno testiranje baze
df = pd.read_csv("data/user_ratedmovies.dat", sep="\t")

df_unique_movieID = df.drop_duplicates(subset=["movieID"])
df_unique_movieID

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,72000,2571,4.0,20,6,2019,19,17,16
1,72000,4993,4.0,20,6,2019,19,17,16
2,72000,296,3.5,20,6,2019,19,17,16
3,72000,5952,4.0,20,6,2019,19,17,16
4,72000,480,3.0,20,6,2019,19,17,16
...,...,...,...,...,...,...,...,...,...
850668,71331,62796,3.0,19,11,2008,23,10,40
851719,71420,3585,4.0,22,11,2007,22,43,7
852677,71420,59065,1.5,30,10,2008,15,41,38
853205,71483,4421,2.5,2,9,2003,22,18,25


In [23]:
class UserItemData:
    def __init__(self, path, start_date=None, end_date=None, min_ratings=None):
        self.path = path
        self.start_date = pd.to_datetime(start_date, dayfirst=True) if start_date else None
        self.end_date = pd.to_datetime(end_date, dayfirst=True) if end_date else None
        self.min_ratings = min_ratings
        self.df = self.init_df()
        self.users_reviews = dict()
        self.movie_reviews = dict()
        self.users_reviews_list = dict()

    def init_df(self):
        df = pd.read_csv(self.path, sep="\t", encoding="latin1")
        df["date"] = pd.to_datetime( df["date_day"].astype(str) + "." + df["date_month"].astype(str) + "." + df["date_year"].astype(str), format="%d.%m.%Y", dayfirst=True)

        if self.start_date is not None:
            df = df[df["date"] >= self.start_date]

        if self.end_date is not None:
            df = df[df["date"] < self.end_date]

        df["ratings_count"] = df.groupby("movieID")["rating"].transform("count")

        if self.min_ratings is not None:
            df = df[df["ratings_count"] >= self.min_ratings]

        return df

    def nratings(self):
        return len(self.df)
    
    def nusers(self):
        return len(self.df["userID"].unique())
    
    def nitems(self):
        return len(self.df["movieID"].unique())
    
    def get_movie_ids(self):
        return self.df["movieID"].unique()
    
    def get_user_ids(self):
        return self.df["userID"].unique()
    
    def sum_ratings_for_movie(self, movieid):
        vs = sum(self.df[self.df["movieID"] == movieid]["rating"])
        return vs
    
    def nratings_for_movie(self, movieid):
        n = self.df[self.df["movieID"] == movieid].shape[0]
        return n
    
    def average_rating(self):
        return sum(self.df["rating"]) / self.nratings()
    
    def movieid_user_has_rated(self, userid):
        res = set(self.df[self.df["userID"] == userid]["movieID"])
        return res
    
    # za ViewsPredictor
    def len_ratings_for_movieid(self, movieid):
        res = len(self.df[self.df["movieID"] == movieid])
        return res
    
    # za STDPredictor
    def ratings_for_movieid(self, movieid):
        res = list(self.df[self.df["movieID"] == movieid]["rating"])
        return res
    
    def get_user_reviews(self, userID):
        if not self.users_reviews:
            self.init_user_reviews()

        return self.users_reviews.get(userID, dict())
    
    # { userID : [(ocena1, film1), (ocena2, film2)...] }
    def init_user_reviews(self):
        for userID, user_df in self.df.groupby("userID", sort=False):
            ratingsForUser = user_df["rating"].to_list()
            movieIDsForUser = user_df["movieID"].to_list()

            pairs_list = zip(movieIDsForUser, ratingsForUser)

            self.users_reviews[userID] = dict(pairs_list)

    def get_user_reviews_as_list(self, userID):
        if not self.users_reviews_list:
            self.init_user_reviews_as_list()

        return self.users_reviews_list.get(userID, [])

    def init_user_reviews_as_list(self):
        for userID, user_df in self.df.groupby("userID", sort=False):
            ratingsForUser = user_df["rating"].to_list()
            movieIDsForUser = user_df["movieID"].to_list()

            pairs_list = list(zip(movieIDsForUser, ratingsForUser))

            self.users_reviews_list[userID] = list(pairs_list)

    def init_movie_reviews(self):
        for movieID, movie_df in self.df.groupby("movieID", sort=False):
            ratingsForMovie = movie_df["rating"].to_list()
            userIDsForMovie = movie_df["userID"].to_list()

            pairs = zip(userIDsForMovie, ratingsForMovie)

            self.movie_reviews[movieID] = dict(pairs)

    def get_review_for_user(self, movieID, userID):
        return self.get_user_reviews(userID).get(movieID)
    
    def get_movie_reviews(self, movieID):
        if not self.movie_reviews:
            self.init_movie_reviews()

        return self.movie_reviews.get(movieID, dict())
    
    def most_popular_shows(self, n=10):
        most_pop = []

        for movie in self.df["movieID"].unique():
            cnt = self.len_ratings_for_movieid(movie)
            most_pop.append((movie, cnt))

        most_pop.sort(key=lambda x: x[1], reverse=True)
        return most_pop
    
    # k = ≈°tevilo splitov
    def split_data_for_cross(self, k, seed):
        shuffled_df = self.df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
        split_dfs = np.array_split(shuffled_df, k)
        return split_dfs
    
    def split_data_for_increment(self, k, seed=None):
        sorted_df = self.df.sort_values("date").reset_index(drop=True)
        split_dfs = np.array_split(sorted_df, k)
        return split_dfs

    # Da lahko direktno vstavljam df v uim brez __init__
    @classmethod
    def from_df(cls,df):
        obj = cls.__new__(cls) 
        obj.df = df

        obj.users_reviews = {}
        obj.movie_reviews = {}
        obj.users_reviews_list = {}
        obj.path = None
        obj.start_date = None
        obj.end_date = None
        obj.min_ratings = None

        return obj

    # Methods for MF with NN
    def split_data(self):
        train, test = train_test_split(self.df, test_size=0.2, random_state=42)
        return train, test

In [7]:
class MovieData:
    def __init__(self, path):
        self.df = pd.read_csv(path, sep="\t", encoding="latin1")

    def get_title(self, movieID):
        return self.df.loc[self.df['id'] == movieID]["title"].iloc[0]

In [8]:
class Recommender:
    def __init__(self, predictor):
        self.predictor = predictor
        self.uim = None

    def fit(self, X):
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        pred = self.predictor.predict(userID)

        if not rec_seen:
            seen = self.uim.movieid_user_has_rated(userID)
            for movieid in seen:
                if movieid in pred:
                    pred.pop(movieid, None)

        recommends = sorted(pred.items(), key=lambda x: x[1], reverse=True)[:n]

        return recommends
    
    def evaluate(self, test_data, top_n=20, printing=True):
        #userIDs = test_data.get_user_ids()
        #userIDs = set(test_data.get_user_ids()) & set(self.uim.get_user_ids())
        train_data = self.uim
        # Da uporabnike vedno obravnavam ob istem vrstnem redu
        userIDs = sorted(set(test_data.get_user_ids()) & set(train_data.get_user_ids()))

        count = 0
        stevc = 0.0
        sum_abs = 0.0
        
        sum_precision = 0.0
        sum_recall = 0.0
        sum_f1 = 0.0
        n_users = 0

        for user in userIDs:
            # MSE
            test_reviews = test_data.get_user_reviews(user) # {movieID: true_rating}
            if not test_reviews:
                continue

            pred = self.predictor # {movieID: predicted_rating}
            pred = pred.predict(user)

            for movieID, true_rating in test_reviews.items():
                if movieID not in pred:
                    continue

                error = pred[movieID] - true_rating
                stevc += ((error) ** 2)
                sum_abs += abs(error)
                count += 1

            # precision, recall, F1
            seen = train_data.movieid_user_has_rated(user)
            user_avg = sum(test_reviews.values()) / len(test_reviews)

            candidates = dict()
            relevant = set()
            recommends = set()

            for movieID, true_rating in test_reviews.items():
                if true_rating > user_avg:
                    relevant.add(movieID)

            # ƒçe nima relevant filmov, nima smisla raƒçunat recall/F1 za tega userja
            if len(relevant) == 0:
                continue

            for movieID, pred_rating in pred.items():
                if movieID not in seen:
                    candidates[movieID] = pred_rating

            # recommends_list = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:top_n]
            # ƒçe ima movieID isto oceno x[1] sortiri po movieID x[0]
            recommends_list = sorted(candidates.items(), key=lambda x: (-x[1], x[0]))[:top_n]
            
            for movieID, _ in recommends_list:
                recommends.add(movieID)

            tp = len(recommends & relevant)

            if len(recommends) > 0:
                precision = tp / len(recommends)
            else:
                precision = 0.0

            recall = tp / len(relevant)

            if (precision + recall) > 0:
                f1 = (2 * precision * recall) / (precision + recall)
            else:
                f1 = 0.0

            sum_precision += precision
            sum_recall += recall
            sum_f1 += f1
            n_users += 1

        # climax
        if count == 0:
            mse = 0.0
            mae = 0.0
            rmse = 0.0
        else:
            mse = stevc / count
            mae = sum_abs / count
            rmse = mse ** 0.5
            
        if n_users == 0:
            avg_precision = 0.0
            avg_recall = 0.0
            avg_f1 = 0.0
        else:
            avg_precision = sum_precision / n_users
            avg_recall = sum_recall / n_users
            avg_f1 = sum_f1 / n_users

        if printing:
            print("===== Evaluation =====")
            print(f"Top-N:      {top_n}")
            print(f"MSE:        {mse:.6f}")
            print(f"RMSE:       {rmse:.6f}")
            print(f"MAE:        {mae:.6f}")
            print(f"Precision:  {avg_precision:.6f}")
            print(f"Recall:     {avg_recall:.6f}")
            print(f"F1:         {avg_f1:.6f}")
            print("======================")
        return mse, rmse, mae, avg_precision, avg_recall, avg_f1
    
    def incremental_testing(self, uim_full, predictor, top_n=20, k=10, seed=42, printing=True):
        dfs = uim_full.split_data_for_increment(k, seed)
        sum_mse = 0.0
        sum_rmse = 0.0
        sum_mae = 0.0
        sum_precision = 0.0
        sum_recall = 0.0
        sum_f1 = 0.0
        count = 0

        for i in range(1, len(dfs)):
            # Izvajamo teste za vsak df
            test_df = dfs[i]
            train_dfs = dfs[:i]

            train_df = pd.concat(train_dfs)

            uim_test = UserItemData.from_df(test_df)
            uim_train = UserItemData.from_df(train_df)

            rec = Recommender(predictor())
            rec.fit(uim_train)

            mse_i, rmse_i, mae_i, p_i, r_i, f1_i = rec.evaluate(uim_test, top_n, False)

            sum_mse += mse_i
            sum_rmse += rmse_i
            sum_mae += mae_i
            sum_precision += p_i
            sum_recall += r_i
            sum_f1 += f1_i
            count += 1

        if count == 0:
            return (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

        sum_mse /= count
        sum_rmse /= count
        sum_mae /= count
        sum_precision /= count
        sum_recall /= count
        sum_f1 /= count

        if printing:
            print("== INCREMENT TESTING ==")
            print(f"Top-N:      {top_n}")
            print(f"MSE:        {sum_mse:.6f}")
            print(f"RMSE:       {sum_rmse:.6f}")
            print(f"MAE:        {sum_mae:.6f}")
            print(f"Precision:  {sum_precision:.6f}")
            print(f"Recall:     {sum_recall:.6f}")
            print(f"F1:         {sum_f1:.6f}")
            print("=======================")
    
        return sum_mse, sum_rmse, sum_mae, sum_precision, sum_recall, sum_f1
    

    def cross_validation(self, uim_full, predictor, top_n=20, k=10, seed=42, printing=True):
        dfs = uim_full.split_data_for_cross(k, seed)
        sum_mse = 0.0
        sum_rmse = 0.0
        sum_mae = 0.0
        sum_precision = 0.0
        sum_recall = 0.0
        sum_f1 = 0.0
        # remember k je count

        for i in range(k):
            # Izvajamo teste za vsak df
            test_df = dfs[i]

            train_dfs = []

            for j in range(k):
                if j == i:
                    continue
                train_dfs.append(dfs[j])

            train_df = pd.concat(train_dfs)

            uim_test = UserItemData.from_df(test_df)
            uim_train = UserItemData.from_df(train_df)

            rec = Recommender(predictor())
            rec.fit(uim_train)

            mse_i, rmse_i, mae_i, p_i, r_i, f1_i = rec.evaluate(uim_test, top_n, False)

            sum_mse += mse_i
            sum_rmse += rmse_i
            sum_mae += mae_i
            sum_precision += p_i
            sum_recall += r_i
            sum_f1 += f1_i

        sum_mse /= k
        sum_rmse /= k
        sum_mae /= k
        sum_precision /= k
        sum_recall /= k
        sum_f1 /= k

        if printing:
            print("== CROSS VALIDATION ==")
            print(f"Top-N:      {top_n}")
            print(f"MSE:        {sum_mse:.6f}")
            print(f"RMSE:       {sum_rmse:.6f}")
            print(f"MAE:        {sum_mae:.6f}")
            print(f"Precision:  {sum_precision:.6f}")
            print(f"Recall:     {sum_recall:.6f}")
            print(f"F1:         {sum_f1:.6f}")
            print("======================")
    
        return sum_mse, sum_rmse, sum_mae, sum_precision, sum_recall, sum_f1


In [9]:
# Matrix factorization
class MatrixFactorizationPredictor:
    def __init__(self, k = 30, lr = 0.01, reg = 0.05, epochs = 20):
        self.items = None

        # k (≈°t. latentnih faktorjev): koliko ‚Äúskritih okusov‚Äù model dovoli. Veƒç k
        # k ‚Üí bolj fleksibilen model, veƒçja nevarnost overfittinga, poƒçasneje uƒçenje.
        self.k = k

        # ùõæ (learning rate): kako velik korak naredi≈° pri SGD. Prevelik ‚Üí eksplodira, premajhen ‚Üí se uƒçi sto let.
        self.lr = lr

        # Œª (regularizacija): kaznuje velike parametre. Premajhen ‚Üí overfitting, prevelik ‚Üí vse splo≈°ƒçi proti povpreƒçju.
        self.reg = reg

        # T (epochs): kolikokrat gre≈° ƒçez vse ocene. Premalo ‚Üí underfitting, preveƒç ‚Üí overfitting (ƒçe Œª ni dovolj).
        self.epochs = epochs

        self.user2idx = dict()
        self.idx2user = list()
        self.movie2idx = dict()
        self.idx2movie = list()

        self.mu = None
        self.U = None
        self.I = None
        self.P = None
        self.Q = None

        self.bu = None
        self.bi = None

        self.omega = list()

    def fit(self, X):
        self.items = X

        self.user2idx.clear()
        self.idx2user.clear()
        self.movie2idx.clear()
        self.idx2movie.clear()
        self.init_all_params()
        
        if (not(
            len(self.P) == self.U and len(self.P[0]) == self.k and
            len(self.Q) == self.I and len(self.Q[0]) == self.k and
            len(self.bu) == self.U and
            len(self.bi) == self.I and
            len(self.user2idx) == self.U and
            len(self.movie2idx) == self.I )):
            print("MF init FAILED")
            return

        self.omega.clear()
        for userID in self.users:
            userID_ratings = self.items.get_user_reviews_as_list(userID)
            u_idx = self.user2idx[userID]
            for item in userID_ratings:
                i_idx = self.movie2idx[item[0]]
                self.omega.append((u_idx, i_idx, item[1]))

        assert len(self.omega) == self.items.nratings()

        # SGD - stohastiƒçni gradientni spust
        for epoch in range(self.epochs):
            np.random.shuffle(self.omega)
            for triplet in self.omega:
                u, i, r = triplet

                dot_product_P_Q = 0.0
                # OBSTAJA TUDI np.dot(self.P[u], self.Q[i]) naslednjiƒç za dot producte listov
                for f in range(self.k):
                    dot_product_P_Q += self.P[u][f] * self.Q[i][f]

                r_pred = self.mu + self.bu[u] + self.bi[i] + dot_product_P_Q
                error = r - r_pred

                # bias update
                self.bu[u] += self.lr * (error - self.reg * self.bu[u])
                self.bi[i] += self.lr * (error - self.reg * self.bi[i])

                # factors update
                for f in range(self.k):
                    p_uf = self.P[u][f]
                    q_if = self.Q[i][f]

                    self.P[u][f] += self.lr * (error * q_if - self.reg * p_uf)
                    self.Q[i][f] += self.lr * (error * p_uf - self.reg * q_if)
    
    def init_all_params(self):
        self.mu = self.items.average_rating()
        self.users = self.items.get_user_ids()
        self.movies = self.items.get_movie_ids()

        # ≈†t uporabnikov
        self.U = len(self.users)
        # ≈†t filmov
        self.I = len(self.movies)

        for i in range(self.U):
            userID = self.users[i]
            self.user2idx[userID] = i 
            self.idx2user.append(userID)

        for j in range(self.I):
            movieID = self.movies[j]
            self.movie2idx[movieID] = j 
            self.idx2movie.append(movieID)
    
        # P = U x k, z random vrednostimi
        self.P = [np.random.normal(0.0, 0.1, self.k) for i in range(self.U)]
        # Q = I x k, z random vrednostimi
        self.Q = [np.random.normal(0.0, 0.1, self.k) for i in range(self.I)]

        # bias uporabnika u
        self.bu = [0 for i in range(self.U)]
        # bias filma i
        self.bi = [0 for i in range(self.I)]

    def predict(self, userID):
        pred = {}
        known_user = userID in self.user2idx
        u = self.user2idx[userID] if known_user else None

        for i in range(self.I):
            movieID = self.idx2movie[i]

            if not known_user:
                pred[movieID] = self.mu + self.bi[i]
                continue

            dot = float(np.dot(self.P[u], self.Q[i]))
            pred[movieID] = self.mu + self.bu[u] + self.bi[i] + dot

        return pred

In [18]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = MatrixFactorizationPredictor(k=30, lr=0.01, reg=0.05, epochs=20)
rec = Recommender(rp)
rec.fit(uim)

rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: The Usual Suspects, ocena: 4.396858563374115
Film: Shichinin no samurai, ocena: 4.231762555929509
Film: Sin City, ocena: 4.199868901405319
Film: The Silence of the Lambs, ocena: 4.168903310641527
Film: Rain Man, ocena: 4.022799273304572
Film: The Incredibles, ocena: 3.95708736201697
Film: Indiana Jones and the Last Crusade, ocena: 3.9260468770180514
Film: Good Will Hunting, ocena: 3.9119279635697377
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 3.899087176617178
Film: A Beautiful Mind, ocena: 3.881534687948428
Film: Catch Me If You Can, ocena: 3.8617874334184457
Film: Monsters, Inc., ocena: 3.8609160698391993
Film: The Lord of the Rings: The Return of the King, ocena: 3.848537339228621
Film: Die Hard, ocena: 3.840696251998754
Film: Finding Nemo, ocena: 3.83274066990011


In [None]:
# ≈†e zame 72000
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = MatrixFactorizationPredictor(k=30, lr=0.01, reg=0.05, epochs=20)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(72000, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: The Usual Suspects, ocena: 4.029337889009897
Film: The Lord of the Rings: The Return of the King, ocena: 4.012986389058724
Film: Schindler's List, ocena: 3.99906493986563
Film: Memento, ocena: 3.925710158880197
Film: Le fabuleux destin d'Am√©lie Poulain, ocena: 3.9195261610844727
Film: Saving Private Ryan, ocena: 3.8987664263092623
Film: Forrest Gump, ocena: 3.887178127226955
Film: Braveheart, ocena: 3.8816074805790586
Film: Gladiator, ocena: 3.877427497736347
Film: Good Will Hunting, ocena: 3.8739941526407065
Film: A Beautiful Mind, ocena: 3.85088469558263
Film: The Sixth Sense, ocena: 3.830253120900604
Film: The Godfather, ocena: 3.8291577386564013
Film: American Beauty, ocena: 3.815447519734811
Film: Shichinin no samurai, ocena: 3.775728064311421


In [None]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = MatrixFactorizationPredictor(k=30, lr=0.01, reg=0.05, epochs=20)
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.evaluate(uim_test, 20)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.cross_validation(
    uim_test, lambda: MatrixFactorizationPredictor(k=30, lr=0.01, reg=0.05, epochs=20),
    top_n=20, k=10, seed=42
)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.incremental_testing(
    uim_test, lambda: MatrixFactorizationPredictor(k=30, lr=0.01, reg=0.05, epochs=20),
    top_n=20, k=10, seed=42
)

===== Evaluation =====
Top-N:      20
MSE:        0.677068
RMSE:       0.822841
MAE:        0.607208
Precision:  0.093711
Recall:     0.128266
F1:         0.087592


  return bound(*args, **kwds)


== CROSS VALIDATION ==
Top-N:      20
MSE:        0.533023
RMSE:       0.730013
MAE:        0.545657
Precision:  0.077875
Recall:     0.643235
F1:         0.133432


  return bound(*args, **kwds)


== INCREMENT TESTING ==
Top-N:      20
MSE:        0.564826
RMSE:       0.750079
MAE:        0.557908
Precision:  0.054534
Recall:     0.461178
F1:         0.087208


### Matrix Factorization using Neural networks

In [54]:
# Matrix factorization in nerual networks
uim = UserItemData('data/user_ratedmovies.dat')
df = uim.df.copy()

# datetime -> unix seconds (ƒçe rabi≈°)
df["timestamp"] = df["date"].astype("int64") // 10**9

df["user_idx"], user_uniques = pd.factorize(df["userID"])
df["movie_idx"], movie_uniques = pd.factorize(df["movieID"])

train, test = train_test_split(df, test_size=0.2, random_state=42)

In [55]:
user_uniques

Index([72000,    75,    78,   127,   170,   175,   190,   267,   325,   383,
       ...
       71331, 71420, 71478, 71483, 71487, 71497, 71509, 71525, 71529, 71534],
      dtype='int64', length=2114)

In [56]:
df

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second,date,ratings_count,timestamp,user_idx,movie_idx
0,72000,2571,4.0,20,6,2019,19,17,16,2019-06-20,1671,1560988800,0,0
1,72000,4993,4.0,20,6,2019,19,17,16,2019-06-20,1577,1560988800,0,1
2,72000,296,3.5,20,6,2019,19,17,16,2019-06-20,1538,1560988800,0,2
3,72000,5952,4.0,20,6,2019,19,17,16,2019-06-20,1529,1560988800,0,3
4,72000,480,3.0,20,6,2019,19,17,16,2019-06-20,1449,1560988800,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855616,71534,44555,4.0,3,12,2007,3,5,38,2007-12-03,294,1196640000,2113,1068
855617,71534,46578,4.0,3,12,2007,2,56,44,2007-12-03,731,1196640000,2113,513
855618,71534,48516,4.5,3,12,2007,2,53,46,2007-12-03,804,1196640000,2113,1196
855619,71534,61075,5.0,10,10,2008,9,56,5,2008-10-10,16,1223596800,2113,7487


In [57]:
# creating book embedding path
movie_input = Input(shape=[1], name="Movie-Input")
movie_embedding = Embedding(len(movie_uniques)+1, 5, name="Movie-Embedding")(movie_input)
movie_vec = Flatten(name="Flatten-Books")(movie_embedding)

# creating user embedding path
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(len(user_uniques)+1, 5, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)

# concatenate features
conc = Concatenate()([movie_vec, user_vec])

# add fully-connected-layers
# 128 nevronov
fc1 = Dense(128, activation='relu')(conc)
# 32 nevronov
fc2 = Dense(32, activation='relu')(fc1)
out = Dense(1)(fc2)

# Create model and compile it
model2 = Model([user_input, movie_input], out)
model2.compile('adam', 'mean_squared_error')

In [58]:
history = model2.fit(
    [train["user_idx"], train["movie_idx"]],
    train["rating"],
    epochs=5,
    verbose=1
)

Epoch 1/5


[1m21391/21391[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 683us/step - loss: 0.7060
Epoch 2/5
[1m21391/21391[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m14s[0m 655us/step - loss: 0.6273
Epoch 3/5
[1m21391/21391[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 689us/step - loss: 0.6078
Epoch 4/5
[1m21391/21391[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 707us/step - loss: 0.5966
Epoch 5/5
[1m21391/21391[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 680us/step - loss: 0.5880


In [59]:
predictions = model2.predict([test["user_idx"].head(10), test["movie_idx"].head(10)])
for i in range(10):
    print(float(predictions[i]), test["rating"].iloc[i])

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 52ms/step
3.755413055419922 3.5
3.138627529144287 2.0
3.6818456649780273 4.5
3.628620147705078 3.5
3.1930270195007324 3.5
2.7255492210388184 3.0
3.465062141418457 4.5
3.753598213195801 2.5
3.478011131286621 4.0
4.398620128631592 5.0


In [76]:
class NCFPredictor:
    def __init__(self, emb_dim=5, hidden1=128, hidden2=32,
                 lr=1e-3, epochs=5, batch_size=1024,
                 rating_min=1.0, rating_max=5.0, verbose=0):
        self.emb_dim = emb_dim
        self.hidden1 = hidden1
        self.hidden2 = hidden2
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.rating_min = rating_min
        self.rating_max = rating_max
        self.verbose = verbose

        self.model = None
        self.uim = None

        # mappingi
        self.user2idx = {}
        self.idx2user = []
        self.movie2idx = {}
        self.idx2movie = []

        self.mu = None

    def _build_mappings(self, df):
        self.user2idx.clear(); self.idx2user.clear()
        self.movie2idx.clear(); self.idx2movie.clear()

        # zvezni indeksi 0..N-1
        user_codes, user_uniques = pd.factorize(df["userID"], sort=False)
        movie_codes, movie_uniques = pd.factorize(df["movieID"], sort=False)

        # shrani mapiranja
        self.idx2user = list(user_uniques)
        self.idx2movie = list(movie_uniques)
        self.user2idx = {u:i for i,u in enumerate(self.idx2user)}
        self.movie2idx = {m:i for i,m in enumerate(self.idx2movie)}

        # vrni tudi kode za fit
        return user_codes.astype("int32"), movie_codes.astype("int32")

    def _build_model(self, n_users, n_items):
        user_input = Input(shape=(1,), name="User-Input")
        movie_input = Input(shape=(1,), name="Movie-Input")

        user_vec = Flatten(name="Flatten-Users")(
            Embedding(n_users, self.emb_dim, name="User-Embedding")(user_input)
        )
        movie_vec = Flatten(name="Flatten-Movies")(
            Embedding(n_items, self.emb_dim, name="Movie-Embedding")(movie_input)
        )

        conc = Concatenate()([movie_vec, user_vec])
        fc1 = Dense(self.hidden1, activation="relu")(conc)
        fc2 = Dense(self.hidden2, activation="relu")(fc1)
        out = Dense(1)(fc2)

        model = Model([user_input, movie_input], out)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(self.lr),
            loss="mean_squared_error",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"),
                     tf.keras.metrics.MeanAbsoluteError(name="mae")]
        )
        return model

    def fit(self, X):
        # X je UserItemData
        self.uim = X
        df = X.df.copy()
        self.mu = float(X.average_rating())

        user_idx, movie_idx = self._build_mappings(df)
        y = df["rating"].astype("float32").to_numpy()

        n_users = len(self.idx2user)
        n_items = len(self.idx2movie)

        self.model = self._build_model(n_users, n_items)

        # malo validacije za stabilnost
        self.model.fit(
            [user_idx, movie_idx],
            y,
            validation_split=0.1,
            epochs=self.epochs,
            batch_size=self.batch_size,
            verbose=self.verbose,
            shuffle=True
        )

    def predict(self, userID):
        """
        Vrne {movieID: score} za vse filme, ki jih model pozna.
        Cold-start user: vrne globalni avg za vse.
        """
        pred = {}

        # cold-start user
        if userID not in self.user2idx:
            for mid in self.idx2movie:
                pred[mid] = self.mu
            return pred

        u = self.user2idx[userID]
        n_items = len(self.idx2movie)

        user_vec = np.full((n_items,), u, dtype=np.int32)
        item_vec = np.arange(n_items, dtype=np.int32)

        scores = self.model.predict([user_vec, item_vec], batch_size=4096, verbose=0).reshape(-1)
        scores = np.clip(scores, self.rating_min, self.rating_max)

        for i, movieID in enumerate(self.idx2movie):
            pred[movieID] = float(scores[i])

        return pred

In [64]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)

rp = NCFPredictor(emb_dim=10, hidden1=128, hidden2=32, lr=1e-3, epochs=5, batch_size=2048, verbose=0)
rec = Recommender(rp)
rec.fit(uim)

rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print(f"Film: {md.get_title(idmovie)}, ocena: {val}")

Film: The Usual Suspects, ocena: 4.364716053009033
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.17069673538208
Film: The Lord of the Rings: The Return of the King, ocena: 4.1641693115234375
Film: The Silence of the Lambs, ocena: 4.149110317230225
Film: Shichinin no samurai, ocena: 4.130641460418701
Film: The Lord of the Rings: The Two Towers, ocena: 4.105669975280762
Film: Indiana Jones and the Last Crusade, ocena: 4.01454496383667
Film: The Incredibles, ocena: 3.9941794872283936
Film: Sin City, ocena: 3.990361452102661
Film: Good Will Hunting, ocena: 3.9900593757629395
Film: A Beautiful Mind, ocena: 3.9706780910491943
Film: Batman Begins, ocena: 3.9615061283111572
Film: Rain Man, ocena: 3.940523147583008
Film: Finding Nemo, ocena: 3.9392282962799072
Film: Monsters, Inc., ocena: 3.9288179874420166


In [65]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)

rp = NCFPredictor(emb_dim=10, hidden1=128, hidden2=32, lr=1e-3, epochs=5, batch_size=2048, verbose=0)
rec = Recommender(rp)
rec.fit(uim)

rec_items = rec.recommend(72000, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print(f"Film: {md.get_title(idmovie)}, ocena: {val}")

Film: The Godfather, ocena: 4.152354717254639
Film: The Usual Suspects, ocena: 4.0861124992370605
Film: Le fabuleux destin d'Am√©lie Poulain, ocena: 4.027885913848877
Film: Memento, ocena: 3.9926304817199707
Film: Schindler's List, ocena: 3.967477560043335
Film: Monty Python and the Holy Grail, ocena: 3.9452831745147705
Film: American Beauty, ocena: 3.9297618865966797
Film: Raiders of the Lost Ark, ocena: 3.9228501319885254
Film: The Lord of the Rings: The Return of the King, ocena: 3.890263319015503
Film: Eternal Sunshine of the Spotless Mind, ocena: 3.8876638412475586
Film: Blade Runner, ocena: 3.87583327293396
Film: The Silence of the Lambs, ocena: 3.872387170791626
Film: Shichinin no samurai, ocena: 3.866442918777466
Film: Fargo, ocena: 3.8330790996551514
Film: Reservoir Dogs, ocena: 3.82381534576416


In [None]:
uim_full = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)

rp = NCFPredictor(emb_dim=10, hidden1=128, hidden2=32, lr=1e-3, epochs=5, batch_size=2048, verbose=0)
rec = Recommender(rp)

rec.fit(uim_full)
mse, rmse, mae, precision, recall, f1 = rec.evaluate(uim_full, top_n=20)


===== Evaluation =====
Top-N:      20
MSE:        0.713145
RMSE:       0.844479
MAE:        0.635274
Precision:  0.019750
Recall:     0.073861
F1:         0.027108


In [77]:
train_uim = UserItemData('data/user_ratedmovies.dat', min_ratings=200, end_date='2.1.2008')
test_uim  = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')

rp = NCFPredictor(emb_dim=10, hidden1=128, hidden2=32, lr=1e-3, epochs=5, batch_size=2048, verbose=0)
rec = Recommender(rp)

rec.fit(train_uim)
mse, rmse, mae, precision, recall, f1 = rec.evaluate(test_uim, top_n=20)

===== Evaluation =====
Top-N:      20
MSE:        0.706072
RMSE:       0.840281
MAE:        0.628556
Precision:  0.022834
Recall:     0.079833
F1:         0.030928


In [79]:
uim_cv = UserItemData('data/user_ratedmovies.dat',
                     min_ratings=200,
                     start_date='2.1.2008')

rec.cross_validation(
    uim_cv,
    predictor=lambda: NCFPredictor(
        emb_dim=10,
        hidden1=128,
        hidden2=32,
        lr=1e-3,
        epochs=3,        # CV je drag ‚Üí 3 je realno
        batch_size=2048,
        verbose=0
    ),
    top_n=20,
    k=10,
    seed=42,
    printing=True
)

== CROSS VALIDATION ==
Top-N:      20
MSE:        8.827227
RMSE:       2.970953
MAE:        2.847012
Precision:  0.072387
Recall:     0.562146
F1:         0.123280


(8.827227041199144,
 2.9709533922275098,
 2.847011765240405,
 0.07238681810308,
 0.5621458685843274,
 0.12328011461679253)

In [78]:
uim_inc = UserItemData('data/user_ratedmovies.dat',
                      min_ratings=200,
                      start_date='2.1.2008')

rec.incremental_testing(
    uim_inc,
    predictor=lambda: NCFPredictor(
        emb_dim=10,
        hidden1=128,
        hidden2=32,
        lr=1e-3,
        epochs=3,
        batch_size=2048,
        verbose=0
    ),
    top_n=20,
    k=10,
    seed=42,
    printing=True
)

== INCREMENT TESTING ==
Top-N:      20
MSE:        8.246907
RMSE:       2.871217
MAE:        2.749866
Precision:  0.036582
Recall:     0.242158
F1:         0.056109


(8.246906833722438,
 2.871217411788002,
 2.749865850636323,
 0.03658220895653988,
 0.24215845715661807,
 0.05610888330588409)