## Priporočilni sistem

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import random

In [56]:
# Ad-hoc inspection of the raw ratings file (personal sanity check of the data).
df = pd.read_csv("data/user_ratedmovies.dat", sep="\t")

# Drop repeated movieID rows so each movie appears once, then display the frame.
df_unique_movieID = df.drop_duplicates(subset=["movieID"])
df_unique_movieID

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,72000,2571,4.0,20,6,2019,19,17,16
1,72000,4993,4.0,20,6,2019,19,17,16
2,72000,296,3.5,20,6,2019,19,17,16
3,72000,5952,4.0,20,6,2019,19,17,16
4,72000,480,3.0,20,6,2019,19,17,16
...,...,...,...,...,...,...,...,...,...
850667,71331,62796,3.0,19,11,2008,23,10,40
851718,71420,3585,4.0,22,11,2007,22,43,7
852676,71420,59065,1.5,30,10,2008,15,41,38
853204,71483,4421,2.5,2,9,2003,22,18,25


Dodajanje stolpcev date, ki ga sestavimo iz podatkov o dnevu, mesecu in letu, ter ratings_count, kjer preštejemo število ocen za posamezen film (movieID).

Tu sem imel težave, ker moram najprej filtrirati po datumu in šele nato po številu ocen.

In [57]:
class UserItemData:
    """In-memory view of a tab-separated user/movie ratings file.

    The file is read once in the constructor and can be restricted to a date
    window and to movies with a minimum number of ratings. Per-user and
    per-movie lookup dictionaries are built lazily on first use.
    """

    def __init__(self, path, start_date=None, end_date=None, min_ratings=None):
        """Load the ratings file and apply the optional filters.

        Args:
            path: Path of the tab-separated ratings file.
            start_date: Inclusive lower bound, day-first (e.g. '12.1.2007'), or None.
            end_date: Exclusive upper bound, day-first, or None.
            min_ratings: Keep only movies that have at least this many ratings
                after the date filter has been applied, or None for no limit.
        """
        self.path = path
        self.start_date = pd.to_datetime(start_date, dayfirst=True) if start_date else None
        self.end_date = pd.to_datetime(end_date, dayfirst=True) if end_date else None
        self.min_ratings = min_ratings
        self.df = self.init_df()
        # Lazy caches, filled on first access by the init_* methods below.
        self.users_reviews = dict()       # {userID: {movieID: rating}}
        self.movie_reviews = dict()       # {movieID: {userID: rating}}
        self.users_reviews_list = dict()  # {userID: [(movieID, rating), ...]}

    def init_df(self):
        """Read the file, add `date` and `ratings_count` columns, apply filters.

        The date filter runs before the per-movie count is computed, so
        `ratings_count` (and therefore `min_ratings`) refers to the number of
        ratings inside the selected date window.

        Returns:
            The filtered DataFrame.
        """
        df = pd.read_csv(self.path, sep="\t", encoding="latin1")
        date_text = (
            df["date_day"].astype(str)
            + "." + df["date_month"].astype(str)
            + "." + df["date_year"].astype(str)
        )
        # An explicit format already fixes the day-first order.
        df["date"] = pd.to_datetime(date_text, format="%d.%m.%Y")

        if self.start_date is not None:
            df = df[df["date"] >= self.start_date]

        if self.end_date is not None:
            df = df[df["date"] < self.end_date]

        # Work on an independent copy so that adding a column to a filtered
        # slice does not trigger pandas' SettingWithCopyWarning.
        df = df.copy()
        df["ratings_count"] = df.groupby("movieID")["rating"].transform("count")

        if self.min_ratings is not None:
            df = df[df["ratings_count"] >= self.min_ratings]

        return df

    def nratings(self):
        """Return the number of rating rows that survived the filters."""
        return len(self.df)

    def get_movie_ids(self):
        """Return the distinct movie ids in order of first appearance."""
        return self.df["movieID"].unique()

    def get_user_ids(self):
        """Return the distinct user ids in order of first appearance."""
        return self.df["userID"].unique()

    def sum_ratings_for_movie(self, movieid):
        """Return the sum of all ratings given to `movieid`."""
        return self.df.loc[self.df["movieID"] == movieid, "rating"].sum()

    def nratings_for_movie(self, movieid):
        """Return how many ratings `movieid` received."""
        return int((self.df["movieID"] == movieid).sum())

    def average_rating(self):
        """Return the sum of all ratings divided by the number of rating rows."""
        return self.df["rating"].sum() / self.nratings()

    def movieid_user_has_rated(self, userid):
        """Return the set of movie ids rated by `userid`."""
        return set(self.df.loc[self.df["userID"] == userid, "movieID"])

    # used by ViewsPredictor
    def len_ratings_for_movieid(self, movieid):
        """Return how many ratings `movieid` received (same as nratings_for_movie)."""
        return self.nratings_for_movie(movieid)

    # used by STDPredictor
    def ratings_for_movieid(self, movieid):
        """Return the list of ratings given to `movieid`."""
        return list(self.df.loc[self.df["movieID"] == movieid, "rating"])

    def get_user_reviews(self, userID):
        """Return {movieID: rating} for `userID` (empty dict if unknown).

        The per-user cache is built on the first call because filtering the
        DataFrame for every lookup was too slow.
        """
        if not self.users_reviews:
            self.init_user_reviews()

        return self.users_reviews.get(userID, dict())

    def init_user_reviews(self):
        """Fill `users_reviews` with {userID: {movieID: rating}}."""
        for userID, user_df in self.df.groupby("userID", sort=False):
            self.users_reviews[userID] = dict(
                zip(user_df["movieID"].to_list(), user_df["rating"].to_list())
            )

    def get_user_reviews_as_list(self, userID):
        """Return [(movieID, rating), ...] for `userID` (empty list if unknown)."""
        if not self.users_reviews_list:
            self.init_user_reviews_as_list()

        return self.users_reviews_list.get(userID, [])

    def init_user_reviews_as_list(self):
        """Fill `users_reviews_list` with {userID: [(movieID, rating), ...]}."""
        for userID, user_df in self.df.groupby("userID", sort=False):
            self.users_reviews_list[userID] = list(
                zip(user_df["movieID"].to_list(), user_df["rating"].to_list())
            )

    def init_movie_reviews(self):
        """Fill `movie_reviews` with {movieID: {userID: rating}}."""
        for movieID, movie_df in self.df.groupby("movieID", sort=False):
            self.movie_reviews[movieID] = dict(
                zip(movie_df["userID"].to_list(), movie_df["rating"].to_list())
            )

    def get_review_for_user(self, movieID, userID):
        """Return the rating `userID` gave to `movieID`, or None if there is none."""
        return self.get_user_reviews(userID).get(movieID)

    def get_movie_reviews(self, movieID):
        """Return {userID: rating} for `movieID` (empty dict if unknown).

        The per-movie cache is built on the first call.
        """
        if not self.movie_reviews:
            self.init_movie_reviews()

        return self.movie_reviews.get(movieID, dict())

    def most_popular_shows(self, n=10):
        """Return the `n` most-rated movies as [(movieID, rating_count), ...].

        The list is sorted by rating count in descending order; movies with the
        same count keep their order of first appearance in the data. Passing
        n=None returns every movie.
        """
        # One grouped pass instead of rescanning the whole frame per movie.
        counts = self.df.groupby("movieID", sort=False).size()
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]

In [58]:
class MovieData:
    """Lookup table of movie metadata read from a tab-separated file."""

    def __init__(self, path):
        """Read the movie file at `path` into `self.df`."""
        self.df = pd.read_csv(path, sep="\t", encoding="latin1")

    def get_title(self, movieID):
        """Return the title of the first row whose `id` equals `movieID`."""
        matching_titles = self.df.loc[self.df['id'] == movieID, "title"]
        return matching_titles.iloc[0]

In [59]:
class RandomPredictor:
    """Baseline predictor that assigns every movie a random integer rating."""

    def __init__(self, minRating, maxRating):
        """Store the inclusive integer bounds used for the random ratings."""
        self.minRating, self.maxRating = minRating, maxRating
        self.items = None  # movie ids, set by fit()

    def fit(self, X):
        """Remember the movie ids exposed by `X.get_movie_ids()`."""
        self.items = X.get_movie_ids()

    def predict(self, userid):
        """Return {movieID: random rating}; `userid` is not used."""
        return {
            movie: random.randint(self.minRating, self.maxRating)
            for movie in self.items
        }

In [60]:
# Smoke test: fit a RandomPredictor on the unfiltered ratings file and print
# its predictions for user 78 on a handful of movie ids.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)  # random integer ratings from 1 to 5
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 3
Film: Grumpy Old Men, ocena: 5
Film: Money Train, ocena: 5
Film: The Usual Suspects, ocena: 4
Film: City Hall, ocena: 1


In [61]:
# Number of ratings without any filtering ...
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

# ... and after restricting to a date window (start inclusive, end exclusive)
# and to movies with at least 100 ratings inside that window.
uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855620
73584


In [62]:
# Repeat of the earlier RandomPredictor smoke test. The printed ratings differ
# between runs because no random seed is set.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))


<class 'dict'>
Film: Toy story, ocena: 4
Film: Grumpy Old Men, ocena: 1
Film: Money Train, ocena: 5
Film: The Usual Suspects, ocena: 5
Film: City Hall, ocena: 3


In [None]:
class Recommender:
    """Turns a fitted predictor into top-N recommendations and evaluates it.

    The predictor must offer `fit(X)` and `predict(userID)`, where `predict`
    returns a {movieID: score} dictionary.
    """

    def __init__(self, predictor):
        """Store the predictor; the training data is attached by fit()."""
        self.predictor = predictor
        self.uim = None

    def fit(self, X):
        """Remember the training data `X` and fit the predictor on it."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the `n` best-scored movies for `userID`.

        Args:
            userID: User to recommend for.
            n: Maximum number of recommendations.
            rec_seen: If False, movies the user rated in the training data are
                left out.

        Returns:
            A list of (movieID, score) tuples sorted by score, best first.
        """
        pred = self.predictor.predict(userID)

        if not rec_seen:
            seen = self.uim.movieid_user_has_rated(userID)
            # Build a filtered dict instead of popping from `pred`: several
            # predictors return their internal dictionary, and popping from it
            # would silently delete those movies from the fitted model.
            pred = {movieid: score for movieid, score in pred.items() if movieid not in seen}

        return sorted(pred.items(), key=lambda x: x[1], reverse=True)[:n]

    def evaluate(self, test_data, top_n):
        """Evaluate the predictor against `test_data` and print a report.

        Only users present in both the training and the test data are used.
        MSE/RMSE/MAE are computed over every test rating for which the
        predictor produced a score. Precision/recall/F1 are averaged over the
        users that have at least one relevant test movie, where "relevant"
        means rated above that user's average test rating; the recommendations
        are the `top_n` best-scored movies not seen in the training data.

        Args:
            test_data: Object offering get_user_ids() and get_user_reviews().
            top_n: Number of recommendations per user for precision/recall/F1.

        Returns:
            (mse, rmse, mae, avg_precision, avg_recall, avg_f1); each metric is
            0.0 when there was nothing to measure.
        """
        userIDs = set(test_data.get_user_ids()) & set(self.uim.get_user_ids())

        count = 0
        sum_sq = 0.0
        sum_abs = 0.0

        sum_precision = 0.0
        sum_recall = 0.0
        sum_f1 = 0.0
        n_users = 0

        for user in userIDs:
            test_reviews = test_data.get_user_reviews(user)  # {movieID: true_rating}
            if not test_reviews:
                continue

            pred = self.predictor.predict(user)  # {movieID: predicted_rating}

            # Rating-prediction errors.
            for movieID, true_rating in test_reviews.items():
                if movieID not in pred:
                    continue

                error = pred[movieID] - true_rating
                sum_sq += error ** 2
                sum_abs += abs(error)
                count += 1

            # Precision, recall, F1.
            user_avg = sum(test_reviews.values()) / len(test_reviews)
            relevant = {
                movieID
                for movieID, true_rating in test_reviews.items()
                if true_rating > user_avg
            }

            # Without relevant movies recall/F1 are undefined for this user.
            if not relevant:
                continue

            seen = self.uim.movieid_user_has_rated(user)
            candidates = [
                (movieID, pred_rating)
                for movieID, pred_rating in pred.items()
                if movieID not in seen
            ]
            best = sorted(candidates, key=lambda x: x[1], reverse=True)[:top_n]
            recommends = {movieID for movieID, _ in best}

            tp = len(recommends & relevant)
            precision = tp / len(recommends) if recommends else 0.0
            recall = tp / len(relevant)

            if (precision + recall) > 0:
                f1 = (2 * precision * recall) / (precision + recall)
            else:
                f1 = 0.0

            sum_precision += precision
            sum_recall += recall
            sum_f1 += f1
            n_users += 1

        if count == 0:
            mse = 0.0
            mae = 0.0
            rmse = 0.0
        else:
            mse = sum_sq / count
            mae = sum_abs / count
            rmse = mse ** 0.5

        if n_users == 0:
            avg_precision = 0.0
            avg_recall = 0.0
            avg_f1 = 0.0
        else:
            avg_precision = sum_precision / n_users
            avg_recall = sum_recall / n_users
            avg_f1 = sum_f1 / n_users

        print("===== Evaluation =====")
        # Report the value that was actually used, not a fixed 20.
        print(f"Top-N:      {top_n}")
        print(f"MSE:        {mse:.6f}")
        print(f"RMSE:       {rmse:.6f}")
        print(f"MAE:        {mae:.6f}")
        print(f"Precision:  {avg_precision:.6f}")
        print(f"Recall:     {avg_recall:.6f}")
        print(f"F1:         {avg_f1:.6f}")
        print("======================")
        return mse, rmse, mae, avg_precision, avg_recall, avg_f1

    def cross_validation(self):
        """Placeholder; not implemented, returns None."""
        return

    def splitter(self, test_data, top_n, ):
        """Placeholder; not implemented, returns None."""
        return

    def incremental_testing(self):
        """Placeholder; not implemented, returns None."""
        return

In [64]:
class AveragePredictor:
    """Predicts the same (optionally smoothed) average rating per movie for every user.

    For each movie the score is (vs + b * g_avg) / (n + b), where
      vs    is the sum of all ratings of the movie,
      n     is the number of ratings of the movie,
      g_avg is the value of X.average_rating(),
      b     is the smoothing parameter; b=0 gives the plain average.
    """

    def __init__(self, b):
        """Store the smoothing parameter `b`."""
        self.b = b
        self.items = None
        self.myDict = dict()  # {movieID: score}, filled by fit()

    def fit(self, X):
        """Compute the score of every movie in `X`.

        Scores from a previous fit are discarded so that refitting on other
        data does not keep stale movies.
        """
        self.items = X.get_movie_ids()
        g_avg = X.average_rating()

        scores = dict()
        for ele in self.items:
            n = X.nratings_for_movie(ele)
            vs = X.sum_ratings_for_movie(ele)
            scores[ele] = (vs + self.b * g_avg) / (n + self.b)

        self.myDict = scores

    def predict(self, user_id):
        """Return {movieID: score}; `user_id` is not used.

        A copy is returned so callers that modify the result (for example to
        drop already seen movies) cannot damage the fitted scores.
        """
        return dict(self.myDict)

In [65]:
# Train on ratings before 1.1.2008 (movies with at least 1000 ratings) using
# the plain per-movie average (b=0).
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
ap = AveragePredictor(b=0)
rec = Recommender(ap)
rec.fit(uim)

# Evaluate on ratings from 2.1.2008 onwards (movies with at least 200 ratings).
uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.evaluate(uim_test, 20)

# Top 5 recommendations for user 78, excluding movies the user already rated.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

===== Evaluation =====
Top-N:      20
MSE:        0.843488
RMSE:       0.918416
MAE:        0.676760
Precision:  0.094010
Recall:     0.128272
F1:         0.087932
Film: The Usual Suspects, ocena: 4.30189620758483
Film: The Lord of the Rings: The Return of the King, ocena: 4.121363258520366
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.093654434250764
Film: The Silence of the Lambs, ocena: 4.0639032815198615
Film: The Lord of the Rings: The Two Towers, ocena: 4.0432539682539685


In [104]:
# Same train/test split as the b=0 run, but with smoothing parameter b=100.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
ap = AveragePredictor(b=100)
rec = Recommender(ap)
rec.fit(uim)

# Evaluate on ratings from 2.1.2008 onwards (movies with at least 200 ratings).
uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.evaluate(uim_test, 20)

# Top 5 recommendations for user 78, excluding movies the user already rated.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

===== Evaluation =====
Top-N:      20
MSE:        0.841145
RMSE:       0.917140
MAE:        0.675273
Precision:  0.094010
Recall:     0.128272
F1:         0.087932
Film: The Usual Suspects, ocena: 4.267236644964127
Film: The Lord of the Rings: The Return of the King, ocena: 4.105905435725609
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.081317317294367
Film: The Silence of the Lambs, ocena: 4.052460081677638
Film: The Lord of the Rings: The Two Towers, ocena: 4.034187340257698


In [67]:
class ViewsPredictor:
    """Scores every movie by the number of ratings it received."""

    def __init__(self):
        self.items = None
        self.myDict = dict()  # {movieID: number of ratings}, filled by fit()

    def fit(self, X):
        """Count the ratings of every movie in `X`.

        Counts from a previous fit are discarded so that refitting on other
        data does not keep stale movies.
        """
        self.items = X.get_movie_ids()
        self.myDict = {ele: X.len_ratings_for_movieid(ele) for ele in self.items}

    def predict(self, user_id):
        """Return {movieID: number of ratings}; `user_id` is not used.

        A copy is returned so callers that modify the result cannot damage the
        fitted counts.
        """
        return dict(self.myDict)

In [68]:
# Recommend the most-rated movies (ViewsPredictor) to user 78, using ratings
# before 1.1.2008 of movies with at least 1000 ratings.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
ap = ViewsPredictor()
rec = Recommender(ap)
rec.fit(uim)

# Top 5, excluding movies the user already rated; the score is the rating count.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1308
Film: The Lord of the Rings: The Two Towers, ocena: 1260
Film: The Lord of the Rings: The Return of the King, ocena: 1203
Film: The Silence of the Lambs, ocena: 1158
Film: Shrek, ocena: 1145


In [69]:
class STDPredictor:
    """Scores movies by the standard deviation of their ratings.

    Only movies with at least `n` ratings receive a score.
    """

    def __init__(self, n):
        """Store `n`, the minimum number of ratings a movie needs."""
        self.items = None
        self.myDict = dict()  # {movieID: std of ratings}, filled by fit()
        self.n = n

    def fit(self, X):
        """Compute the rating standard deviation of every eligible movie in `X`.

        Scores from a previous fit are discarded so that refitting on other
        data does not keep stale movies.
        """
        self.items = X.get_movie_ids()

        scores = dict()
        for ele in self.items:
            if X.len_ratings_for_movieid(ele) >= self.n:
                # np.std with default arguments (population standard deviation).
                scores[ele] = np.std(X.ratings_for_movieid(ele))

        self.myDict = scores

    def predict(self, user_id):
        """Return {movieID: std of ratings}; `user_id` is not used.

        A copy is returned so callers that modify the result cannot damage the
        fitted scores.
        """
        return dict(self.myDict)

In [70]:
# Recommend the movies with the most divisive ratings (largest standard
# deviation, at least 100 ratings) to user 78.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = STDPredictor(100)
rec = Recommender(rp)
rec.fit(uim)

# Top 5, excluding movies the user already rated.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Titanic, ocena: 1.083712389320861
Film: The Lord of the Rings: The Two Towers, ocena: 0.9309035799601727
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 0.9186220450348818
Film: The Lord of the Rings: The Return of the King, ocena: 0.9148179639208237
Film: Shrek, ocena: 0.8565205807196439


In [100]:
class ItemBasedPredictor:
    """Item-based collaborative filtering with mean-centred cosine similarity.

    Similarities between movie pairs are computed in fit() from the ratings of
    users who rated both movies, after subtracting each user's average rating.
    Only strictly positive similarities are stored, as nested dictionaries
    {movieID: {otherMovieID: similarity}}.
    """

    def __init__(self,min_values=0, threshold=0):
        """
        Args:
            min_values: Pairs with fewer common raters than this get similarity 0.
            threshold: Similarities below this value are replaced by 0.
        """
        self.min_values = min_values
        self.threshold = threshold
        self.items = None           # the data object passed to fit()
        self.similarities = dict()  # {movieID: {movieID: similarity > 0}}
        self.user_avg = dict()      # {userID: average rating of that user}

    def fit(self, X):
        """Compute user averages and all pairwise movie similarities for `X`.

        State from a previous fit is discarded first, so refitting on other
        data does not keep stale users or movies.
        """
        self.items = X
        self.similarities = dict()
        self.user_avg = dict()
        movieIDs = X.get_movie_ids()

        for userID in X.get_user_ids():
            reviews = X.get_user_reviews(userID).values()
            self.user_avg[userID] = sum(reviews) / len(reviews)

        n_movies = len(movieIDs)
        for i in range(n_movies):
            p1 = movieIDs[i]
            # Every movie gets an entry, even if nothing is similar to it.
            self.similarities.setdefault(p1, dict())

            # The similarity is symmetric, so each pair is computed once.
            for j in range(i + 1, n_movies):
                p2 = movieIDs[j]
                sim = self.similarity(p1, p2)

                if sim > 0:
                    self.similarities[p1][p2] = sim
                    self.similarities.setdefault(p2, dict())[p1] = sim

    def predict(self, user_id):
        """Return {movieID: predicted rating} for every movie in the fitted data.

        For a movie the user has not rated, the prediction is the user's
        average plus the similarity-weighted mean of the user's deviations on
        the similar movies they did rate. Movies the user already rated, and
        movies without any similar rated movie, get the user's average.

        Raises:
            KeyError: if `user_id` was not part of the fitted data.
        """
        predictions = dict()
        movieIDs = list(self.items.get_movie_ids())
        rated = self.items.get_user_reviews(user_id)
        u_avg = self.user_avg[user_id]

        for movie1 in movieIDs:
            weighted_dev = 0  # numerator: sum of sim * (rating - user average)
            sim_total = 0     # denominator: sum of sim

            if movie1 in self.similarities and movie1 not in rated:
                for movie2, movie12sim in self.similarities[movie1].items():
                    if movie2 not in rated:
                        continue

                    weighted_dev += movie12sim * (rated[movie2] - u_avg)
                    sim_total += movie12sim

            if sim_total != 0:
                predictions[movie1] = (weighted_dev / sim_total) + u_avg
            else:
                predictions[movie1] = u_avg

        return predictions

    def similarity(self, p1, p2):
        """Return the mean-centred cosine similarity of movies `p1` and `p2`.

        Returns 0 when fewer than `min_values` users rated both movies, when
        either centred rating vector has zero length, or when the similarity
        is below `threshold`.
        """
        p1_reviews = self.items.get_movie_reviews(p1)
        p2_reviews = self.items.get_movie_reviews(p2)

        common_users = p1_reviews.keys() & p2_reviews.keys()

        if len(common_users) < self.min_values:
            return 0

        dot = 0
        sq1 = 0
        sq2 = 0
        for userID in common_users:
            u_avg = self.user_avg[userID]

            dev1 = p1_reviews[userID] - u_avg
            dev2 = p2_reviews[userID] - u_avg

            dot += dev1 * dev2
            sq1 += dev1 * dev1
            sq2 += dev2 * dev2

        norm = (sq1**0.5) * (sq2**0.5)

        if norm <= 0:
            return 0

        sim = dot / norm

        if sim < self.threshold:
            return 0

        return sim

    def get_20_most_similar_movies(self):
        """Return the 20 highest-similarity entries as (movie1, movie2, sim).

        Because similarities are stored in both directions, every pair appears
        twice, once as (a, b, sim) and once as (b, a, sim).
        """
        pairs = [
            (movie1, movie2, sim)
            for movie1, neighbours in self.similarities.items()
            for movie2, sim in neighbours.items()
        ]
        pairs.sort(key=lambda x: x[2], reverse=True)
        return pairs[:20]

    def similarItems(self, item, n):
        """Return the `n` movies most similar to `item` as (movieID, sim).

        Raises:
            KeyError: if `item` was not part of the fitted data.
        """
        pairs = list(self.similarities[item].items())
        pairs.sort(key=lambda x: x[1], reverse=True)
        return pairs[:n]

Tu sem razmišljal še o implementaciji z 2D matriko podobnosti, vendar sem ugotovil, da to ni najbolj optimalen pristop, ker matrika hrani tudi podobnosti z vrednostjo 0. Slovar slovarjev (dict v dict) teh vrednosti ne hrani, zato je iteriranje po njem hitrejše.

In [72]:
def fit(self, X):
    """Experimental fit that stores similarities in a dense 2D matrix.

    Written to be used as a method of a predictor that provides a `user_avg`
    dictionary and a `similarity(p1, p2)` method. After the call:
      self.user_avg  maps userID to that user's average rating,
      self.id2idx    maps movieID to its row/column index,
      self.simM      is the symmetric N x N similarity matrix, with 0 where
                     the similarity is not positive (diagonal included).
    The matrix is printed at the end for inspection.
    """
    self.items = X
    movieIDs = X.get_movie_ids()

    for userID in X.get_user_ids():
        reviews = X.get_user_reviews(userID).values()
        self.user_avg[userID] = sum(reviews) / len(reviews)

    # Build the index map and the matrix once, after all user averages are
    # known (similarity() relies on them).
    N = len(movieIDs)
    id2idx = {movieIDs[i]: i for i in range(N)}  # movieID -> matrix index
    self.simM = np.zeros((N, N), dtype=float)

    for i in range(N):
        p1 = movieIDs[i]

        for j in range(i + 1, N):
            p2 = movieIDs[j]
            sim = self.similarity(p1, p2)

            if sim > 0:
                # Similarity is symmetric: fill both halves of the matrix.
                self.simM[i, j] = sim
                self.simM[j, i] = sim

    self.id2idx = id2idx

    print(self.simM)

In [90]:
# Fit the item-based predictor on ratings before 1.1.2008 (movies with at
# least 1000 ratings) ...
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

# ... and evaluate it on ratings from 2.1.2008 onwards (at least 200 ratings).
uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.evaluate(uim_test, 20)

===== Evaluation =====
Top-N:      20
MSE:        0.743294
RMSE:       0.862145
MAE:        0.640727
Precision:  0.092741
Recall:     0.125707
F1:         0.086231


In [91]:
# Fit the item-based predictor on all dates (movies with at least 1000 ratings)
# and print a few pairwise similarities. `rec`, `rp` and `md` from this cell
# are reused by the following cells.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))


Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.23395523176756636
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.4246612584468761


In [92]:
# Top 15 item-based recommendations for user 78, excluding already rated
# movies. Uses `rec` and `md` defined in an earlier cell.
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.3557347903101595
Film: The Usual Suspects, ocena: 4.354681728067836
Film: The Silence of the Lambs, ocena: 4.335305303472516
Film: Sin City, ocena: 4.278687166899101
Film: Monsters, Inc., ocena: 4.218174964881665
Film: The Incredibles, ocena: 4.206923580039978
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152687453736568
Film: Batman Begins, ocena: 4.146383470969743
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.071535242958551
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.015142490064839
Film: Good Will Hunting, ocena: 4.0092808069228205
Film: The Lord of the Rings: The Two Towers, ocena: 3.9412868735495064
Film: Indiana Jones and the Last Crusade, ocena: 3.796976496378924


In [93]:
# The 20 highest-similarity entries; each one is (movie1, movie2, similarity).
# Uses `rp` and `md` defined in an earlier cell.
rez = rp.get_20_most_similar_movies()

for i in rez:
    # Note: "Film1" prints the second id of the tuple and "Film2" the first.
    print(f"Film1: {md.get_title(i[1])}, Film2: {md.get_title(i[0])}, podobnost: {i[2]}")

Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8439842148481421
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481421
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8232125721053811
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8232125721053811
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8079374897442505
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442505
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381035
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381035
Film1: Star Wars: Episod

In [94]:
# The 10 movies most similar to movie 4993; the printed value is the
# similarity. Uses `rp` and `md` defined in an earlier cell.
rec_items = rp.similarItems(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8232125721053811
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442505
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.2397509860687844
Film: Star Wars, ocena: 0.21891117021560533
Film: The Matrix, ocena: 0.21526030820703024
Film: Raiders of the Lost Ark, ocena: 0.19944276706345027
Film: The Usual Suspects, ocena: 0.18321188451910747
Film: Blade Runner, ocena: 0.16399681315410272
Film: Schindler's List, ocena: 0.1610590513814871
Film: Monty Python and the Holy Grail, ocena: 0.15780453798519112


In [95]:
# Personal debugging printout to see which movies are in the collection.
# Uses `uim` and `md` defined in an earlier cell.
myItems = uim.most_popular_shows(100)
print(len(myItems))
for idmovie, val in myItems:
    print("movieID: {}, Film: {}, n_ocen: {}".format(idmovie,md.get_title(idmovie), val))

81
movieID: 2571, Film: The Matrix, n_ocen: 1671
movieID: 4993, Film: The Lord of the Rings: The Fellowship of the Ring, n_ocen: 1577
movieID: 356, Film: Forrest Gump, n_ocen: 1568
movieID: 296, Film: Pulp Fiction, n_ocen: 1538
movieID: 5952, Film: The Lord of the Rings: The Two Towers, n_ocen: 1529
movieID: 2858, Film: American Beauty, n_ocen: 1472
movieID: 7153, Film: The Lord of the Rings: The Return of the King, n_ocen: 1457
movieID: 480, Film: Jurassic Park, n_ocen: 1449
movieID: 318, Film: The Shawshank Redemption, n_ocen: 1441
movieID: 2959, Film: Fight Club, n_ocen: 1435
movieID: 593, Film: The Silence of the Lambs, n_ocen: 1431
movieID: 4306, Film: Shrek, n_ocen: 1405
movieID: 2762, Film: The Sixth Sense, n_ocen: 1395
movieID: 1270, Film: Back to the Future, n_ocen: 1378
movieID: 260, Film: Star Wars, n_ocen: 1366
movieID: 780, Film: Independence Day, n_ocen: 1334
movieID: 1580, Film: Men in Black, n_ocen: 1332
movieID: 6539, Film: Pirates of the Caribbean: The Curse of the Bl

In [96]:
# Recommendations for myself (userID 72000) - I added 22 ratings of my own.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
# Top 15, excluding the movies I already rated.
rec_items = rec.recommend(72000, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Forrest Gump, ocena: 4.034740396706799
Film: The Truman Show, ocena: 4.0
Film: Good Will Hunting, ocena: 3.983438108063542
Film: Catch Me If You Can, ocena: 3.9494513640204056
Film: The Sixth Sense, ocena: 3.8815922600753248
Film: The Lord of the Rings: The Return of the King, ocena: 3.863406889010001
Film: The Bourne Identity, ocena: 3.8365011815331016
Film: A Beautiful Mind, ocena: 3.8311711282478096
Film: The Fifth Element, ocena: 3.8238842672836784
Film: Twelve Monkeys, ocena: 3.8199353589590244
Film: Braveheart, ocena: 3.8025183011184
Film: Sin City, ocena: 3.8006577163952864
Film: Shichinin no samurai, ocena: 3.795596149491203
Film: Eternal Sunshine of the Spotless Mind, ocena: 3.7954659150362757
Film: Le fabuleux destin d'Amélie Poulain, ocena: 3.7843849158639498


In [97]:
# Slope one
class SlopeOnePredictor:
    """Weighted Slope One predictor.

    fit() collects, for every ordered movie pair (i, j), the sum of rating
    differences r_i - r_j over users who rated both and the number of such
    users. predict() estimates a rating for movie i as the count-weighted
    average of (user's rating of j + average difference i - j) over the
    movies j the user rated.
    """

    def __init__(self):
        self.items = None      # the data object passed to fit()
        self.dev = dict()      # {i: {j: average of (r_i - r_j)}}
        self.count = dict()    # {i: {j: number of users who rated both}}
        self.sumDiff = dict()  # {i: {j: sum of (r_i - r_j)}}

    def fit(self, X):
        """Collect pairwise rating differences from `X`.

        State from a previous fit is discarded first; otherwise a second call
        would add to the old sums and counts.
        """
        self.items = X
        self.dev = dict()
        self.count = dict()
        self.sumDiff = dict()

        for userID in self.items.get_user_ids():
            reviews = self.items.get_user_reviews_as_list(userID)
            n_reviews = len(reviews)

            for i in range(n_reviews):
                movieID1, review1 = reviews[i]

                # Every rated movie gets an entry, even without any pair.
                row_sum1 = self.sumDiff.setdefault(movieID1, dict())
                row_cnt1 = self.count.setdefault(movieID1, dict())

                for j in range(i + 1, n_reviews):
                    movieID2, review2 = reviews[j]
                    diff = review1 - review2

                    row_sum2 = self.sumDiff.setdefault(movieID2, dict())
                    row_cnt2 = self.count.setdefault(movieID2, dict())

                    # Store both directions: (i - j) and (j - i).
                    row_sum1[movieID2] = row_sum1.get(movieID2, 0) + diff
                    row_sum2[movieID1] = row_sum2.get(movieID1, 0) - diff

                    row_cnt1[movieID2] = row_cnt1.get(movieID2, 0) + 1
                    row_cnt2[movieID1] = row_cnt2.get(movieID1, 0) + 1

        for movie1, row in self.sumDiff.items():
            self.dev[movie1] = dict()
            for movie2, total in row.items():
                c = self.count[movie1].get(movie2, 0)
                if c > 0:
                    self.dev[movie1][movie2] = total / c

    def predict(self, userID):
        """Return {movieID: predicted rating} for `userID`.

        A movie is left out when no movie rated by the user shares a rater
        with it. Movies the user already rated are predicted as well, from
        the user's other ratings.
        """
        user_reviews = self.items.get_user_reviews(userID)
        movieIDs = list(self.items.get_movie_ids())
        results = dict()

        for movie1 in movieIDs:
            if movie1 not in self.dev:
                continue

            weighted_sum = 0.0
            weight_total = 0.0

            for movie2, user_rating in user_reviews.items():
                if movie1 == movie2:
                    continue

                if movie2 in self.dev[movie1]:
                    weight = self.count[movie1][movie2]
                    weighted_sum += (user_rating + self.dev[movie1][movie2]) * weight
                    weight_total += weight

            if weight_total > 0:
                results[movie1] = weighted_sum / weight_total

        return results

    def print_sumDiff(self):
        """Print the raw sums of rating differences (debugging aid)."""
        print(self.sumDiff)

    def print_count(self):
        """Print the pair counts (debugging aid)."""
        print(self.count)


In [98]:
# Fit Slope One on all dates (movies with at least 1000 ratings).
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

# Top 15 recommendations for user 78, excluding already rated movies.
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: The Usual Suspects, ocena: 4.325079182263173
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.155288741350693
Film: The Lord of the Rings: The Return of the King, ocena: 4.153135076202185
Film: The Silence of the Lambs, ocena: 4.127978169643881
Film: Shichinin no samurai, ocena: 4.119790444913598
Film: The Lord of the Rings: The Two Towers, ocena: 4.083338910827031
Film: Indiana Jones and the Last Crusade, ocena: 3.9670398355464194
Film: The Incredibles, ocena: 3.966502324271024
Film: Good Will Hunting, ocena: 3.963362387354114
Film: Sin City, ocena: 3.942619137615212
Film: Batman Begins, ocena: 3.9374415662981908
Film: A Beautiful Mind, ocena: 3.9140940935239508
Film: Rain Man, ocena: 3.9107819079644943
Film: Monsters, Inc., ocena: 3.882018726266052
Film: Finding Nemo, ocena: 3.8803990323194433


In [99]:
# Fit Slope One on ratings before 1.1.2008 (movies with at least 1000 ratings) ...
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

# ... and evaluate it on ratings from 2.1.2008 onwards (at least 200 ratings).
uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, rmse, mae, precision, recall, f = rec.evaluate(uim_test, 20)

===== Evaluation =====
Top-N:      20
MSE:        0.727076
RMSE:       0.852688
MAE:        0.629658
Precision:  0.093935
Recall:     0.127899
F1:         0.087807
