## Priporočilni sistem

Za to seminarsko nalogo sem se odločil za IMDb dataset, ker sem navajen gledati ocene (review-e) na njihovi strani in ker sem želel delati z malo večjo zbirko podatkov.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import random

In [3]:
# Ad-hoc exploration of the raw ratings file (personal testing only).
# The file is tab-separated; `df` stays a notebook-global and is reused by a later cell.
df = pd.read_csv("data/user_ratedmovies.dat", sep="\t")

# One row per distinct movieID (drop_duplicates keeps the first occurrence by default).
df_unique_movieID = df.drop_duplicates(subset=["movieID"])
# Last expression of the cell, so the notebook displays the frame.
df_unique_movieID

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30
...,...,...,...,...,...,...,...,...,...
850645,71331,62796,3.0,19,11,2008,23,10,40
851696,71420,3585,4.0,22,11,2007,22,43,7
852654,71420,59065,1.5,30,10,2008,15,41,38
853182,71483,4421,2.5,2,9,2003,22,18,25


Dodajanje stolpcev `date`, ki ga sestavimo iz podatkov o dnevu, mesecu in letu, ter `ratings_count`, kjer preštejemo število ocen za posamezen film (movieID).

Tu sem imel težave, ker moram najprej filtrirati po datumu in šele nato po številu ocen (min_ratings).

In [4]:
class UserItemData:
    """User/movie ratings loaded from a tab-separated file, optionally filtered.

    The file must contain the columns ``userID``, ``movieID``, ``rating``,
    ``date_day``, ``date_month`` and ``date_year``.

    Args:
        path: Location of the ratings file (anything ``pd.read_csv`` accepts).
        start_date: Keep only ratings made on or after this date. Strings are
            parsed day-first (``"12.1.2007"`` is 12 January 2007).
        end_date: Keep only ratings made strictly before this date (same
            parsing as ``start_date``).
        min_ratings: Keep only movies that have at least this many ratings
            left after the date filter.

    Attributes:
        df: The filtered ratings with two extra columns, ``date`` and
            ``ratings_count``.
    """

    def __init__(self, path, start_date=None, end_date=None, min_ratings=None):
        self.path = path
        self.start_date = pd.to_datetime(start_date, dayfirst=True) if start_date else None
        self.end_date = pd.to_datetime(end_date, dayfirst=True) if end_date else None
        self.min_ratings = min_ratings
        self.df = self.init_df()

    def init_df(self):
        """Read the file, add ``date``/``ratings_count`` and apply the filters."""
        df = pd.read_csv(self.path, sep="\t", encoding="latin1")
        # The explicit format already fixes the day/month order.
        df["date"] = pd.to_datetime(
            df["date_day"].astype(str)
            + "."
            + df["date_month"].astype(str)
            + "."
            + df["date_year"].astype(str),
            format="%d.%m.%Y",
        )

        # The date window must be applied first: min_ratings counts only the
        # ratings that fall inside the window.
        if self.start_date is not None:
            df = df[df["date"] >= self.start_date]

        if self.end_date is not None:
            df = df[df["date"] < self.end_date]

        # assign() builds a new frame instead of writing into a filtered
        # slice, which avoids pandas' chained-assignment problem.
        df = df.assign(
            ratings_count=df.groupby("movieID")["rating"].transform("count")
        )

        if self.min_ratings is not None:
            df = df[df["ratings_count"] >= self.min_ratings]

        return df

    def _movie_mask(self, movieid):
        """Boolean mask selecting the rows that rate ``movieid``."""
        return self.df["movieID"] == movieid

    def nratings(self):
        """Return the total number of ratings."""
        return len(self.df)

    def get_movie_ids(self):
        """Return the distinct movie ids, in order of first appearance."""
        return self.df["movieID"].unique()

    def get_user_ids(self):
        """Return the distinct user ids, in order of first appearance."""
        return self.df["userID"].unique()

    def sum_ratings_for_movie(self, movieid):
        """Return the sum of all ratings given to ``movieid`` (0 if none)."""
        return float(self.df.loc[self._movie_mask(movieid), "rating"].sum())

    def nratings_for_movie(self, movieid):
        """Return how many ratings ``movieid`` received."""
        return int(self._movie_mask(movieid).sum())

    def average_rating(self):
        """Return the mean over all ratings.

        Raises:
            ZeroDivisionError: If there are no ratings at all.
        """
        return float(self.df["rating"].sum()) / self.nratings()

    def movieid_user_has_rated(self, userid):
        """Return the set of movie ids rated by ``userid``."""
        return set(self.df.loc[self.df["userID"] == userid, "movieID"])

    # Used by ViewsPredictor.
    def len_ratings_for_movieid(self, movieid):
        """Return how many ratings ``movieid`` received (same as nratings_for_movie)."""
        return self.nratings_for_movie(movieid)

    # Used by STDPredictor.
    def ratings_for_movieid(self, movieid):
        """Return the list of ratings given to ``movieid``."""
        return self.df.loc[self._movie_mask(movieid), "rating"].tolist()

    def get_user_reviews(self, userID):
        """Return ``{movieID: rating}`` for everything ``userID`` rated."""
        user_df = self.df[self.df["userID"] == userID]
        # zip over the two columns replaces a slow row-by-row iterrows() loop;
        # on duplicate keys the last row wins, as before.
        return dict(zip(user_df["movieID"], user_df["rating"]))

    def get_movie_reviews(self, movieID):
        """Return ``{userID: rating}`` for everyone who rated ``movieID``."""
        movie_df = self.df[self._movie_mask(movieID)]
        return dict(zip(movie_df["userID"], movie_df["rating"]))

In [5]:
class MovieData:
    """Movie metadata table read from a tab-separated file.

    The file is expected to provide at least the columns ``id`` and ``title``.
    """

    def __init__(self, path):
        self.df = pd.read_csv(path, sep="\t", encoding="latin1")

    def get_title(self, movieID):
        """Return the title of the first row whose ``id`` equals ``movieID``.

        Raises:
            IndexError: If no row has that id.
        """
        titles = self.df["title"]
        matching = titles[self.df["id"] == movieID]
        return matching.iloc[0]

In [6]:
class RandomPredictor:
    """Baseline predictor that scores every known movie with a random integer.

    Args:
        minRating: Lowest score that may be drawn (inclusive).
        maxRating: Highest score that may be drawn (inclusive).
    """

    def __init__(self, minRating, maxRating):
        self.minRating = minRating
        self.maxRating = maxRating
        self.items = None

    def fit(self, X):
        """Remember the movie ids available in ``X``."""
        self.items = X.get_movie_ids()

    def predict(self, userid):
        """Return ``{movieID: random score}``; ``userid`` is not used."""
        return {
            movie: random.randint(self.minRating, self.maxRating)
            for movie in self.items
        }

In [8]:
# Demo: fit a RandomPredictor (scores 1..5) on the unfiltered ratings and
# print the scores drawn for five fixed movie ids.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
# The user id is required by the predictor interface; RandomPredictor ignores it.
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 4
Film: Grumpy Old Men, ocena: 4
Film: Money Train, ocena: 4
Film: The Usual Suspects, ocena: 2
Film: City Hall, ocena: 5


In [7]:
# Sanity check: number of ratings in the whole file ...
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

# ... versus ratings inside a date window (dates are parsed day-first, end date
# exclusive) for movies that keep at least 100 ratings inside that window.
uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
73584


In [8]:
# Demo (re-run of the earlier cell): fit a RandomPredictor (scores 1..5) on the
# unfiltered ratings and print the scores drawn for five fixed movie ids.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
# Scores are random, so the printed values differ between runs.
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))


<class 'dict'>
Film: Toy story, ocena: 2
Film: Grumpy Old Men, ocena: 2
Film: Money Train, ocena: 4
Film: The Usual Suspects, ocena: 5
Film: City Hall, ocena: 3


In [9]:
class Recommender:
    """Turns a predictor's scores into a ranked top-n list for a user.

    Args:
        predictor: Object with ``fit(X)`` and ``predict(userID)``, where
            ``predict`` returns a ``{movieID: score}`` dict.
    """

    def __init__(self, predictor):
        self.predictor = predictor
        self.uim = None

    def fit(self, X):
        """Store the rating data ``X`` and fit the wrapped predictor on it."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the ``n`` best-scored movies for ``userID``.

        Args:
            userID: User to recommend for.
            n: Maximum number of recommendations.
            rec_seen: If False, movies the user has already rated are left out.

        Returns:
            List of ``(movieID, score)`` tuples, highest score first.
        """
        pred = self.predictor.predict(userID)

        if not rec_seen:
            seen = self.uim.movieid_user_has_rated(userID)
            # Filter into a new dict. Several predictors return their internal
            # score table, and removing entries from it in place would hide
            # those movies from every later call.
            pred = {
                movieid: score
                for movieid, score in pred.items()
                if movieid not in seen
            }

        return sorted(pred.items(), key=lambda x: x[1], reverse=True)[:n]

In [10]:
class AveragePredictor:
    """Scores every movie with its damped average rating; user-independent.

    For each movie the score is ``(vs + b * g_avg) / (n + b)`` where

    * ``vs`` is the sum of all ratings of the movie,
    * ``n`` is the number of ratings the movie received,
    * ``g_avg`` is the average over all ratings in the data,
    * ``b`` is the damping parameter; ``b = 0`` gives the plain average.

    Args:
        b: Damping parameter of the formula above.
    """

    def __init__(self, b):
        self.b = b
        self.items = None
        self.myDict = dict()

    def fit(self, X):
        """Compute the damped average of every movie in ``X``."""
        self.items = X.get_movie_ids()

        g_avg = X.average_rating()

        # Build a fresh table so a refit does not keep movies from older data.
        scores = dict()
        for movie in self.items:
            n = X.nratings_for_movie(movie)
            vs = X.sum_ratings_for_movie(movie)
            scores[movie] = (vs + self.b * g_avg) / (n + self.b)
        self.myDict = scores

    def predict(self, user_id):
        """Return ``{movieID: score}``; ``user_id`` is not used.

        A copy is returned so callers can filter it without altering the
        fitted scores.
        """
        return dict(self.myDict)

In [13]:
# Demo: top 5 by plain average (b=0) for user 78, skipping movies the user
# has already rated (rec_seen=False).
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(b=0)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Brother Minister: The Assassination of Malcolm X, ocena: 5.0
Film: Synthetic Pleasures, ocena: 5.0
Film: Adam & Steve, ocena: 5.0
Film: Gabbeh, ocena: 5.0
Film: Eve and the Fire Horse, ocena: 5.0


In [14]:
# Demo: same as the b=0 cell but with damping b=100, which pulls each movie's
# average towards the global average.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(b=100)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


In [15]:
class ViewsPredictor:
    """Scores every movie with the number of ratings it received; user-independent."""

    def __init__(self):
        self.items = None
        self.myDict = dict()

    def fit(self, X):
        """Count the ratings of every movie in ``X``."""
        self.items = X.get_movie_ids()

        # Rebuilt from scratch so a refit does not keep movies from older data.
        self.myDict = {
            movie: X.len_ratings_for_movieid(movie) for movie in self.items
        }

    def predict(self, user_id):
        """Return ``{movieID: number of ratings}``; ``user_id`` is not used.

        A copy is returned so callers can filter it without altering the
        fitted counts.
        """
        return dict(self.myDict)

In [16]:
# Demo: top 5 most-rated movies that user 78 has not rated yet.
# The printed "ocena" value is the number of ratings, not a rating.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = ViewsPredictor()
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


In [17]:
class STDPredictor:
    """Scores movies with the standard deviation of their ratings; user-independent.

    Args:
        n: Minimum number of ratings a movie needs in order to be scored.
    """

    def __init__(self, n):
        self.items = None
        self.myDict = dict()
        self.n = n

    def fit(self, X):
        """Compute the rating spread of every sufficiently rated movie in ``X``."""
        self.items = X.get_movie_ids()

        # Rebuilt from scratch so a refit does not keep movies from older data.
        spreads = dict()
        for movie in self.items:
            if X.len_ratings_for_movieid(movie) < self.n:
                continue
            # np.std with default arguments: population standard deviation.
            spreads[movie] = np.std(X.ratings_for_movieid(movie))
        self.myDict = spreads

    def predict(self, user_id):
        """Return ``{movieID: standard deviation}``; ``user_id`` is not used.

        A copy is returned so callers can filter it without altering the
        fitted values.
        """
        return dict(self.myDict)

In [18]:
# Demo: the 5 movies with the most disputed ratings (largest standard
# deviation) among movies with at least 100 ratings, skipping those user 78 rated.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = STDPredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Plan 9 from Outer Space, ocena: 1.3386525190884058
Film: The Passion of the Christ, ocena: 1.2790640858537328
Film: The Texas Chainsaw Massacre, ocena: 1.2306963889088796
Film: Jackass Number Two, ocena: 1.2134235427973514
Film: White Chicks, ocena: 1.184167557494643


In [19]:
# Number of distinct users in the raw ratings frame `df` loaded in the exploration cell.
len(df["userID"].unique())

2113

In [None]:
# Quick check of get_user_reviews: the {movieID: rating} dict of user 78.
uim = UserItemData('data/user_ratedmovies.dat')
test = uim.get_user_reviews(78)
# NOTE(review): the recorded output below is a single number (468), which looks
# like it came from an earlier `len(test)` version of this cell - re-run to confirm.
test

468

In [None]:
class ItemBasedPredictor:
    """Item-based collaborative filtering on user-mean-centred ratings.

    Two movies are compared with the cosine of their rating vectors over the
    users who rated both, after subtracting each user's average rating.

    Args:
        min_values: Minimum number of common raters; pairs with fewer get
            similarity 0.
        threshold: Similarities below this value are reported as 0.

    Attributes:
        similarities: ``{movieID: {other movieID: similarity}}`` holding only
            the strictly positive similarities, stored in both directions.
        user_avg: ``{userID: average rating of that user}``.
        movie_reviews: ``{movieID: {userID: rating}}`` cached by ``fit``.
    """

    def __init__(self, min_values=0, threshold=0):
        self.min_values = min_values
        self.threshold = threshold
        self.items = None
        self.similarities = dict()
        self.user_avg = dict()
        self.movie_reviews = dict()

    def fit(self, X):
        """Compute user averages and all pairwise movie similarities for ``X``."""
        self.items = X
        movieIDs = X.get_movie_ids()

        # Start from empty tables so a refit does not keep entries from older data.
        self.similarities = dict()
        self.user_avg = dict()

        for userID in X.get_user_ids():
            reviews = X.get_user_reviews(userID).values()
            self.user_avg[userID] = sum(reviews) / len(reviews)

        # Fetch every movie's ratings once; similarity() would otherwise query
        # the data twice for each of the O(n^2) movie pairs.
        self.movie_reviews = {
            movieID: X.get_movie_reviews(movieID) for movieID in movieIDs
        }

        for i, p1 in enumerate(movieIDs):
            self.similarities.setdefault(p1, dict())

            for p2 in movieIDs[i + 1:]:
                sim = self.similarity(p1, p2)

                if sim > 0:
                    # Similarity is symmetric, so store both directions.
                    self.similarities[p1][p2] = sim
                    self.similarities.setdefault(p2, dict())[p1] = sim

    def predict(self, user_id):
        """Return ``{movieID: predicted rating}`` for every known movie.

        The prediction is the user's average plus the similarity-weighted mean
        of the user's centred ratings of similar movies. Movies the user has
        already rated, and movies without any rated similar movie, are
        predicted at the user's average.

        Raises:
            KeyError: If ``user_id`` was not present in the fitted data.
        """
        predictions = dict()
        movieIDs = list(self.items.get_movie_ids())
        rated = self.items.get_user_reviews(user_id)
        u_avg = self.user_avg[user_id]

        for movie1 in movieIDs:
            numerator = 0
            denominator = 0

            if movie1 in self.similarities and movie1 not in rated:
                for movie2, sim in self.similarities[movie1].items():
                    if movie2 not in rated:
                        continue

                    numerator += sim * (rated[movie2] - u_avg)
                    denominator += sim

            if denominator != 0:
                predictions[movie1] = (numerator / denominator) + u_avg
            else:
                predictions[movie1] = u_avg

        return predictions

    def _reviews_for(self, movieID):
        """Return ``{userID: rating}`` for a movie, preferring the cache from fit()."""
        cached = self.movie_reviews.get(movieID)
        if cached is None:
            cached = self.items.get_movie_reviews(movieID)
        return cached

    def similarity(self, p1, p2):
        """Return the similarity of movies ``p1`` and ``p2``.

        Returns 0 when the movies have fewer than ``min_values`` common
        raters, when the similarity is undefined (no common raters, or all
        centred ratings of one movie are zero), or when it falls below
        ``threshold``.
        """
        p1_reviews = self._reviews_for(p1)
        p2_reviews = self._reviews_for(p2)

        common_users = p1_reviews.keys() & p2_reviews.keys()

        if len(common_users) < self.min_values:
            return 0

        numerator = 0
        norm1 = 0
        norm2 = 0
        for userID in common_users:
            u_avg = self.user_avg[userID]

            first = p1_reviews[userID] - u_avg
            second = p2_reviews[userID] - u_avg

            numerator += first * second
            norm1 += first * first
            norm2 += second * second

        denominator = (norm1 ** 0.5) * (norm2 ** 0.5)
        if denominator == 0:
            # Undefined cosine; treat as "not similar" instead of dividing by zero.
            return 0

        sim = numerator / denominator

        if sim < self.threshold:
            return 0

        return sim

    def get_20_most_similar_movies(self, n=20, unique_pairs=False):
        """Return the ``n`` most similar movie pairs, best first.

        Args:
            n: Number of entries to return (20 by default).
            unique_pairs: Similarities are stored in both directions, so by
                default every pair shows up twice, as ``(a, b)`` and
                ``(b, a)``. Pass True to keep only one entry per pair.

        Returns:
            List of ``(movie1, movie2, similarity)`` tuples.
        """
        pairs = []
        seen = set()

        for movie1, neighbours in self.similarities.items():
            for movie2, sim in neighbours.items():
                if unique_pairs:
                    key = frozenset((movie1, movie2))
                    if key in seen:
                        continue
                    seen.add(key)
                pairs.append((movie1, movie2, sim))

        pairs.sort(key=lambda x: x[2], reverse=True)
        return pairs[:n]

Tu sem imel še idejo za implementacijo z 2D matriko podobnosti, vendar sem ugotovil, da to ni najbolj optimalen pristop, ker se shrani tudi informacija, da je podobnost 0. Pri uporabi slovarja v slovarju (dict v dict) se ničelne podobnosti ne shranijo, zato je iteriranje po njem hitrejše.

In [None]:
def fit(self, X):
    """Draft variant of fitting that stores similarities in a dense matrix.

    Written as a method body (takes ``self``); it appears to be an alternative
    to ``ItemBasedPredictor.fit`` and relies on ``self.user_avg`` and
    ``self.similarity`` being available.

    After the call:

    * ``self.user_avg`` maps every user id to that user's average rating,
    * ``self.id2idx`` maps every movie id to its row/column index,
    * ``self.simM`` is a symmetric N x N matrix holding the positive
      similarities (all other entries, including the diagonal, are 0).
    """
    self.items = X
    movieIDs = X.get_movie_ids()

    for userID in X.get_user_ids():
        reviews = X.get_user_reviews(userID).values()
        self.user_avg[userID] = sum(reviews) / len(reviews)

    # The matrix is built once, after all user averages are known,
    # because similarity() reads self.user_avg.
    N = len(movieIDs)
    id2idx = dict()  # movieID -> matrix index
    self.simM = np.zeros((N, N), dtype=float)

    for i in range(N):
        p1 = movieIDs[i]
        id2idx[p1] = i

        for j in range(i + 1, N):
            p2 = movieIDs[j]
            sim = self.similarity(p1, p2)

            if sim > 0:
                # Similarity is symmetric, so fill both halves.
                self.simM[i, j] = sim
                self.simM[j, i] = sim

    self.id2idx = id2idx

    print(self.simM)

In [61]:
# Restrict the data to movies with at least 1000 ratings and collect the set of
# movie ids user 78 has rated (`rated` is not used by the cells visible here).
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rated = uim.movieid_user_has_rated(78)

In [62]:
# Demo: fit the item-based predictor on movies with at least 1000 ratings and
# print the similarity of three movie pairs. `md`, `rec` and `rp` are reused by
# the following cells.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
# similarity() returns 0 when the computed value is below the threshold (default 0).
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.23395523176756636
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.4246612584468761


In [63]:
# Top 15 item-based recommendations for user 78, skipping movies the user has
# already rated. Uses `rec` and `md` from the previous cell.
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.3557347903101595
Film: The Usual Suspects, ocena: 4.354681728067836
Film: The Silence of the Lambs, ocena: 4.335305303472516
Film: Sin City, ocena: 4.278687166899101
Film: Monsters, Inc., ocena: 4.2175811369435205
Film: The Incredibles, ocena: 4.2070985832817485
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792107348347
Film: Batman Begins, ocena: 4.146413806700199
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.071535242958551
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.015142490064839
Film: Good Will Hunting, ocena: 4.0092808069228205
Film: The Lord of the Rings: The Two Towers, ocena: 3.9414763050955943
Film: Indiana Jones and the Last Crusade, ocena: 3.796976496378924


In [71]:
# Print the most similar movie pairs found by the fitted item-based predictor `rp`.
# Each entry is (movie1, movie2, similarity); similarities are stored in both
# directions, so every pair is listed twice.
rez = rp.get_20_most_similar_movies()

for i in rez:
    # Note: "Film1" prints the second id of the tuple and "Film2" the first.
    print(f"Film1: {md.get_title(i[1])}, Film2: {md.get_title(i[0])}, podobnost: {i[2]}")

Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8439842148481421
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481421
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8231885401761893
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761893
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8079374897442505
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442505
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381035
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381035
Film1: Star Wars: Episod