## Recommendation engine with LLM + RAG

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import random

from sentence_transformers import SentenceTransformer
import faiss




In [2]:
# Ad-hoc look at the raw ratings file (personal sanity check of the data).
df = pd.read_csv("data/user_ratedmovies.dat", sep="\t")

# Keep only the first row seen for every movieID and show it as the cell output.
is_repeat_movie = df.duplicated(subset=["movieID"], keep="first")
df_unique_movieID = df[~is_repeat_movie]
df_unique_movieID

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,72000,2571,4.0,20,6,2019,19,17,16
1,72000,4993,4.0,20,6,2019,19,17,16
2,72000,296,3.5,20,6,2019,19,17,16
3,72000,5952,4.0,20,6,2019,19,17,16
4,72000,480,3.0,20,6,2019,19,17,16
...,...,...,...,...,...,...,...,...,...
850668,71331,62796,3.0,19,11,2008,23,10,40
851719,71420,3585,4.0,22,11,2007,22,43,7
852677,71420,59065,1.5,30,10,2008,15,41,38
853205,71483,4421,2.5,2,9,2003,22,18,25


Dodajanje stolpcev `date`, ki ga sestavimo iz stolpcev date_day, date_month in date_year, ter `ratings_count`, v katerem preštejemo število ocen za posamezen film (movieID).

Tu sem imel težave, ker moram najprej sortirati po datumu in šele potem po ratingih.

In [3]:
class UserItemData:
    """Table of user ratings with lazily built per-user / per-movie lookups.

    ``self.df`` holds one row per rating and is expected to contain the
    columns ``userID``, ``movieID`` and ``rating`` (plus ``date`` for
    :meth:`split_data_for_increment`).  The three ``*_reviews`` dictionaries
    are caches that are filled on first use by the ``init_*`` methods.
    """

    def __init__(self, path, start_date=None, end_date=None, min_ratings=None):
        """Load the ratings file at ``path`` and apply the optional filters.

        Args:
            path: Tab-separated ratings file with ``date_day``, ``date_month``
                and ``date_year`` columns.
            start_date: Inclusive lower bound, parsed day-first
                (e.g. ``"01.02.2020"``); falsy means no lower bound.
            end_date: Exclusive upper bound, parsed day-first; falsy means no
                upper bound.
            min_ratings: Keep only movies with at least this many ratings
                inside the date window; ``None`` disables the filter.
        """
        self.path = path
        self.start_date = pd.to_datetime(start_date, dayfirst=True) if start_date else None
        self.end_date = pd.to_datetime(end_date, dayfirst=True) if end_date else None
        self.min_ratings = min_ratings
        self.df = self.init_df()
        self.users_reviews = dict()
        self.movie_reviews = dict()
        self.users_reviews_list = dict()

    def init_df(self):
        """Read the ratings file and return the filtered DataFrame.

        Adds a ``date`` column (built from day/month/year) and a
        ``ratings_count`` column (number of ratings per movieID).  The date
        window is applied first, so ``ratings_count`` and the ``min_ratings``
        filter only consider ratings inside that window.
        """
        df = pd.read_csv(self.path, sep="\t", encoding="latin1")

        date_text = (
            df["date_day"].astype(str)
            + "." + df["date_month"].astype(str)
            + "." + df["date_year"].astype(str)
        )
        # An explicit format fully determines the parse, so dayfirst is not needed.
        df["date"] = pd.to_datetime(date_text, format="%d.%m.%Y")

        if self.start_date is not None:
            df = df[df["date"] >= self.start_date]

        if self.end_date is not None:
            df = df[df["date"] < self.end_date]

        # The filters above may return a slice of the original frame; copy it
        # so that adding a column does not trigger SettingWithCopyWarning.
        df = df.copy()
        df["ratings_count"] = df.groupby("movieID")["rating"].transform("count")

        if self.min_ratings is not None:
            df = df[df["ratings_count"] >= self.min_ratings]

        return df

    def nratings(self):
        """Return the number of rating rows."""
        return len(self.df)

    def nusers(self):
        """Return the number of distinct userID values."""
        return len(self.df["userID"].unique())

    def nitems(self):
        """Return the number of distinct movieID values."""
        return len(self.df["movieID"].unique())

    def get_movie_ids(self):
        """Return the distinct movieID values in order of first appearance."""
        return self.df["movieID"].unique()

    def get_user_ids(self):
        """Return the distinct userID values in order of first appearance."""
        return self.df["userID"].unique()

    def sum_ratings_for_movie(self, movieid):
        """Return the sum of all ratings given to ``movieid``."""
        return sum(self.df[self.df["movieID"] == movieid]["rating"])

    def nratings_for_movie(self, movieid):
        """Return how many ratings ``movieid`` has."""
        return self.df[self.df["movieID"] == movieid].shape[0]

    def average_rating(self):
        """Return the mean of all ratings (raises ZeroDivisionError if empty)."""
        return sum(self.df["rating"]) / self.nratings()

    def movieid_user_has_rated(self, userid):
        """Return the set of movieIDs rated by ``userid``."""
        return set(self.df[self.df["userID"] == userid]["movieID"])

    # Used by ViewsPredictor.
    def len_ratings_for_movieid(self, movieid):
        """Return how many ratings ``movieid`` has."""
        return len(self.df[self.df["movieID"] == movieid])

    # Used by STDPredictor.
    def ratings_for_movieid(self, movieid):
        """Return the list of ratings given to ``movieid``."""
        return list(self.df[self.df["movieID"] == movieid]["rating"])

    def get_user_reviews(self, userID):
        """Return ``{movieID: rating}`` for ``userID`` (empty dict if unknown)."""
        if not self.users_reviews:
            self.init_user_reviews()

        return self.users_reviews.get(userID, dict())

    def init_user_reviews(self):
        """Fill ``self.users_reviews`` as ``{userID: {movieID: rating}}``."""
        for userID, user_df in self.df.groupby("userID", sort=False):
            self.users_reviews[userID] = dict(
                zip(user_df["movieID"].to_list(), user_df["rating"].to_list())
            )

    def get_user_reviews_as_list(self, userID):
        """Return ``[(movieID, rating), ...]`` for ``userID`` (empty list if unknown)."""
        if not self.users_reviews_list:
            self.init_user_reviews_as_list()

        return self.users_reviews_list.get(userID, [])

    def init_user_reviews_as_list(self):
        """Fill ``self.users_reviews_list`` as ``{userID: [(movieID, rating), ...]}``."""
        for userID, user_df in self.df.groupby("userID", sort=False):
            self.users_reviews_list[userID] = list(
                zip(user_df["movieID"].to_list(), user_df["rating"].to_list())
            )

    def init_movie_reviews(self):
        """Fill ``self.movie_reviews`` as ``{movieID: {userID: rating}}``."""
        for movieID, movie_df in self.df.groupby("movieID", sort=False):
            self.movie_reviews[movieID] = dict(
                zip(movie_df["userID"].to_list(), movie_df["rating"].to_list())
            )

    def get_review_for_user(self, movieID, userID):
        """Return the rating ``userID`` gave ``movieID``, or ``None`` if absent."""
        return self.get_user_reviews(userID).get(movieID)

    def get_movie_reviews(self, movieID):
        """Return ``{userID: rating}`` for ``movieID`` (empty dict if unknown)."""
        if not self.movie_reviews:
            self.init_movie_reviews()

        return self.movie_reviews.get(movieID, dict())

    def most_popular_shows(self, n=10):
        """Return the ``n`` most rated movies as ``[(movieID, count), ...]``.

        The list is sorted by rating count, descending; movies with the same
        count keep their order of first appearance in ``self.df``.  Pass
        ``n=None`` to get every movie.
        """
        movie_col = self.df["movieID"]
        # One pass over the column instead of re-filtering the frame per movie.
        counts = movie_col.value_counts(sort=False).to_dict()

        most_pop = [(movie, int(counts.get(movie, 0))) for movie in movie_col.unique()]
        most_pop.sort(key=lambda x: x[1], reverse=True)  # stable: ties keep order

        if n is None:
            return most_pop
        return most_pop[:max(n, 0)]

    @staticmethod
    def _split_rows(frame, k):
        """Split ``frame`` row-wise into ``k`` nearly equal consecutive parts.

        Uses the same sizing rule as ``np.array_split`` (the first
        ``len(frame) % k`` parts get one extra row) without calling it on a
        DataFrame, which emits a FutureWarning on recent pandas versions.
        """
        if k <= 0:
            raise ValueError("number sections must be larger than 0.")

        base, extra = divmod(len(frame), k)
        parts = []
        start = 0
        for i in range(k):
            stop = start + base + (1 if i < extra else 0)
            parts.append(frame.iloc[start:stop].copy())
            start = stop
        return parts

    # k = number of splits
    def split_data_for_cross(self, k, seed):
        """Shuffle the rows with ``seed`` and return them as ``k`` folds."""
        shuffled_df = self.df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
        return self._split_rows(shuffled_df, k)

    def split_data_for_increment(self, k, seed=None):
        """Sort the rows by ``date`` and return ``k`` consecutive chunks.

        ``seed`` is accepted for signature parity with
        :meth:`split_data_for_cross` but is not used: the split is
        deterministic.
        """
        sorted_df = self.df.sort_values("date").reset_index(drop=True)
        return self._split_rows(sorted_df, k)

    # Lets a ready-made DataFrame be wrapped without running __init__ (no file I/O).
    @classmethod
    def from_df(cls, df):
        """Build an instance around an existing DataFrame, with empty caches."""
        obj = cls.__new__(cls)
        obj.df = df

        obj.users_reviews = {}
        obj.movie_reviews = {}
        obj.users_reviews_list = {}
        obj.path = None
        obj.start_date = None
        obj.end_date = None
        obj.min_ratings = None

        return obj

    # Methods for MF with NN
    def split_data(self):
        """Return a fixed-seed ``(train, test)`` split with 20% of rows in test.

        NOTE(review): ``train_test_split`` is not imported in the visible part
        of the notebook (presumably ``sklearn.model_selection``) - confirm it
        is imported before this method is called.
        """
        train, test = train_test_split(self.df, test_size=0.2, random_state=42)
        return train, test

In [4]:
class MovieData:
    """Movie metadata table loaded from a tab-separated file."""

    def __init__(self, path):
        """Read the latin1-encoded, tab-separated file at ``path`` into ``self.df``."""
        self.df = pd.read_csv(path, sep="\t", encoding="latin1")

    def get_title(self, movieID):
        """Return the ``title`` of the first row whose ``id`` equals ``movieID``."""
        id_matches = self.df["id"] == movieID
        matching_rows = self.df.loc[id_matches]
        return matching_rows["title"].iloc[0]

In [5]:
class Recommender:
    """Wraps a predictor and provides recommendation and evaluation helpers.

    The predictor must offer ``fit(X)`` and ``predict(userID)``, where
    ``predict`` returns a ``{movieID: predicted_rating}`` mapping.
    """

    def __init__(self, predictor):
        self.predictor = predictor
        self.uim = None

    def fit(self, X):
        """Remember the training data ``X`` and fit the predictor on it."""
        self.uim = X
        self.predictor.fit(X)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the top ``n`` ``(movieID, predicted_rating)`` pairs for a user.

        Args:
            userID: User to recommend for.
            n: Maximum number of recommendations.
            rec_seen: When False, movies the user has already rated in the
                training data are excluded.
        """
        pred = self.predictor.predict(userID)

        if not rec_seen:
            seen = self.uim.movieid_user_has_rated(userID)
            # Build a filtered copy: the predictor may hand out a dict it
            # keeps using, so it must not be mutated here.
            pred = {movieid: score for movieid, score in pred.items() if movieid not in seen}

        return sorted(pred.items(), key=lambda x: x[1], reverse=True)[:n]

    @staticmethod
    def _print_report(header, footer, top_n, metrics):
        """Print the six metrics between ``header`` and ``footer`` lines."""
        mse, rmse, mae, precision, recall, f1 = metrics
        print(header)
        print(f"Top-N:      {top_n}")
        print(f"MSE:        {mse:.6f}")
        print(f"RMSE:       {rmse:.6f}")
        print(f"MAE:        {mae:.6f}")
        print(f"Precision:  {precision:.6f}")
        print(f"Recall:     {recall:.6f}")
        print(f"F1:         {f1:.6f}")
        print(footer)

    @staticmethod
    def _fit_and_score(predictor, train_df, test_df, top_n):
        """Fit a fresh ``predictor()`` on ``train_df`` and evaluate it on ``test_df``."""
        uim_test = UserItemData.from_df(test_df)
        uim_train = UserItemData.from_df(train_df)

        rec = Recommender(predictor())
        rec.fit(uim_train)
        return rec.evaluate(uim_test, top_n, False)

    @staticmethod
    def _mean_metrics(rows):
        """Return the element-wise mean of a non-empty list of 6-tuples."""
        totals = [0.0] * 6
        for row in rows:
            for idx, value in enumerate(row):
                totals[idx] += value
        return tuple(total / len(rows) for total in totals)

    def evaluate(self, test_data, top_n=20, printing=True):
        """Score the fitted predictor against ``test_data``.

        Rating errors (MSE / RMSE / MAE) are accumulated over every test
        rating for which a prediction exists.  Precision / recall / F1 are
        computed per user on the ``top_n`` highest predictions among movies
        the user has not rated in the training data, then averaged over the
        users that have at least one relevant movie.  A movie is relevant
        when its test rating is above that user's mean test rating.

        Returns:
            ``(mse, rmse, mae, avg_precision, avg_recall, avg_f1)``; a group
            of metrics is 0.0 when nothing could be scored for it.
        """
        train_data = self.uim
        # Sorted so users are always processed in the same order.
        userIDs = sorted(set(test_data.get_user_ids()) & set(train_data.get_user_ids()))

        n_scored = 0
        sq_err_sum = 0.0
        abs_err_sum = 0.0

        sum_precision = 0.0
        sum_recall = 0.0
        sum_f1 = 0.0
        n_users = 0

        for user in userIDs:
            test_reviews = test_data.get_user_reviews(user)  # {movieID: true_rating}
            if not test_reviews:
                continue

            pred = self.predictor.predict(user)  # {movieID: predicted_rating}

            # --- rating error ---
            for movieID, true_rating in test_reviews.items():
                if movieID not in pred:
                    continue

                error = pred[movieID] - true_rating
                sq_err_sum += error ** 2
                abs_err_sum += abs(error)
                n_scored += 1

            # --- precision, recall, F1 ---
            seen = train_data.movieid_user_has_rated(user)
            user_avg = sum(test_reviews.values()) / len(test_reviews)

            relevant = {
                movieID
                for movieID, true_rating in test_reviews.items()
                if true_rating > user_avg
            }

            # Without relevant movies recall/F1 are undefined for this user.
            if not relevant:
                continue

            candidates = [
                (movieID, pred_rating)
                for movieID, pred_rating in pred.items()
                if movieID not in seen
            ]
            # Equal predictions are ordered by movieID so the cut is deterministic.
            candidates.sort(key=lambda x: (-x[1], x[0]))
            recommends = {movieID for movieID, _ in candidates[:top_n]}

            tp = len(recommends & relevant)
            precision = tp / len(recommends) if recommends else 0.0
            recall = tp / len(relevant)

            if (precision + recall) > 0:
                f1 = (2 * precision * recall) / (precision + recall)
            else:
                f1 = 0.0

            sum_precision += precision
            sum_recall += recall
            sum_f1 += f1
            n_users += 1

        if n_scored == 0:
            mse = 0.0
            mae = 0.0
            rmse = 0.0
        else:
            mse = sq_err_sum / n_scored
            mae = abs_err_sum / n_scored
            rmse = mse ** 0.5

        if n_users == 0:
            avg_precision = 0.0
            avg_recall = 0.0
            avg_f1 = 0.0
        else:
            avg_precision = sum_precision / n_users
            avg_recall = sum_recall / n_users
            avg_f1 = sum_f1 / n_users

        metrics = (mse, rmse, mae, avg_precision, avg_recall, avg_f1)
        if printing:
            self._print_report(
                "===== Evaluation =====", "======================", top_n, metrics
            )
        return metrics

    def incremental_testing(self, uim_full, predictor, top_n=20, k=10, seed=42, printing=True):
        """Evaluate on date-ordered chunks, training on all earlier chunks.

        ``uim_full`` is split into ``k`` chunks by
        ``split_data_for_increment``; chunk ``i`` (for ``i >= 1``) is tested
        with a fresh ``predictor()`` fitted on chunks ``0..i-1``.

        Returns:
            The mean ``(mse, rmse, mae, precision, recall, f1)`` over all
            rounds, or six zeros when there was no round to run.
        """
        dfs = uim_full.split_data_for_increment(k, seed)

        rounds = [
            self._fit_and_score(predictor, pd.concat(dfs[:i]), dfs[i], top_n)
            for i in range(1, len(dfs))
        ]

        if not rounds:
            return (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

        averages = self._mean_metrics(rounds)

        if printing:
            self._print_report(
                "== INCREMENT TESTING ==", "=======================", top_n, averages
            )

        return averages

    def cross_validation(self, uim_full, predictor, top_n=20, k=10, seed=42, printing=True):
        """Run ``k``-fold cross validation with a fresh ``predictor()`` per fold.

        ``uim_full`` is split into ``k`` shuffled folds by
        ``split_data_for_cross``; each fold is tested once with a model
        fitted on the remaining folds.

        Returns:
            The mean ``(mse, rmse, mae, precision, recall, f1)`` over the folds.

        Raises:
            ValueError: If ``k`` is smaller than 2 (no training folds would remain).
        """
        if k < 2:
            raise ValueError("cross_validation needs k >= 2 so that every fold has training data")

        dfs = uim_full.split_data_for_cross(k, seed)

        rounds = []
        for i in range(k):
            train_df = pd.concat([dfs[j] for j in range(k) if j != i])
            rounds.append(self._fit_and_score(predictor, train_df, dfs[i], top_n))

        averages = self._mean_metrics(rounds)

        if printing:
            self._print_report(
                "== CROSS VALIDATION ==", "======================", top_n, averages
            )

        return averages
