In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import collections
import random

from typing import Any, Dict, List, Tuple, Union
from pathlib import Path

from recommend.utils import PROJ_ROOT

# Utility functions

In [2]:
def creators2list(creators : Dict[str, List[Tuple[Any, str]]], top_n_actors : int = 3) -> List[str]:
    """
    Returns a flat list with the first `top_n_actors` actors, the first director
    and the first composer of a movie.

    Parameters
    ----------
    creators : Dict[str, List[Tuple[Any, str]]]
        Mapping from a (Czech) role name to a list of entries. Only the keys
        'Hrají' (cast), 'Režie' (direction) and 'Hudba' (music) are read, and the
        element at position 1 of each entry is taken as the person's name.
        # NOTE(review): position 0 is presumably a creator id -- confirm against the scraper.

    top_n_actors : int
        Maximum number of cast members to include.

    Returns
    -------
    List[str]
        Actor names followed by the director and the composer.
        Roles that are missing or empty are skipped.
    """
    names = []

    if 'Hrají' in creators:
        names.extend(person[1] for person in creators['Hrají'][:top_n_actors])

    # Only the first listed director / composer is used; an empty list is skipped
    # instead of raising IndexError.
    for role in ('Režie', 'Hudba'):
        if role in creators and creators[role]:
            names.append(creators[role][0][1])
    return names


def analyse_freq(list_of_lists : List[List[Any]]) -> List[Tuple[Any, int]]:
    """
    Counts how often each item occurs across all inner lists.

    Returns
    -------
    List[Tuple[Any, int]]
        Tuples <item, count> sorted by count in descending order.
    """
    counter = collections.Counter()
    for sublist in list_of_lists:
        # Counter.update adds one occurrence per element of the iterable
        counter.update(sublist)
    return counter.most_common(len(counter))


def get_value(df : pd.DataFrame, movie_id : str, col : str) -> Any:
    """
    Retrieves the value in column `col` for the movie with ID `movie_id`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'movie_id' column.
    movie_id : str
        ID of the movie to look up.
    col : str
        Name of the column to read.

    Returns
    -------
    Any
        The value of the first matching row.

    Raises
    ------
    IndexError
        If no row has the given `movie_id`.
    """
    # The original body referenced an undefined name `m_id` instead of `movie_id`.
    return df.loc[df['movie_id'] == movie_id, col].values[0]

def ids2df(ids : List[str], df : pd.DataFrame) -> pd.DataFrame:
    """
    Converts a list of movie ids to a dataframe while preserving the order of `ids`.

    A boolean `isin` mask would be faster but returns the rows in the order of `df`,
    so the rows are selected one id at a time instead.

    Parameters
    ----------
    ids : List[str]
        Movie ids to select; ids that are not present in `df` contribute no rows.
    df : pd.DataFrame
        Frame with a 'movie_id' column.

    Returns
    -------
    pd.DataFrame
        Rows of `df` ordered as in `ids`. An empty `ids` yields an empty frame
        with the columns of `df` (pd.concat would raise on an empty list).
    """
    if len(ids) == 0:
        return df.iloc[0:0]
    return pd.concat([df[df['movie_id'] == _id] for _id in ids])

### Czech stemmer

In [3]:
#! /usr/bin/env python3.1
''' Czech stemmer
Copyright © 2010 Luís Gomes <luismsgomes@gmail.com>.

Ported from the Java implementation available at:
    http://members.unine.ch/jacques.savoy/clef/index.html

'''
import re
import sys



def cz_stem(word, aggressive=False):
    """
    Stems a single Czech word.

    Words that are not made purely of word characters, or that use mixed case,
    are returned unchanged. Otherwise case endings and possessive suffixes are
    removed; with `aggressive=True` comparative, diminutive, augmentative and
    derivational suffixes are removed as well. The casing of the input
    (lower / Title / UPPER) is restored on the result.
    """
    if re.match("^\\w+$", word) is None:
        return word

    was_upper = word.isupper()
    was_title = word.istitle()
    if not (word.islower() or was_title or was_upper):
        # mixed-case words are skipped
        return word

    # all pattern matching is done in lowercase
    stem = _remove_possessives(_remove_case(word.lower()))
    if aggressive:
        for strip_suffix in (_remove_comparative, _remove_diminutive,
                             _remove_augmentative, _remove_derivational):
            stem = strip_suffix(stem)

    if was_upper:
        return stem.upper()
    if was_title:
        return stem.title()
    return stem

def _remove_case(word):
    if len(word) > 7 and word.endswith("atech"):
        return word[:-5]
    if len(word) > 6:
        if word.endswith("ětem"):
            return _palatalise(word[:-3])
        if word.endswith("atům"):
            return word[:-4]
    if len(word) > 5:
        if word[-3:] in {"ech", "ich", "ích", "ého", "ěmi", "emi", "ému",
                         "ete", "eti", "iho", "ího", "ími", "imu"}:
            return _palatalise(word[:-2])
        if word[-3:] in {"ách", "ata", "aty", "ých", "ama", "ami",
                         "ové", "ovi", "ými"}:
            return word[:-3]
    if len(word) > 4:
        if word.endswith("em"):
            return _palatalise(word[:-1])
        if word[-2:] in {"es", "ém", "ím"}:
            return _palatalise(word[:-2])
        if word[-2:] in {"ům", "at", "ám", "os", "us", "ým", "mi", "ou"}:
            return word[:-2]
    if len(word) > 3:
        if word[-1] in "eiíě":
            return _palatalise(word)
        if word[-1] in "uyůaoáéý":
            return word[:-1]
    return word

def _remove_possessives(word):
    if len(word) > 5:
        if word[-2:] in {"ov", "ův"}:
            return word[:-2]
        if word.endswith("in"):
            return _palatalise(word[:-1])
    return word

def _remove_comparative(word):
    if len(word) > 5:
        if word[-3:] in {"ejš", "ějš"}:
            return _palatalise(word[:-2])
    return word

def _remove_diminutive(word):
    if len(word) > 7 and word.endswith("oušek"):
        return word[:-5]
    if len(word) > 6:
        if word[-4:] in {"eček", "éček", "iček", "íček", "enek", "ének",
                         "inek", "ínek"}:
            return _palatalise(word[:-3])
        if word[-4:] in {"áček", "aček", "oček", "uček", "anek", "onek",
                         "unek", "ánek"}:
            return _palatalise(word[:-4])
    if len(word) > 5:
        if word[-3:] in {"ečk", "éčk", "ičk", "íčk", "enk", "énk",
                         "ink", "ínk"}:
            return _palatalise(word[:-3])
        if word[-3:] in {"áčk", "ačk", "očk", "učk", "ank", "onk",
                         "unk", "átk", "ánk", "ušk"}:
            return word[:-3]
    if len(word) > 4:
        if word[-2:] in {"ek", "ék", "ík", "ik"}:
            return _palatalise(word[:-1])
        if word[-2:] in {"ák", "ak", "ok", "uk"}:
            return word[:-1]
    if len(word) > 3 and word[-1] == "k":
        return word[:-1]
    return word

def _remove_augmentative(word):
    if len(word) > 6 and word.endswith("ajzn"):
        return word[:-4]
    if len(word) > 5 and word[-3:] in {"izn", "isk"}:
        return _palatalise(word[:-2])
    if len(word) > 4 and word.endswith("ák"):
        return word[:-2]
    return word

def _remove_derivational(word):
    if len(word) > 8 and word.endswith("obinec"):
        return word[:-6]
    if len(word) > 7:
        if word.endswith("ionář"):
            return _palatalise(word[:-4])
        if word[-5:] in {"ovisk", "ovstv", "ovišt", "ovník"}:
            return word[:-5]
    if len(word) > 6:
        if word[-4:] in {"ásek", "loun", "nost", "teln", "ovec", "ovík",
                         "ovtv", "ovin", "štin"}:
            return word[:-4]
        if word[-4:] in {"enic", "inec", "itel"}:
            return _palatalise(word[:-3])
    if len(word) > 5:
        if word.endswith("árn"):
            return word[:-3]
        if word[-3:] in {"ěnk", "ián", "ist", "isk", "išt", "itb", "írn"}:
            return _palatalise(word[:-2])
        if word[-3:] in {"och", "ost", "ovn", "oun", "out", "ouš",
                         "ušk", "kyn", "čan", "kář", "néř", "ník",
                         "ctv", "stv"}:
            return word[:-3]
    if len(word) > 4:
        if word[-2:] in {"áč", "ač", "án", "an", "ář", "as"}:
            return word[:-2]
        if word[-2:] in {"ec", "en", "ěn", "éř", "íř", "ic", "in", "ín",
                         "it", "iv"}:
            return _palatalise(word[:-1])
        if word[-2:] in {"ob", "ot", "ov", "oň", "ul", "yn", "čk", "čn",
                         "dl", "nk", "tv", "tk", "vk"}:
            return word[:-2]
    if len(word) > 3 and word[-1] in "cčklnt":
        return word[:-1]
    return word

def _palatalise(word):
    if word[-2:] in {"ci", "ce", "či", "če"}:
        return word[:-2] + "k"

    if word[-2:] in {"zi", "ze", "ži", "že"}:
        return word[:-2] + "h"

    if word[-3:] in {"čtě", "čti", "čtí"}:
        return word[:-3] + "ck"

    if word[-3:] in {"ště", "šti", "ští"}:
        return word[:-3] + "sk"
    return word[:-1]

"""
if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ("light", "aggressive"):
        sys.exit("usage: {} light|aggressive".format(sys.argv[0]))
    aggressive = sys.argv[1] == "aggressive"
    for line in sys.stdin:
        print(*[cz_stem(word, aggressive=aggressive)
                for word in line.split()])
"""
pass

# Load datasets

In [4]:
# Load the movie table, drop the columns that are not used in this notebook and
# flatten the `creators` column into a plain list of names per movie.
# NOTE: the index is deliberately not reset here -- resetting it causes trouble
# with the join on the index performed later.
movies = (
    pd.read_pickle(PROJ_ROOT / 'data' / 'movies.pkl')
    .drop(columns=['kind', 'length', 'poster', 'foreign_titles'])
    .assign(creators=lambda frame: frame['creators'].apply(creators2list))
)

# Load the individual user ratings
ratings = pd.read_pickle(PROJ_ROOT / 'data' / 'ratings.pkl')

display(movies.head(3))
ratings.head(3)

Unnamed: 0_level_0,title,description,genres,countries,year,creators
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
230421-houbicky,Houbičky,"Partička amerických teenagerů, která si vyrazi...","[Horor, Thriller]","[Irsko, Velká Británie, Dánsko]",2007,"[Lindsey Haun, Jack Huston, Max Kasch, Paddy B..."
10789-prvni-liga,První liga,V nejvyšší lize získávají hráči baseballu boha...,"[Komedie, Sportovní]",[USA],1989,"[Tom Berenger, Charlie Sheen, Corbin Bernsen, ..."
235032-yes-man,Yes Man,"Carl Allen je zatrpklý bankovní úředník, který...","[Komedie, Romantický]",[USA],2008,"[Jim Carrey, Zooey Deschanel, Bradley Cooper, ..."


Unnamed: 0,username,movie_id,stars,date,comment
0,kinghome,230421-houbicky,5.0,2011-11-13,Hodnocení některých šašků tady opravdu necháp...
1,SimonShot,230421-houbicky,5.0,2012-12-19,Tento snímek je zajímavý už jenom tím že se n...
2,blackend,230421-houbicky,5.0,2010-09-04,Pohoda a vzhledem k dobré atmosféře a nízkému...


In [5]:
# Mean star rating of each film, indexed by movie_id
avg_ratings = (
    ratings[['movie_id', 'stars']]
    .groupby(['movie_id'])
    .mean()
    .rename(columns={'stars': 'avg_stars'})
)

# Number of ratings of each film, indexed by movie_id
count_ratings = (
    ratings[['movie_id', 'stars']]
    .groupby('movie_id')
    .count()
    .rename(columns={'stars': 'num_ratings'})
)

# Join both statistics onto the movies (joined on the index), order the movies by
# the number of ratings (ascending) and turn the index back into a regular column.
movie_db = (
    movies
    .join(avg_ratings)
    .join(count_ratings)
    .sort_values(by='num_ratings')
    .reset_index()
)



# Compute Bayesian Average Rating Score -> 4.6* movie with many reviews > 5.0* movie with few reviews

def compute_bayes_avg(item_rating_avg : float, item_rating_count : int, M, C) -> float:
    """
    Computes the Bayesian average rating of an item.

    The item's own ratings are blended with `C` virtual ratings of value `M`,
    which pulls items with few ratings towards `M`.

    Parameters
    ----------
    item_rating_avg : float
        Mean rating of the item.
    item_rating_count : int
        Number of ratings of the item.
    M
        Prior mean rating.
    C
        Confidence, i.e. the weight of the prior expressed as a number of ratings.
    """
    weighted_sum = item_rating_avg * item_rating_count + C*M
    total_weight = item_rating_count + C
    return weighted_sum / total_weight

# Prior mean M: per-movie averages weighted by their rating counts,
# i.e. the mean over all individual ratings
M = np.sum(movie_db['avg_stars'] * movie_db['num_ratings']) / np.sum(movie_db['num_ratings'])
# Confidence C: the 25th percentile of the per-movie rating counts
C = movie_db['num_ratings'].quantile(0.25)

movie_db['bayes_avg_rating'] = compute_bayes_avg(
    item_rating_avg=movie_db['avg_stars'],
    item_rating_count=movie_db['num_ratings'],
    M=M,
    C=C,
)

movie_db

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings,bayes_avg_rating
0,75613-hercule-poirot_498504-serie-9,Hercule Poirot,Malý belgický detektiv Hercule Poirot (David S...,"[Krimi, Drama, Mysteriózní, Thriller]",[Velká Británie],(1989–2013),"[David Suchet, Hugh Fraser, Philip Jackson, Ed...",5.000000,1,3.501384
1,613789-obeti,Oběti,Na televizní obrazovku se vrací cyklus o lide...,[Drama],[Česko],(1999–2008),"[Zbyněk Fric, Karel Zima, Libor Žídek, Petr Sl...",5.000000,1,3.501384
2,75613-hercule-poirot_498507-serie-12,Hercule Poirot,Malý belgický detektiv Hercule Poirot (David S...,"[Krimi, Drama, Mysteriózní, Thriller]",[Velká Británie],(1989–2013),"[David Suchet, Hugh Fraser, Philip Jackson, Ed...",4.500000,2,3.506088
3,350930-krtek,Krtek,,"[Animovaný, Dobrodružný]","[Česko, Finsko]",2011,"[Zdeněk Miler, Wiliam Bukový]",4.500000,2,3.506088
4,33863-bajaja,Bajaja,Jiří Trnka natočil v roce 1950 volně podle poh...,"[Animovaný, Loutkový, Pohádka]",[Československo],1950,"[Jiří Trnka, Václav Trojan]",4.285714,14,3.581740
...,...,...,...,...,...,...,...,...,...,...
8669,232938-hobit-neocekavana-cesta,Hobit: Neočekávaná cesta,Film sleduje cestu hlavní postavy Bilbo Pytlík...,"[Dobrodružný, Fantasy]","[USA, Nový Zéland]",2012,"[Martin Freeman, Ian McKellen, Richard Armitag...",3.953150,3159,3.938292
8670,227786-interstellar,Interstellar,Příběh se odehrává v nepříliš vzdálené budoucn...,"[Sci-Fi, Dobrodružný, Drama]","[USA, Velká Británie, Kanada]",2014,"[Matthew McConaughey, Anne Hathaway, Jessica C...",4.072779,3174,4.054193
8671,223734-temny-rytir,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770,4.530368
8672,254156-pocatek,Počátek,Dom Cobb (Leonardo DiCaprio) je velmi zkušený ...,"[Akční, Sci-Fi, Thriller, Mysteriózní, Dobrodr...","[USA, Velká Británie]",2010,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",4.368000,4000,4.345674



# Recommendation Systems
---
## Base class

In [6]:
import abc

class RecSysBase(abc.ABC):
    """
    Recommendation System Base class.

    The system stores a database of available movies (`movies_df`), which must
    contain a 'movie_id' column. Subclasses implement `recommend`.
    """
    def __init__(self, movies_df : pd.DataFrame):
        self.movies_df = movies_df

    @property
    def name(self) -> str:
        """Name of the concrete class; used as a suffix of the score column."""
        return self.__class__.__name__

    @abc.abstractmethod
    def recommend(self, user_history : List[str], n : int = None) -> pd.DataFrame:
        """
        Generates recommendations based on user's history of rated movies.

        Parameters
        ----------
        user_history : List[str]
            A list of movie ids a user has rated so far.

        n : int
            The number of recommendations to return.
            Default value None returns all movies from database.

        Returns
        -------
        pd.DataFrame
            movie_id ~ movie ID
            score_<class_name> ~ score of each movie
        """
        ...

    def get_candidate_movies(self, user_history : List[str]) -> pd.DataFrame:
        """
        Returns the movies a user has not seen yet.

        Rows whose 'movie_id' appears in `user_history` are filtered out; the
        remaining rows keep the order and index of `self.movies_df`.
        """
        # A boolean mask avoids stringifying the whole frame (needed for
        # drop_duplicates on list-valued columns) and does not drop rows that
        # merely happen to be duplicated in the database.
        already_seen = self.movies_df['movie_id'].isin(user_history)
        return self.movies_df.loc[~already_seen]

    def _ids2df(self, movie_ids : List[str], preserve_order : bool = True) -> pd.DataFrame:
        """
        For a list of movie ids returns a dataframe compatible with `self.movies_df`.

        With `preserve_order=True` the rows follow the order of `movie_ids`;
        otherwise (or for an empty list) they follow the order of `self.movies_df`.
        """
        if not preserve_order or len(movie_ids) == 0:
            # faster variant but sorted as in self.movies_df
            mask = self.movies_df['movie_id'].isin(movie_ids)
            return self.movies_df.loc[mask]

        # select one id at a time so that the rows come out in the requested order
        return pd.concat(
            [self.movies_df[self.movies_df['movie_id'] == mid] for mid in movie_ids]
        )


## Random recommendations

In [7]:
import random

class RandomRS(RecSysBase):
    """
    System that recommends movies randomly.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(RandomRS, self).__init__(movies_df)

    def recommend(self, user_history : List[str], n : int = None) -> pd.DataFrame:
        """
        Recommends n random movies.
        Movies from `user_history` do not appear in the recommendations.

        Parameters
        ----------
        user_history : List[str]
            A list of movie ids a user has rated so far.

        n : int
            The number of recommendations to return. If there are fewer
            candidates than `n`, all candidates are returned.
            Default value None returns all candidates in random order.

        Returns
        -------
        pd.DataFrame
            movie_id ~ movie ID
            score_RandomRS ~ constant 0, a random pick carries no score
        """
        candidates = self.get_candidate_movies(user_history)
        if n is None:
            picked = candidates.sample(frac=1)
        else:
            # DataFrame.sample raises when asked for more rows than available
            picked = candidates.sample(min(int(n), len(candidates)))

        # copy, so that adding the score column does not write into a slice of the database
        recommendations = picked[['movie_id']].copy()
        recommendations[f'score_{self.name}'] = 0
        return recommendations
    


In [8]:
# Smoke test: with an empty history and no `n`, all movies are returned in random order.
rnd_rs = RandomRS(movie_db)

rnd_rs.recommend([])

Unnamed: 0,movie_id,score_RandomRS
1948,542676-destivy-den-v-new-yorku,0
7318,904-kytice,0
5127,350037-osmy-smysl,0
3936,203794-avatar-legenda-o-aangovi,0
6456,8568-solaris,0
...,...,...
4204,1782-crash,0
4548,349130-az-po-usi,0
3553,20841-rek-zorba,0
3020,233786-hrdinove-valky,0


## Recommendation based on movie popularity

In [9]:

class TopNRS(RecSysBase):
    """
    System recommends top n most popular movies a user has not rated yet.

    Popularity is read from the column named by `self.sort_by_column`.
    The constructor is abstract because concrete subclasses have to set
    `self.sort_by_column`.
    """
    @abc.abstractmethod
    def __init__(self, movies_df : pd.DataFrame):
        super().__init__(movies_df)

    def recommend(self, user_history : List[str], n : int = None) -> pd.DataFrame:
        """
        Recommends top n movies (a user has not yet seen) which are sorted by a selected column.

        Returns
        -------
        pd.DataFrame
            movie_id ~ movie ID
            score_<class_name>_<sort_by_column> ~ value of the sorting column
        """
        column = self.sort_by_column
        ranked = self.get_candidate_movies(user_history).sort_values(by=column, ascending=False)
        top = ranked.head(n)

        results = top[['movie_id']].copy()
        results[f'score_{self.name}_{column}'] = top[column]
        return results

    
class MostRatingsRS(TopNRS):
    """Recommends the unseen movies with the highest number of ratings."""
    def __init__(self, movies_df : pd.DataFrame):
        super().__init__(movies_df)
        # column of `movies_df` used for ranking
        self.sort_by_column = 'num_ratings'
    
class TopBayesAvgRatingRS(TopNRS):
    """Recommends the unseen movies with the highest Bayesian average rating."""
    def __init__(self, movies_df : pd.DataFrame):
        super().__init__(movies_df)
        # column of `movies_df` used for ranking
        self.sort_by_column = 'bayes_avg_rating'


In [10]:
# Rank every movie except the one in `history` by its number of ratings.
history = ['228329-avatar']

rs_num_ratings = MostRatingsRS(movie_db)
rs_num_ratings.recommend(history)

Unnamed: 0,movie_id,score_MostRatingsRS_num_ratings
8672,254156-pocatek,4000
8671,223734-temny-rytir,3770
8670,227786-interstellar,3174
8669,232938-hobit-neocekavana-cesta,3159
8668,294824-nespoutany-django,3129
...,...,...
4,33863-bajaja,14
2,75613-hercule-poirot_498507-serie-12,2
3,350930-krtek,2
1,613789-obeti,1


In [11]:
# Rank all movies (empty history) by their Bayesian average rating.
rs_bayes_avg = TopBayesAvgRatingRS(movie_db)
rs_bayes_avg.recommend([])

Unnamed: 0,movie_id,score_TopBayesAvgRatingRS_bayes_avg_rating
8667,2294-vykoupeni-z-veznice-shawshank,4.724851
8466,821-s-certy-nejsou-zerty,4.710589
8595,1248-terminator-2-den-zuctovani,4.706829
8663,10135-forrest-gump,4.690586
8546,6178-dvanact-rozhnevanych-muzu,4.689629
...,...,...
7059,8597-jak-ukrast-dagmaru,0.949324
7746,289279-tisic-a-jedna-noc,0.895201
6759,226809-velmi-krehke-vztahy,0.857553
8339,215294-ordinace-v-ruzove-zahrade,0.829809


# Gensim models: TFIDF, LSI

In [12]:

class GensimModelWrapper(RecSysBase):
    """
    Base class for gensim based models.
    Implements methods for document preprocessing.

    The bag-of-words helpers read the gensim dictionary from the class attribute
    `DICTIONARY`, which has to be set on the class before they are called.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(GensimModelWrapper, self).__init__(movies_df)

    @abc.abstractmethod
    def recommend(self, user_history : List[str], n : int = None) -> pd.DataFrame:
        ...

    @classmethod
    def _document_to_tokens(cls, document: str) -> List[str]:
        """
        Filters stop words and applies stemming on the given string.

        Parameters
        ----------
        document : str
            A simple string containing text to preprocess.

        Returns
        -------
        List[str]
            Lowercased, punctuation-stripped and stemmed tokens that are not
            Czech stop words and are longer than 2 characters.
        """
        # Build the stop-word set once per document instead of once per token.
        czech_stop_words = set(stop_words.get_stop_words('czech'))

        tokens = word_tokenize(document.lower(), language='czech')
        tokens = (w.strip(",.:'?!()[]{}<>_-") for w in tokens)
        tokens = (w for w in tokens if w not in czech_stop_words)
        tokens = (cz_stem(w) for w in tokens)
        return [w for w in tokens if len(w) > 2]

    @classmethod
    def _document_to_bag_of_words(cls, document: str) -> List[Tuple[int, int]]:
        """Tokenizes `document` and converts it to a bag of words using `cls.DICTIONARY`."""
        return cls.DICTIONARY.doc2bow(cls._document_to_tokens(document))

    @classmethod
    def _tokens_to_bag_of_words(cls, tokens: List[str]) -> List[Tuple[int, int]]:
        """Converts already prepared tokens to a bag of words using `cls.DICTIONARY`."""
        return cls.DICTIONARY.doc2bow(tokens)
    

## TF-IDF Base Model

In [13]:
import stop_words
from nltk.tokenize import word_tokenize
from multiprocessing import get_context, Pool
from gensim.corpora import Dictionary
from gensim.matutils import cossim
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from gensim.utils import simple_preprocess
from tqdm import tqdm


class TfidfBaseRS(GensimModelWrapper):
    """
    Base class of systems that recommend unseen movies using TF-IDF vectors
    and cosine similarity.

    Subclasses define what the "document" of a movie is by implementing
    `extract_documents` (e.g. its title, description or genres).

    The preprocessing helpers are classmethods that read the dictionary and the
    TF-IDF model from the class attributes `DICTIONARY` and `TFIDF_MODEL`. These
    attributes are set only temporarily while a model is built or queried and
    are removed again afterwards.
    """

    # Number of most recent movies from the user's history used to build the query.
    QUERY_HISTORY_SIZE = 50

    def __init__(self, movies_df : pd.DataFrame):
        super(TfidfBaseRS, self).__init__(movies_df)

        # TfidfRS subclasses need to implement what a document is
        documents = self.extract_documents(movies_df)

        #TODO: can we move the following functionality to GensimModelWrapper??
        shared = self.__class__
        try:
            # Every stage uses a fresh pool: the class attribute a stage depends on
            # is set before the next pool is created, so that the worker processes
            # forked afterwards see it.

            # Build Dictionary
            with get_context('fork').Pool(None) as pool:
                doc_tokens = pool.imap(shared._document_to_tokens, documents)
                doc_tokens = tqdm(doc_tokens, desc='Building the dictionary', total=len(documents))
                self.dictionary = Dictionary(doc_tokens)
                shared.DICTIONARY = self.dictionary

            # Build TF-IDF model
            with get_context('fork').Pool(None) as pool:
                doc_bows = pool.imap(shared._document_to_bag_of_words, documents)
                doc_bows = tqdm(doc_bows, desc='Building the TF-IDF model', total=len(documents))
                self.tfidf_model = TfidfModel(doc_bows)
                shared.TFIDF_MODEL = self.tfidf_model

            # Build the index
            with get_context('fork').Pool(None) as pool:
                doc_vectors = pool.imap(shared._document_to_tfidf_vector, documents)
                doc_vectors = tqdm(doc_vectors, desc='Building the TF-IDF index', total=len(documents))
                self.index = SparseMatrixSimilarity(doc_vectors, num_docs=len(documents), num_terms=len(self.dictionary))
        finally:
            # do not leave the models on the class, even if a stage failed
            self._release_shared_models()

        # position of a document in the index -> movie id
        self.index_to_movie_id = dict(enumerate(self.movies_df.movie_id.to_list()))

    @classmethod
    def _release_shared_models(cls) -> None:
        """Removes the temporary `DICTIONARY` / `TFIDF_MODEL` attributes from the class, if set."""
        for attr in ('DICTIONARY', 'TFIDF_MODEL'):
            if attr in vars(cls):
                delattr(cls, attr)

    @staticmethod
    @abc.abstractmethod
    def extract_documents(movies_df : pd.DataFrame) -> List[str]:
        """Method extracts list of documents from movie_db"""
        ...

    def get_document_for_movie_id(self, movie_id : str) -> str:
        """
        Returns a document for movie with provided ID.

        Raises
        ------
        IndexError
            If no movie with such ID exists in the database.
        """
        return self.extract_documents(self.movies_df.loc[self.movies_df['movie_id'] == movie_id])[0]

    def recommend(self, user_history : List[str], n : int = None) -> pd.DataFrame:
        """Recommends unseen movies using tfidf vectors and cosine similarity.

        The documents of the most recent `QUERY_HISTORY_SIZE` movies of the
        history are joined into a single query, and all movies are ranked by
        the similarity of their document to this query.

        Parameters
        ----------
        user_history : List[str]
            A list of movie IDs

        n : int
            Recommends n movies, if set to an integer.
            Default value None returns all the movies.

        Returns
        -------
        pd.DataFrame
            movie_id ~ movie ID
            score_<class_name> ~ similarity to the query, in descending order
        """
        # Take last k movies, join them into a single description and create a query.
        last_k_movies = user_history[-self.QUERY_HISTORY_SIZE:]
        query = ' '.join([self.get_document_for_movie_id(mid) for mid in last_k_movies])

        shared = self.__class__
        shared.DICTIONARY = self.dictionary
        shared.TFIDF_MODEL = self.tfidf_model
        try:
            query_vector = shared._document_to_tfidf_vector(query)
        finally:
            self._release_shared_models()

        similarities = enumerate(self.index[query_vector])
        similarities = sorted(similarities, key=lambda item: item[1], reverse=True)

        # set membership instead of scanning the history list for every movie
        seen = set(user_history)
        result_ids, scores = [], []
        for idx, sim in similarities:
            # checked before appending so that n == 0 yields an empty result
            if n is not None and len(result_ids) >= n:
                break

            movie_id = self.index_to_movie_id[idx]
            if movie_id not in seen:
                result_ids.append(movie_id)
                scores.append(sim)

        return pd.DataFrame({
            'movie_id': result_ids,
            f'score_{self.name}': scores
        })

    @classmethod
    def _document_to_tfidf_vector(cls, document: str) -> List[Tuple[int, float]]:
        """Converts a raw document to its TF-IDF vector using `cls.TFIDF_MODEL`."""
        return cls.TFIDF_MODEL[cls._document_to_bag_of_words(document)]

    @classmethod
    def _bow_to_tfidf_vector(cls, bow: List[Tuple[int, int]]) -> List[Tuple[int, float]]:
        """Converts a bag of words to its TF-IDF vector using `cls.TFIDF_MODEL`."""
        return cls.TFIDF_MODEL[bow]

    

## TF-IDF recommender systems

In [14]:
class TfidfTitleRS(TfidfBaseRS):
    """TF-IDF recommender whose documents are the movie titles."""
    def __init__(self, movies_df : pd.DataFrame):
        super().__init__(movies_df)

    @staticmethod
    def extract_documents(movies_df : pd.DataFrame) -> List[str]:
        """Returns the title of each movie (converted to str) as its document."""
        titles = movies_df['title'].astype(str)
        return titles.tolist()

    
class TfidfDescriptionTitleRS(TfidfBaseRS):
    """TF-IDF recommender whose documents are the movie title followed by its description."""
    def __init__(self, movies_df : pd.DataFrame):
        super(TfidfDescriptionTitleRS, self).__init__(movies_df)

    @staticmethod
    def extract_documents(movies_df : pd.DataFrame) -> List[str]:
        """
        Method extracts list of documents from movie_db.

        Each document is '<title> <description>'. A missing description is
        treated as an empty string, so that the document stays a string
        instead of becoming NaN.
        """
        titles = movies_df['title'].astype(str)
        descriptions = movies_df['description'].fillna('').astype(str)
        return list(titles + ' ' + descriptions)


class TfidfGenresRS(TfidfBaseRS):
    """TF-IDF recommender whose documents are the space-joined genres of a movie."""
    # TODO: inverse frequency does not work well the way it is used at the moment.
    # If a user sees many Action movies and a single Animated movie, they are presented
    # with tons of Animated movies. Maybe change how the query is created.
    def __init__(self, movies_df : pd.DataFrame):
        super().__init__(movies_df)

    @staticmethod
    def extract_documents(movies_df : pd.DataFrame) -> List[str]:
        """Joins the genre list of each movie into a single space-separated document."""
        return list(map(' '.join, movies_df.genres))

    @classmethod
    def _document_to_tokens(cls, document: str) -> List[str]:
        """
        Lowercases the document and splits it into tokens.

        Unlike the base implementation, no stop-word filtering or stemming is applied.

        Parameters
        ----------
        document : str
            A simple string containing text to tokenize.
        """
        lowered = document.lower()
        return word_tokenize(lowered, language='czech')

In [15]:
# Build the TF-IDF recommender over "<title> <description>" documents (progress bars below).
rs_tfidf_descr_title = TfidfDescriptionTitleRS(movie_db)

Building the dictionary: 100%|██████████| 8674/8674 [00:05<00:00, 1708.98it/s]
Building the TF-IDF model: 100%|██████████| 8674/8674 [00:07<00:00, 1193.68it/s]
Building the TF-IDF index: 100%|██████████| 8674/8674 [00:10<00:00, 847.08it/s] 


In [16]:
# Query the title + description model with a small sample history and show the top 15.
history = ['1644-kmotr', '1645-kmotr-ii', '9499-matrix', '342231-lego-batman']
rs_tfidf_descr_title.recommend(history, n=15)

Unnamed: 0,movie_id,score_TfidfDescriptionTitleRS
0,1646-kmotr-iii,0.312191
1,1643-kmotr-1-2-3,0.273426
2,392898-lego-r-batman-film,0.167242
3,6-maffiosso,0.153228
4,9498-matrix-revolutions,0.152167
5,266796-liga-spravedlivych-krize-na-dvou-zemich,0.141337
6,223734-temny-rytir,0.134347
7,1069-batman,0.131433
8,319515-batman-navrat-temneho-rytire-cast-2,0.123253
9,60899-batman-vs-joker,0.121176


---
### TFIDF on genres

In [17]:
# Build the TF-IDF recommender over the genres of each movie.
rs_tfidf_genres = TfidfGenresRS(movie_db)

Building the dictionary: 100%|██████████| 8674/8674 [00:00<00:00, 13021.64it/s]
Building the TF-IDF model: 100%|██████████| 8674/8674 [00:00<00:00, 14957.10it/s]
Building the TF-IDF index: 100%|██████████| 8674/8674 [00:01<00:00, 8413.52it/s]


In [18]:
# Query the genre model with the same sample history; no `n`, so all unseen movies are ranked.
history = ['1644-kmotr', '1645-kmotr-ii', '9499-matrix', '342231-lego-batman']
rs_tfidf_genres.recommend(history)

Unnamed: 0,movie_id,score_TfidfGenresRS
0,90518-kovboj-bebop-lovec-odmen,0.867701
1,194929-temny-obraz,0.866236
2,12110-zelezny-obr,0.809641
3,288161-batman-rok-jedna,0.789442
4,71858-ochrance,0.786381
...,...,...
8665,292988-meda,0.000000
8666,249773-parba-ve-vegas,0.000000
8667,68928-pratele,0.000000
8668,234260-teorie-velkeho-tresku,0.000000


### TFIDF on titles only


In [19]:
# Build the TF-IDF recommender over the movie titles only.
rs_tfidf_titles = TfidfTitleRS(movie_db)

Building the dictionary: 100%|██████████| 8674/8674 [00:00<00:00, 12438.39it/s]
Building the TF-IDF model: 100%|██████████| 8674/8674 [00:00<00:00, 13060.34it/s]
Building the TF-IDF index: 100%|██████████| 8674/8674 [00:01<00:00, 7731.50it/s]


In [20]:
# Query the title-only model with the same sample history and show the top 15.
history = ['1644-kmotr', '1645-kmotr-ii', '9499-matrix', '342231-lego-batman']
rs_tfidf_titles.recommend(history, 15)

Unnamed: 0,movie_id,score_TfidfTitleRS
0,1643-kmotr-1-2-3,0.722059
1,308348-pribeh-kmotra,0.590029
2,1646-kmotr-iii,0.553994
3,93895-batman,0.299027
4,1069-batman,0.299027
5,9498-matrix-revolutions,0.254653
6,9497-matrix-reloaded,0.254653
7,366712-the-den,0.218506
8,288161-batman-rok-jedna,0.210468
9,351124-batmanuv-syn,0.204622


## LSA Model

https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [None]:
from gensim.models import LsiModel

# TODO: just copied, not working yet
class LsiRS(GensimModelWrapper):
    """
    Recommender based on Latent Semantic Indexing of "<title> <description>" documents.

    Work in progress: the model and the similarity index are built in the
    constructor, but `recommend` is not implemented yet.

    The preprocessing helpers are classmethods that read the dictionary and the
    LSI model from the class attributes `DICTIONARY` and `LSI_MODEL`; these are
    set only while the model is being built and removed afterwards.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(LsiRS, self).__init__(movies_df)

        # Document == movie title & description
        documents = list(movies_df.title.astype(str) + ' ' + movies_df.description)

        shared = self.__class__

        # Build Dictionary
        with get_context('fork').Pool(None) as pool:
            doc_tokens = pool.imap(shared._document_to_tokens, documents)
            doc_tokens = tqdm(doc_tokens, desc='Building the dictionary', total=len(documents))
            self.dictionary = Dictionary(doc_tokens)
            shared.DICTIONARY = self.dictionary

        # Build LSI model
        with get_context('fork').Pool(None) as pool:
            doc_bows = pool.imap(shared._document_to_bag_of_words, documents)
            doc_bows = tqdm(doc_bows, desc='Building the LSI model', total=len(documents))
            # id2word is passed explicitly: without it gensim derives the mapping by
            # iterating the corpus, which would exhaust this one-shot iterator
            # before the model is trained.
            self.lsi_model = LsiModel(doc_bows, id2word=self.dictionary)
            shared.LSI_MODEL = self.lsi_model

        # Build the index
        with get_context('fork').Pool(None) as pool:
            doc_vectors = pool.imap(shared._document_to_lsi_vector, documents)
            doc_vectors = tqdm(doc_vectors, desc='Building the LSI index', total=len(documents))
            # LSI vectors live in topic space, so the index width is the number of topics
            self.index = SparseMatrixSimilarity(doc_vectors, num_docs=len(documents), num_terms=self.lsi_model.num_topics)

        del shared.DICTIONARY
        del shared.LSI_MODEL

        # position of a document in the index -> movie id
        self.index_to_movie_id = dict(enumerate(self.movies_df.movie_id.to_list()))


    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> List[str]:
        # TODO: not implemented yet, currently returns None
        ...

    @classmethod
    def _document_to_lsi_vector(cls, document: str) -> List[Tuple[int, float]]:
        """Converts a raw document to its LSI vector using `cls.LSI_MODEL`."""
        return cls.LSI_MODEL[cls._document_to_bag_of_words(document)]

    @classmethod
    def _bow_to_lsi_vector(cls, bow: List[Tuple[int, int]]) -> List[Tuple[int, float]]:
        """Converts a bag of words to its LSI vector using `cls.LSI_MODEL`."""
        return cls.LSI_MODEL[bow]

In [23]:
# Build a Dictionary
# Documents are the movie descriptions only; they are tokenized sequentially
# (no worker pool) with the shared preprocessing of GensimModelWrapper.
documents = movie_db.description.to_list()
doc_tokens = list(map(GensimModelWrapper._document_to_tokens, documents))
#doc_tokens = tqdm(doc_tokens, desc='Building the dictionary', total=len(documents))
dictionary = Dictionary(doc_tokens)

In [None]:
def tokens2bow(dct, tokens):
    """Converts a list of tokens to a bag of words using the dictionary `dct`."""
    bag_of_words = dct.doc2bow(tokens)
    return bag_of_words

# Convert every tokenized description to a bag of words and fit a 200-topic LSI model on it.
corpus_bows = [tokens2bow(dictionary, tokens) for tokens in doc_tokens]
#corpus_bows = tqdm(corpus_bows, desc='Building LSI model', total=len(documents))
lsi_model = LsiModel(corpus_bows, num_topics=200, id2word=dictionary)

In [None]:
# Display the fitted LSI model object.
lsi_model

# BM25 

In [25]:
# TODO: class BM25BaseRS(RecSysBase):


---
# Ensembles

In [43]:
from sklearn.preprocessing import MinMaxScaler
from __future__ import annotations

class Ensemble:
    """
    Class representing an ensemble of algorithms that individually give scores to movies.
    The final recommendation is made by combining the scores and taking the top n movies.

    Parameters
    ----------
    movies_df : pd.DataFrame
        Movie database that is provided to each RecSys based class.

    rs_classes : List[type[RecSysBase]]
        Classes (not instances) to use in the ensemble.
        Each one is instantiated as `rs_class(movies_df)`.

    rs_weights : List[float], optional
        One numeric weight (int or float) per entry of `rs_classes`, in the same order.
        The weights are used as given; they are not required to sum to 1.
        When the list is None, has a different length than `rs_classes` or contains
        a non-numeric entry, it is ignored and every algorithm gets weight 1.
    """
    def __init__(
        self,
        movies_df : pd.DataFrame,
        rs_classes: List[type[RecSysBase]],
        rs_weights: List[float] = None
    ):
        self.movies_df = movies_df
        self.recsystems = []
        self.rs_weights = []

        if self._valid_weights(rs_weights, len(rs_classes)):
            weights = rs_weights
        else:
            # Fallback: equal importance for every algorithm.
            weights = np.ones(len(rs_classes))

        for rs_cls, w in zip(rs_classes, weights):
            self.append_rs(rs_cls, w)

    @staticmethod
    def _valid_weights(weights : List[float], exp_length : int) -> bool:
        """Checks that the weights are not None, have the expected length and are all real numbers.

        Ints (and NumPy integer/floating scalars) are accepted alongside floats, so that
        e.g. `[1, 0.1, 1, 1]` is not silently discarded. Booleans are rejected even
        though `bool` is a subclass of `int`.
        """
        if weights is None or len(weights) != exp_length:
            return False

        numeric_types = (int, float, np.integer, np.floating)
        return all(
            isinstance(w, numeric_types) and not isinstance(w, bool)
            for w in weights
        )

    def append_rs(self, rs_class : type[RecSysBase], rs_weight : float = 1) -> Ensemble:
        """Instantiates `rs_class` with the movie database and appends it to the ensemble.

        Returns `self` so that calls can be chained.
        """
        print(f'Ensemble: Initializing {rs_class.__name__} ...')
        self.recsystems.append(rs_class(self.movies_df))
        self.rs_weights.append(rs_weight)
        print(f'Ensemble: {rs_class.__name__} done.')
        return self

    def apply_algs(self, movie_ids : List[str], df_merge_how : str = 'inner') -> pd.DataFrame:
        """Returns scores given to movies by each recommendation algorithm.

        Parameters
        ----------
        movie_ids : List[str]
            List of movie_ids

        df_merge_how : str
            Argument for pd.DataFrame.merge method on how to perform the merge.
            Default is "inner".

        Returns
        -------
        pd.DataFrame
            The per-algorithm results merged on `movie_id`; `combine_scores` expects
            the score columns to be named `score_<name of a RS>`.

        Raises
        ------
        ValueError
            If the ensemble contains no recommender system.
        """
        if not self.recsystems:
            raise ValueError('Ensemble contains no recommender systems; call append_rs() first.')

        dataframes = [rs.recommend(movie_ids) for rs in self.recsystems]

        result_df = dataframes[0]

        for df in dataframes[1:]:
            # TODO: watch out for `how` when joining dfs without all movies present
            result_df = result_df.merge(df, how=df_merge_how, on='movie_id')

        return result_df

    def combine_scores(
        self,
        scores_table: pd.DataFrame,
        normalize : bool = True,
        rs_weights : List[float] = None
        ) -> pd.DataFrame:
        """
        Combines recommendation scores of the available algorithms.

        Parameters
        ----------
        scores_table : pd.DataFrame
            A DataFrame containing at least `movie_id` and scores of the algs as `score_<name of the alg>`.
            It is not modified; the result is built on a copy.

        normalize : bool
            Whether to perform min-max scaling of each score column.
            Default is true.
            TODO: alternatively ignore scores and work simply with ranks?

        rs_weights : List[float], optional
            Per-algorithm weights overriding `self.rs_weights` for this call.
            Ignored when invalid (see `_valid_weights`).

        Returns
        -------
        pd.DataFrame
            A copy of `scores_table` with (optionally normalized and weighted) score
            columns and the final score for each movie in column `final_score`.
        """
        score_cols = [c for c in scores_table.columns if c.startswith('score_')]

        # Work on a copy so that the frame handed in by the caller (possibly one
        # returned directly by a recommender) is never altered.
        combined = scores_table.copy()

        if normalize:
            combined[score_cols] = MinMaxScaler().fit_transform(combined[score_cols])

        weights = rs_weights if self._valid_weights(rs_weights, len(self.recsystems)) else self.rs_weights

        if any(w != 1 for w in weights):
            # NOTE: weights are applied positionally, i.e. the i-th weight scales
            # the i-th `score_` column.
            combined[score_cols] = combined[score_cols].mul(list(weights), axis='columns')

        combined['final_score'] = combined[score_cols].sum(axis=1)
        return combined

    def recommend(self,
                  movie_ids: List[str],
                  n : int = None,
                  rs_weights : List[float] = None,
                  return_full_info=False) -> pd.DataFrame:
        """Produces recommendations by combining results of its ensemble.

        Parameters
        ----------
        movie_ids : List[str]
            List of movie ids representing the user history.

        n : int
            Number of movies to recommend.
            Default value is None - all available movies are returned in descending order by score.

        rs_weights : List[float]
            Per-algorithm weights in the order in which the algorithms were added.
            Overrides self.rs_weights for this call; ignored when invalid.

        return_full_info : bool
            When true, the result is merged with `self.movies_df` on `movie_id`.

        Returns
        -------
        pd.DataFrame
            Score table sorted by `final_score` in descending order.
        """
        scores = self.apply_algs(movie_ids)
        ranked = self.combine_scores(
            scores,
            normalize=True,
            rs_weights=rs_weights,
        ).sort_values(by='final_score', ascending=False)

        if n is not None:
            ranked = ranked[:n]

        if return_full_info:
            return ranked.merge(self.movies_df, how='inner', on='movie_id')
        return ranked

    def userId_to_history(self, user_id : str) -> List[str]:
        # TODO: helper func. for testing purpose
        ...


In [44]:
ensemble = Ensemble(movie_db, [TopBayesAvgRatingRS, MostRatingsRS, TfidfTitleRS, TfidfDescriptionTitleRS])

Ensemble: Initializing TopBayesAvgRatingRS ...
Ensemble: TopBayesAvgRatingRS done.
Ensemble: Initializing MostRatingsRS ...
Ensemble: MostRatingsRS done.
Ensemble: Initializing TfidfTitleRS ...


Building the dictionary: 100%|██████████| 8674/8674 [00:00<00:00, 15174.10it/s]
Building the TF-IDF model: 100%|██████████| 8674/8674 [00:00<00:00, 14653.32it/s]
Building the TF-IDF index: 100%|██████████| 8674/8674 [00:00<00:00, 9691.11it/s] 


Ensemble: TfidfTitleRS done.
Ensemble: Initializing TfidfDescriptionTitleRS ...


Building the dictionary: 100%|██████████| 8674/8674 [00:05<00:00, 1727.64it/s]
Building the TF-IDF model: 100%|██████████| 8674/8674 [00:12<00:00, 715.64it/s] 
Building the TF-IDF index: 100%|██████████| 8674/8674 [00:12<00:00, 671.82it/s]


Ensemble: TfidfDescriptionTitleRS done.


In [45]:
print(history)
ensemble.recommend(history, n=15)

['1644-kmotr', '1645-kmotr-ii', '9499-matrix', '342231-lego-batman']


Unnamed: 0,movie_id,score_TopBayesAvgRatingRS_bayes_avg_rating,score_MostRatingsRS_num_ratings,score_TfidfTitleRS,score_TfidfDescriptionTitleRS,final_score
271,1646-kmotr-iii,0.870542,0.143663,0.767242,1.0,2.781447
240,1643-kmotr-1-2-3,0.876873,0.026519,1.0,0.87583,2.779222
23,223734-temny-rytir,0.951909,0.787012,0.0,0.430337,2.169258
706,228329-avatar,0.820036,1.0,0.0,0.060627,1.880663
442,1069-batman,0.844674,0.171643,0.414131,0.421004,1.851452
118,254156-pocatek,0.906238,0.835039,0.0,0.072225,1.813502
3275,308348-pribeh-kmotra,0.712402,0.137398,0.817149,0.115541,1.78249
303,252669-temny-rytir-povstal,0.865476,0.628941,0.0,0.263657,1.758074
0,2294-vykoupeni-z-veznice-shawshank,1.0,0.651493,0.0,0.067825,1.719318
160,136224-batman-zacina,0.897306,0.352892,0.245835,0.22114,1.717173


In [46]:
ensemble.recommend(history, n=15, rs_weights=[0.25, 0.1, 0.25, 0.8], return_full_info=True)

Unnamed: 0,movie_id,score_TopBayesAvgRatingRS_bayes_avg_rating,score_MostRatingsRS_num_ratings,score_TfidfTitleRS,score_TfidfDescriptionTitleRS,final_score,title,description,genres,countries,year,creators,avg_stars,num_ratings,bayes_avg_rating
0,1646-kmotr-iii,0.217636,0.014366,0.191811,0.8,1.223812,Kmotr III,Člověk financí a politiky. Al Pacino v hlavní ...,"[Drama, Krimi]",[USA],1990,"[Al Pacino, Diane Keaton, Talia Shire, Francis...",4.309144,689,4.201318
1,1643-kmotr-1-2-3,0.219218,0.002652,0.25,0.700664,1.172534,"Kmotr 1, 2, 3",Původní trilogie KMOTR je zde sestříhána v chr...,"[Krimi, Drama, Thriller]",[USA],1992,"[Marlon Brando, Al Pacino, Robert Duvall, Fran...",4.828125,128,4.22692
2,1069-batman,0.211168,0.017164,0.103533,0.336803,0.668668,Batman,"V Gothamu bojuje proti zločinu nejen policie, ...","[Akční, Krimi, Thriller]","[USA, Velká Británie]",1989,"[Michael Keaton, Jack Nicholson, Kim Basinger,...",4.173755,823,4.096705
3,392898-lego-r-batman-film,0.175894,0.006724,0.056167,0.428564,0.667349,LEGO® Batman film,Batman pokračuje v neúnavné obraně Gotham City...,"[Animovaný, Akční, Dobrodružný, Komedie, Rodin...","[USA, Dánsko]",2017,"[Will Arnett, Jenny Slate, Ralph Fiennes, Chri...",3.5387,323,3.526102
4,9498-matrix-revolutions,0.170655,0.018229,0.088169,0.389934,0.666987,Matrix Revolutions,Ve výbušné závěrečné kapitole trilogie Matrix ...,"[Akční, Sci-Fi]","[USA, Austrálie]",2003,"[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",3.435927,874,3.441355
5,223734-temny-rytir,0.237977,0.078701,0.0,0.34427,0.660948,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770,4.530368
6,319515-batman-navrat-temneho-rytire-cast-2,0.211921,0.00426,0.047185,0.31584,0.579206,"Batman: Návrat Temného rytíře, část 2.","Filmová adaptace komiksu ""Návrat Temného rytíř...","[Animovaný, Akční]",[USA],2013,"[Peter Weller, Ariel Winter, Michael McKean, J...",4.42439,205,4.108885
7,9497-matrix-reloaded,0.186074,0.01944,0.088169,0.27168,0.565363,Matrix Reloaded,Druhé pokračování trilogie nás opět zavede do...,"[Akční, Sci-Fi]","[USA, Austrálie]",2003,"[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",3.713519,932,3.690777
8,6-maffiosso,0.167248,0.004114,0.0,0.392652,0.564014,Maffiósso,Mistr filmové parodie Jim Abrahams si vzal ten...,"[Komedie, Drama, Krimi]",[USA],1998,"[Jay Mohr, Billy Burke, Christina Applegate, J...",3.333333,198,3.386243
9,274187-batman-vs-red-hood,0.200124,0.003759,0.053705,0.300289,0.557877,Batman vs. Red Hood,Tento akcí nabitý animovaný Batman staví proti...,"[Animovaný, Akční, Krimi, Thriller]",[USA],2010,"[Bruce Greenwood, Jason Isaacs, Neil Patrick H...",4.165746,181,3.918054


---
# Evaluation

1. pick user
2. get his history
3. split history into train & test
4. get recommendations using train
5. compare recommendations & test sets using Jaccard similarity

In [47]:
from sklearn.metrics import jaccard_score

In [48]:
from typing import Set

def jaccard_similarty(a : Set[Any], b : Set[Any]) -> float:
    """Computes Jaccard similarity |a ∩ b| / |a ∪ b| between two sets.

    Returns 0.0 when both sets are empty (the ratio is undefined there and
    would otherwise raise ZeroDivisionError).
    """
    intersection = len(a.intersection(b))
    union = (len(a) + len(b)) - intersection
    if union == 0:
        # Both inputs are empty: nothing overlaps, report no similarity.
        return 0.0
    return float(intersection) / union

In [49]:
# NOTE(review): range(-1, -19) uses the default step of +1 and is therefore empty,
# so the second set is just {1} and the result is 1 / 199. If negative numbers
# were intended here, the range needs an explicit step of -1 — confirm.
jaccard_similarty(
    set(list(range(1, 200))),
    set([1] + list(range(-1, -19)))
)

0.005025125628140704

In [51]:
class Metric:
    """Placeholder for an evaluation metric; nothing is implemented yet."""
    ...
    # TODO: implement: (y_pred, y_true) - jaccard, recall, precision, ...

class Evaluator:
    """Placeholder for an evaluation driver.

    The constructor accepts a recommender (a single system or an `Ensemble`) and a
    list of metrics, but its body is only `...`, so nothing is stored yet.
    """
    def __init__(
        self,
        rec_system : RecSysBase | Ensemble,
        metrics : List[Metric],
        
    ):
        ...

    


In [77]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# way way faster when groupby() does not have to be repeated
GROUPED = ratings.groupby('username')

# all ratings for a username
def get_rated_movies_for_user(username : str, only_movie_ids : bool = False) -> pd.DataFrame:
    """Looks up the group of `username` in the pre-computed `GROUPED` object.

    Returns the whole group by default, or just its `movie_id` column as a
    plain list when `only_movie_ids` is true.
    """
    user_ratings = GROUPED.get_group(username)
    if only_movie_ids:
        return user_ratings.movie_id.tolist()
    return user_ratings


def _get_usernames_with_min_max_ratings(min_ratings : int, max_ratings : int) -> pd.DataFrame:
    """Returns a frame with columns `username` and `count` restricted to users whose
    number of rows in `ratings` lies between `min_ratings` and `max_ratings`."""
    assert min_ratings <= max_ratings
    counts_desc = ratings.username.value_counts().sort_values(ascending=False)
    per_user = counts_desc.to_frame('count').reset_index()
    per_user.columns = ['username', 'count']
    in_range = per_user['count'].between(min_ratings, max_ratings)
    return per_user[in_range]

def get_user2history_map(
    n : int = None,                 # how many users to sample; None -> every candidate in the range
    min_ratings : int = 2,          # fewest ratings a user may have to be considered
    max_ratings : int = 150,        # most ratings a user may have to be considered
    force_n : bool = True,          # raise if fewer than `n` candidates exist instead of returning fewer
    rnd_seed : int = 42             # random_state handed to DataFrame.sample
) -> Dict[str, List[str]]:          # dict {username: [movie_ids] }
    """Samples users by their number of ratings and maps each one to the movies they rated.

    Raises RuntimeError when `force_n` is set and fewer than `n` candidates exist.
    """
    candidate_usernames = _get_usernames_with_min_max_ratings(min_ratings, max_ratings)

    if n is None:
        n = len(candidate_usernames)
    elif force_n and len(candidate_usernames) < n:
        raise RuntimeError(
            f'Not enough candidate users to sample from.\n'
            f'{len(candidate_usernames)} users found but {n} are required. '
            f'Input parameters: min={min_ratings}, max={max_ratings}'
        )

    # Without force_n the sample simply shrinks to what is available.
    sample_size = min(n, len(candidate_usernames))
    sampled_usernames = candidate_usernames.sample(sample_size, random_state=rnd_seed).username

    user2history = {}
    for un in tqdm(sampled_usernames):
        user2history[un] = get_rated_movies_for_user(un, only_movie_ids=True)
    return user2history
    

def train_test_split_user2history(
    mapping : Dict[str, List[str]],
    train_size : int | float = None,
    test_size: int | float = None,
    shuffle : bool = False,
    rnd_seed : int = 42
    ) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """Splits every user's history from get_user2history_map() into a train and a test part.

    Parameters
    ----------
    mapping : Dict[str, List[str]]
        Map from username to list of movie_ids.

    train_size : int | float
        Forwarded to `train_test_split` as `train_size`.

    test_size : int | float
        Forwarded to `train_test_split` as `test_size`.

    shuffle : bool
        Whether to shuffle records.
        By default the order of rated movies is kept.

    rnd_seed : int
        Forwarded to `train_test_split` as `random_state`.

    Returns
    -------
    Tuple[Dict[str, List[str]], Dict[str, List[str]]]
        Train & test subsets as a tuple; both dicts share the keys of `mapping`.
    """
    train_map = {}
    test_map = {}

    for username, history in mapping.items():
        train_map[username], test_map[username] = train_test_split(
            history,
            train_size=train_size,
            test_size=test_size,
            shuffle=shuffle,
            random_state=rnd_seed,
        )

    return train_map, test_map

In [76]:
# generate history
user2hist = get_user2history_map(n=1000, min_ratings=100, max_ratings=100, force_n=False, rnd_seed=0)
display(user2hist.keys())

100%|██████████| 51/51 [00:00<00:00, 2660.73it/s]


dict_keys(['Shelby14', 'pouler', 'Rogero', 'moviechecker', 'Elissabet', 'YoZo8', '!Borec!', 'Kokutek', 'muskcz', 'Anomiska', 'Cortyx', 'EarlHickey', 'heroin88', 'weca', 'XI-XAO', 'Medwídek', 'IAMAD', 'Baki', 'enterama', 'savage3303', 'jurama525', 'cepice', 'hrabě_olaf', 'MarryW', 'cirkulace', 'Psychous', 'JeffScylla', 'Jano...', 'luukaas', 'štófy', 'Vinczent', 'Crytic', 'Skautka', 'maara666', 'Jerryy', 'Obivan', 'renegrido', 'Kulda24', 'Birter', 'PfefferSalz', 'kaktusacek13', 'saillor', 'Polnoch', 'Velislav', 'Bajda', 'Najt', 'Immortal17', 'Kakho-oto', 'upas', 'bond77', 'matmanX'])

In [78]:
import pickle

# with open('user2hist.pickle', 'wb') as f:
#     pickle.dump(user2hist, f, protocol=pickle.HIGHEST_PROTOCOL)

# with open('user2hist_0.8.pickle', 'rb') as f:
#     user2hist = pickle.load(f)

In [75]:
import multiprocessing as mp

# (username, jaccard score, history length) for every user with a non-zero score
results = []
resutls = results  # alias kept for the originally (misspelled) variable name

train, test = train_test_split_user2history(user2hist, 0.8)

# def foo(username: str):
#     recs = ensemble.recommend(train[username], n=25, return_full_info=True, rs_weights=[0.1, 0.1, 0.3, 0.8])
#     return jaccard_similarty(set(test[username]), set(recs.movie_id.tolist()))

# with mp.Pool(4) as p:
#     p.map(foo, train.keys())

for username in tqdm(user2hist.keys()):
    recommendations = ensemble.recommend(train[username], n=25, return_full_info=False, rs_weights=[0.1, 0.1, 0.3, 0.8])

    # Compare against THIS user's held-out movies. `set(test)` would be the set of
    # the dict's keys (usernames), which can never overlap with movie ids.
    held_out = set(test[username])
    score = jaccard_similarty(held_out, set(recommendations.movie_id.tolist()))
    if score != 0:
        print(f'{username}: {score}\t{len(user2hist[username])}')
        results.append((username, score, len(user2hist[username])))
    #display(recommendations)

100%|██████████| 1000/1000 [04:52<00:00,  3.41it/s]


---

### Examples

a) recommendations for movie Rivals about F1 race drivers:  
* solid recommendations: almost all movies are about main characters competing / rivalry (in some form)
* bad recommendations:
    * 295103-domek-z-karet - does not fit at all
    * Rivalové (14623-rivalove) - is ranked as 1st because of the title match unless we weight down the TfidfTitleRS

In [None]:
rivals = '301401-rivalove'

sport_movies = [rivals]

ensemble.recommend(sport_movies, n=15, rs_weights=[0.25, 0.1, 0.25, 0.5], return_full_info=True)

In [None]:
display(movie_db[movie_db.title == 'Mlčení jehňátek'])

---
Movie `2777-svedska-trojka` is not a good recommendation

In [None]:
lion_king = '6741-lvi-kral'
madagaskar = '117282-madagaskar'
over_the_hedge = '43170-za-plotem'

disney_movies = [lion_king, madagaskar, over_the_hedge]

# All weights are float literals: int entries risk being rejected by the ensemble's
# weight validation, which then silently falls back to the default weights.
ensemble.recommend(disney_movies, n=15, rs_weights=[1.0, 0.1, 1.0, 1.0], return_full_info=True)

In [None]:
ryan = '8652-zachrante-vojina-ryana'
bros = '70341-bratrstvo-neohrozenych'

war_movies = [ryan, bros]

# All weights are float literals: int entries risk being rejected by the ensemble's
# weight validation, which then silently falls back to the default weights.
ensemble.recommend(war_movies, n=15, rs_weights=[0.6, 0.1, 0.6, 1.0], return_full_info=True)

The Silence of the lambs
* looks good overall

In [None]:
lambs = '2356-mlceni-jehnatek'

# All weights are float literals: int entries risk being rejected by the ensemble's
# weight validation, which then silently falls back to the default weights.
ensemble.recommend([lambs], n=15, rs_weights=[0.5, 0.1, 0.35, 1.0], return_full_info=True)

### Problem: 4 completely distinct movies, but top recommendations are dominated by one of them

In [None]:
# All weights are float literals: int entries risk being rejected by the ensemble's
# weight validation, which then silently falls back to the default weights.
ensemble.recommend([lambs, rivals, bros, madagaskar], n=15, rs_weights=[0.5, 0.1, 0.0, 1.0], return_full_info=True)

## Problems & questions

* weights of algs in an ensemble are just guessed
* the way queries are created -> for TF-IDF we just create one big document from documents of all seen movies
* for movie Spiderman -> just spiderman movies nothing else, no discovery of new taste

---
# Evaluation Playground

In [None]:
# Users that appear exactly 50 times in `ratings`
ratings_per_user = ratings['username'].value_counts()
ratings_per_user[ratings_per_user == 50]

In [None]:
ratings[ratings.username == 'Mord122'].merge(movie_db, how='left', on='movie_id')

---
# Playground

In [None]:
df = pd.DataFrame({
    'movie_id': ['A', 'B', 'C'],
    'avg_stars': [5, 4.8, 4.6],
    'num_ratings': [10, 100, 1000]
})

M = 3.5#np.sum(df.avg_stars * df.num_ratings) / np.sum(df.num_ratings) 
C = 100#df.num_ratings.quantile(0.25)

def compute_bayes_avg(item_rating_avg, item_rating_count, m, c):
    """Average of the item's ratings blended with `c` extra ratings of value `m`."""
    weighted_total = item_rating_avg*item_rating_count + c*m
    effective_count = item_rating_count + c
    return weighted_total / effective_count

df['bayes_avg'] = compute_bayes_avg(df.avg_stars, df.num_ratings, M,C)
df

In [None]:
genre_popularity = movie_db.genres.explode().value_counts().sort_values(ascending=False)
genre_popularity

In [None]:
import wordcloud
import matplotlib.pyplot as plt 

# NOTE(review): the neighbouring cells read genres from `movie_db`; `movies` is not
# defined in the visible part of the notebook — confirm it exists on a fresh kernel.
item_count_list_genres = analyse_freq(movies['genres'])
# Split the (genre, count) pairs into two parallel lists.
genres, genre_counts = [list(t) for t in zip(*item_count_list_genres)]

wc = wordcloud.WordCloud(background_color='white')
wc.generate_from_frequencies(dict(zip(genres, genre_counts)))

plt.imshow(wc)

### LSI

In [None]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [None]:
document_list = movie_db.description.to_list()
titles = movie_db.title.to_list()

In [None]:
# NOTE(review): this reads `documents` (assigned in the "Build a Dictionary" cell),
# not the `document_list` created in the cell directly above.
docs_preprocessed = list(tqdm(map(GensimModelWrapper._document_to_tokens, documents), desc='Processing', total=len(documents)))

In [None]:
docs_preprocessed

### Additional Stopwords (stems)
There are too many to process manually; our Czech preprocessing is not good enough.  
**examples:**   
jak, abym, film?, však, kter, tak, svo, tom, svéh, měl, čím, the, zda, někd, ...


In [None]:
a = analyse_freq(docs_preprocessed)

In [None]:
a[::-1]

In [None]:
def prepare_corpus(doc_clean):
    """
    Input  : tokenized documents
    Purpose: build the term dictionary of the corpus and turn every document into its bag of words
    Output : term dictionary and document-term matrix (one bag of words per document)
    """
    # Every unique term of the corpus is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Express each document through the dictionary prepared above.
    doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))
    return dictionary, doc_term_matrix

In [None]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """Trains an LsiModel on `doc_clean`, prints its topics (`words` words each) and returns it."""
    term_dictionary, bows = prepare_corpus(doc_clean)
    lsamodel = LsiModel(bows, num_topics=number_of_topics, id2word=term_dictionary)  # train model
    topics = lsamodel.print_topics(num_topics=number_of_topics, num_words=words)
    print(topics)
    return lsamodel

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              doc_term_matrix : Gensim corpus (bag of words per document)
              doc_clean : List of tokenized input texts
              stop : Upper bound (exclusive) on the number of topics
              start : First number of topics to try
              step : Increment between tried numbers of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models, one per value in range(start, stop, step)
              coherence_values : Coherence values corresponding to the LSA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # Train with the loop's topic count; reading a module-level
        # `number_of_topics` here would make every model identical.
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean, start, stop, step):
    """Plots coherence against the number of topics in range(start, stop, step).

    Builds the corpus from `doc_clean`, obtains one coherence value per topic
    count from compute_coherence_values() and shows the resulting line plot.
    """
    dictionary,doc_term_matrix = prepare_corpus(doc_clean)
    _models, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                         stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    # One-element tuple (note the trailing comma): a bare parenthesised string
    # would be iterated character by character, labelling the line just "c".
    plt.legend(("coherence_values",), loc='best')
    plt.show()

In [None]:
number_of_topics = 100
# NOTE(review): `words` is assigned but not passed to plot_graph(), so changing it
# does not alter the plot produced by this cell.
words = 5

# Topic counts tried: range(20, 200, 15)
start,stop,step=20,200,15
plot_graph(docs_preprocessed,start,stop,step)
# 110 was optimal

In [None]:
number_of_topics = 100
# NOTE(review): `words` is assigned but not passed to plot_graph(), so changing it
# does not alter the plot produced by this cell.
words = 10

# Topic counts tried: range(20, 200, 15)
start,stop,step=20,200,15
plot_graph(docs_preprocessed,start,stop,step)
# 110 was optimal

In [None]:
number_of_topics = 100
# NOTE(review): `words` is assigned but not passed to plot_graph(), so changing it
# does not alter the plot produced by this cell.
words = 15

# Topic counts tried: range(20, 200, 15)
start,stop,step=20,200,15
plot_graph(docs_preprocessed,start,stop,step)
# 110 was optimal

In [None]:
number_of_topics = 100
# NOTE(review): `words` is assigned but not passed to plot_graph(), so changing it
# does not alter the plot produced by this cell.
words = 20

# Topic counts tried: range(20, 200, 15)
start,stop,step=20,200,15
plot_graph(docs_preprocessed,start,stop,step)
# 110 was optimal

In [None]:
number_of_topics = 100
# NOTE(review): `words` is assigned but not passed to plot_graph(), so changing it
# does not alter the plot produced by this cell.
words = 30

# Topic counts tried: range(20, 200, 15)
start,stop,step=20,200,15
plot_graph(docs_preprocessed,start,stop,step)
# 110 was optimal

In [None]:
#document_list,titles=load_data("","articles.txt")
#clean_text=preprocess_data(document_list)

#model=create_gensim_lsa_model(docs_preprocessed,number_of_topics,words)
model=create_gensim_lsa_model(docs_preprocessed,number_of_topics=35,words=10)



In [None]:
'o' in stop_words.get_stop_words('czech')