In [1]:
import collections
import random
from pathlib import Path
from typing import Any, Dict, Iterator, List, Tuple, Union

import numpy as np
import pandas as pd
import seaborn as sns

from recommend.utils import PROJ_ROOT

# Utility functions

In [2]:
def creators2list(creators : Dict[str, List[Tuple[Any, str]]], top_n_actors : int = 3) -> List[str]:
    """Returns a list of `top_n_actors`, director, and a composer.

    Parameters
    ----------
    creators : Dict[str, List[Tuple[Any, str]]]
        Mapping from a role name ('Hrají' = cast, 'Režie' = direction,
        'Hudba' = music) to a list of entries whose second element is
        the person's name.
    top_n_actors : int
        How many leading entries of the cast list to keep.

    Returns
    -------
    List[str]
        Names of the first `top_n_actors` actors, followed by the first
        director and the first composer when present. Missing or empty
        roles are skipped; a value that is not a dict (e.g. NaN for a movie
        without creator data) yields an empty list.
    """
    if not isinstance(creators, dict):
        return []

    cast = creators.get('Hrají') or []
    names = [person[1] for person in cast[:top_n_actors]]

    # Only the first listed director / composer is kept.
    for role in ('Režie', 'Hudba'):
        people = creators.get(role)
        if people:
            names.append(people[0][1])
    return names


def analyse_freq(list_of_lists : List[List[Any]]) -> List[Tuple[Any, int]]:
    """
    Counts occurrences of items across all the inner lists.

    Returns every distinct item as a tuple <item, count>, ordered from the
    most frequent to the least frequent one.
    """
    tally = collections.Counter()
    for group in list_of_lists:
        for element in group:
            tally[element] += 1
    return tally.most_common()


def get_value(df : pd.DataFrame, movie_id : str, col : str) -> Any:
    """Retrieves a value in a cell specified by `col` for movie with ID `movie_id`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a `movie_id` column.
    movie_id : str
        ID of the movie to look up.
    col : str
        Name of the column to read.

    Returns
    -------
    Any
        The value of `col` in the first row whose `movie_id` matches.

    Raises
    ------
    IndexError
        If no row has the given `movie_id`.
    """
    # The original compared against an undefined name (`m_id`) and raised NameError.
    return df.loc[df['movie_id'] == movie_id, col].values[0]

### Czech stemmer

In [3]:
#! /usr/bin/env python3.1
''' Czech stemmer
Copyright © 2010 Luís Gomes <luismsgomes@gmail.com>.

Ported from the Java implementation available at:
    http://members.unine.ch/jacques.savoy/clef/index.html

'''
import re
import sys



def cz_stem(word, aggressive=False):
    """Stem a single Czech word.

    Words that are not made purely of word characters, or whose casing is
    neither all-lowercase, Title-case nor ALL-UPPERCASE, are returned
    untouched. Otherwise the word is lowercased, passed through the case and
    possessive suffix removers (plus the comparative, diminutive,
    augmentative and derivational removers when `aggressive` is true) and
    the original casing style is restored on the result.
    """
    if re.match("^\\w+$", word) is None:
        return word
    if not (word.islower() or word.istitle() or word.isupper()):
        # mixed-case words are skipped
        return word

    steps = [_remove_case, _remove_possessives]
    if aggressive:
        steps += [_remove_comparative, _remove_diminutive,
                  _remove_augmentative, _remove_derivational]

    stem = word.lower()  # all pattern matching is done in lowercase
    for step in steps:
        stem = step(stem)

    if word.isupper():
        return stem.upper()
    return stem.title() if word.istitle() else stem

def _remove_case(word):
    if len(word) > 7 and word.endswith("atech"):
        return word[:-5]
    if len(word) > 6:
        if word.endswith("ětem"):
            return _palatalise(word[:-3])
        if word.endswith("atům"):
            return word[:-4]
    if len(word) > 5:
        if word[-3:] in {"ech", "ich", "ích", "ého", "ěmi", "emi", "ému",
                         "ete", "eti", "iho", "ího", "ími", "imu"}:
            return _palatalise(word[:-2])
        if word[-3:] in {"ách", "ata", "aty", "ých", "ama", "ami",
                         "ové", "ovi", "ými"}:
            return word[:-3]
    if len(word) > 4:
        if word.endswith("em"):
            return _palatalise(word[:-1])
        if word[-2:] in {"es", "ém", "ím"}:
            return _palatalise(word[:-2])
        if word[-2:] in {"ům", "at", "ám", "os", "us", "ým", "mi", "ou"}:
            return word[:-2]
    if len(word) > 3:
        if word[-1] in "eiíě":
            return _palatalise(word)
        if word[-1] in "uyůaoáéý":
            return word[:-1]
    return word

def _remove_possessives(word):
    if len(word) > 5:
        if word[-2:] in {"ov", "ův"}:
            return word[:-2]
        if word.endswith("in"):
            return _palatalise(word[:-1])
    return word

def _remove_comparative(word):
    if len(word) > 5:
        if word[-3:] in {"ejš", "ějš"}:
            return _palatalise(word[:-2])
    return word

def _remove_diminutive(word):
    if len(word) > 7 and word.endswith("oušek"):
        return word[:-5]
    if len(word) > 6:
        if word[-4:] in {"eček", "éček", "iček", "íček", "enek", "ének",
                         "inek", "ínek"}:
            return _palatalise(word[:-3])
        if word[-4:] in {"áček", "aček", "oček", "uček", "anek", "onek",
                         "unek", "ánek"}:
            return _palatalise(word[:-4])
    if len(word) > 5:
        if word[-3:] in {"ečk", "éčk", "ičk", "íčk", "enk", "énk",
                         "ink", "ínk"}:
            return _palatalise(word[:-3])
        if word[-3:] in {"áčk", "ačk", "očk", "učk", "ank", "onk",
                         "unk", "átk", "ánk", "ušk"}:
            return word[:-3]
    if len(word) > 4:
        if word[-2:] in {"ek", "ék", "ík", "ik"}:
            return _palatalise(word[:-1])
        if word[-2:] in {"ák", "ak", "ok", "uk"}:
            return word[:-1]
    if len(word) > 3 and word[-1] == "k":
        return word[:-1]
    return word

def _remove_augmentative(word):
    if len(word) > 6 and word.endswith("ajzn"):
        return word[:-4]
    if len(word) > 5 and word[-3:] in {"izn", "isk"}:
        return _palatalise(word[:-2])
    if len(word) > 4 and word.endswith("ák"):
        return word[:-2]
    return word

def _remove_derivational(word):
    if len(word) > 8 and word.endswith("obinec"):
        return word[:-6]
    if len(word) > 7:
        if word.endswith("ionář"):
            return _palatalise(word[:-4])
        if word[-5:] in {"ovisk", "ovstv", "ovišt", "ovník"}:
            return word[:-5]
    if len(word) > 6:
        if word[-4:] in {"ásek", "loun", "nost", "teln", "ovec", "ovík",
                         "ovtv", "ovin", "štin"}:
            return word[:-4]
        if word[-4:] in {"enic", "inec", "itel"}:
            return _palatalise(word[:-3])
    if len(word) > 5:
        if word.endswith("árn"):
            return word[:-3]
        if word[-3:] in {"ěnk", "ián", "ist", "isk", "išt", "itb", "írn"}:
            return _palatalise(word[:-2])
        if word[-3:] in {"och", "ost", "ovn", "oun", "out", "ouš",
                         "ušk", "kyn", "čan", "kář", "néř", "ník",
                         "ctv", "stv"}:
            return word[:-3]
    if len(word) > 4:
        if word[-2:] in {"áč", "ač", "án", "an", "ář", "as"}:
            return word[:-2]
        if word[-2:] in {"ec", "en", "ěn", "éř", "íř", "ic", "in", "ín",
                         "it", "iv"}:
            return _palatalise(word[:-1])
        if word[-2:] in {"ob", "ot", "ov", "oň", "ul", "yn", "čk", "čn",
                         "dl", "nk", "tv", "tk", "vk"}:
            return word[:-2]
    if len(word) > 3 and word[-1] in "cčklnt":
        return word[:-1]
    return word

def _palatalise(word):
    if word[-2:] in {"ci", "ce", "či", "če"}:
        return word[:-2] + "k"

    if word[-2:] in {"zi", "ze", "ži", "že"}:
        return word[:-2] + "h"

    if word[-3:] in {"čtě", "čti", "čtí"}:
        return word[:-3] + "ck"

    if word[-3:] in {"ště", "šti", "ští"}:
        return word[:-3] + "sk"
    return word[:-1]

# Command-line entry point of the original stemmer script. It is wrapped in a
# bare string literal, so none of it is executed inside the notebook.
"""
if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ("light", "aggressive"):
        sys.exit("usage: {} light|aggressive".format(sys.argv[0]))
    aggressive = sys.argv[1] == "aggressive"
    for line in sys.stdin:
        print(*[cz_stem(word, aggressive=aggressive)
                for word in line.split()])
"""
# NOTE(review): `pass` presumably keeps the string above from being echoed as the cell's output - confirm.
pass

# Load datasets

In [4]:
# Read the pickled movie metadata and user ratings from the project's data folder.
movies = pd.read_pickle(PROJ_ROOT / 'data' / 'movies.pkl')
ratings = pd.read_pickle(PROJ_ROOT / 'data' / 'ratings.pkl')

# Drop the columns that are not used and flatten the raw creators
# structure into a plain list of names.
movies = (
    movies
    .drop(columns=['kind', 'length', 'poster', 'foreign_titles'])
    .assign(creators=lambda frame: frame['creators'].apply(creators2list))
)
# The index is intentionally not reset here: doing so causes trouble with the later join.

display(movies.head(3))
ratings.head(3)

Unnamed: 0_level_0,title,description,genres,countries,year,creators
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
230421-houbicky,Houbičky,"Partička amerických teenagerů, která si vyrazi...","[Horor, Thriller]","[Irsko, Velká Británie, Dánsko]",2007,"[Lindsey Haun, Jack Huston, Max Kasch, Paddy B..."
10789-prvni-liga,První liga,V nejvyšší lize získávají hráči baseballu boha...,"[Komedie, Sportovní]",[USA],1989,"[Tom Berenger, Charlie Sheen, Corbin Bernsen, ..."
235032-yes-man,Yes Man,"Carl Allen je zatrpklý bankovní úředník, který...","[Komedie, Romantický]",[USA],2008,"[Jim Carrey, Zooey Deschanel, Bradley Cooper, ..."


Unnamed: 0,username,movie_id,stars,date,comment
0,kinghome,230421-houbicky,5.0,2011-11-13,Hodnocení některých šašků tady opravdu necháp...
1,SimonShot,230421-houbicky,5.0,2012-12-19,Tento snímek je zajímavý už jenom tím že se n...
2,blackend,230421-houbicky,5.0,2010-09-04,Pohoda a vzhledem k dobré atmosféře a nízkému...


In [5]:
# Mean star rating of each film.
avg_ratings = (
    ratings[['movie_id', 'stars']]
    .groupby(['movie_id'])
    .mean()
    .rename(columns={'stars': 'avg_stars'})
)

# Number of ratings each film received.
count_ratings = (
    ratings[['movie_id', 'stars']]
    .groupby('movie_id')
    .count()
    .rename(columns={'stars': 'num_ratings'})
)

# Attach both statistics to the movie table, order it from the least to the
# most rated film and turn the movie id back into a regular column.
movie_db = (
    movies
    .join(avg_ratings)
    .join(count_ratings)
    .sort_values(by='num_ratings')
    .reset_index()
)
movie_db

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
0,75613-hercule-poirot_498504-serie-9,Hercule Poirot,Malý belgický detektiv Hercule Poirot (David S...,"[Krimi, Drama, Mysteriózní, Thriller]",[Velká Británie],(1989–2013),"[David Suchet, Hugh Fraser, Philip Jackson, Ed...",5.000000,1
1,613789-obeti,Oběti,Na televizní obrazovku se vrací cyklus o lide...,[Drama],[Česko],(1999–2008),"[Zbyněk Fric, Karel Zima, Libor Žídek, Petr Sl...",5.000000,1
2,75613-hercule-poirot_498507-serie-12,Hercule Poirot,Malý belgický detektiv Hercule Poirot (David S...,"[Krimi, Drama, Mysteriózní, Thriller]",[Velká Británie],(1989–2013),"[David Suchet, Hugh Fraser, Philip Jackson, Ed...",4.500000,2
3,350930-krtek,Krtek,,"[Animovaný, Dobrodružný]","[Česko, Finsko]",2011,"[Zdeněk Miler, Wiliam Bukový]",4.500000,2
4,33863-bajaja,Bajaja,Jiří Trnka natočil v roce 1950 volně podle poh...,"[Animovaný, Loutkový, Pohádka]",[Československo],1950,"[Jiří Trnka, Václav Trojan]",4.285714,14
...,...,...,...,...,...,...,...,...,...
8669,232938-hobit-neocekavana-cesta,Hobit: Neočekávaná cesta,Film sleduje cestu hlavní postavy Bilbo Pytlík...,"[Dobrodružný, Fantasy]","[USA, Nový Zéland]",2012,"[Martin Freeman, Ian McKellen, Richard Armitag...",3.953150,3159
8670,227786-interstellar,Interstellar,Příběh se odehrává v nepříliš vzdálené budoucn...,"[Sci-Fi, Dobrodružný, Drama]","[USA, Velká Británie, Kanada]",2014,"[Matthew McConaughey, Anne Hathaway, Jessica C...",4.072779,3174
8671,223734-temny-rytir,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770
8672,254156-pocatek,Počátek,Dom Cobb (Leonardo DiCaprio) je velmi zkušený ...,"[Akční, Sci-Fi, Thriller, Mysteriózní, Dobrodr...","[USA, Velká Británie]",2010,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",4.368000,4000



# Recommendation Systems
---
## Base class

In [6]:
import abc

class RecSysBase(abc.ABC):
    """
    Recommendation System Base class.

    The system stores a database of available movies in `self.movies_df`.
    The helpers below expect that frame to have a `movie_id` column and
    (for `get_description_with_id`) a `description` column.
    """
    def __init__(self, movies_df : pd.DataFrame):
        self.movies_df = movies_df

    @abc.abstractmethod
    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> Union[List[str], pd.DataFrame]:
        """
        Generates recommendations based on user's history of rated movies.

        Parameters
        ----------
        user_history : List[str]
            A list of movie ids a user has rated so far.

        n : int
            The number of recommendations to return.

        return_df : bool
            Whether to return the recommended rows instead of only their ids.

        Returns
        -------
        Union[List[str], pd.DataFrame]
            By default returns a list of strings representing movie ids.
            If `return_df` is set to True then a pandas dataframe.
        """
        # TODO: do we need `user_history` as List?
        # use List if: some systems use the order of items (e.g. preference of recent movies)
        # use Set if: we don't care about the order at all
        pass

    def get_candidate_movies(self, user_history : List[str]) -> pd.DataFrame:
        """Returns the movies a user has not seen yet.

        Rows of `self.movies_df` whose `movie_id` appears in `user_history`
        are filtered out; ids that are not in the database are ignored.
        The original row order and index labels are preserved.
        """
        # A boolean mask on `movie_id` avoids the concat + stringify +
        # drop_duplicates round trip, which broke on repeated index labels.
        seen = self.movies_df['movie_id'].isin(user_history)
        return self.movies_df.loc[~seen]

    def _ids2df(self, user_history : List[str]) -> pd.DataFrame:
        """For a list of movie ids returns a dataframe compatible with `self.movies_df`"""
        mask = self.movies_df['movie_id'].isin(user_history)
        return self.movies_df.loc[mask]

    def get_description_with_id(self, movie_id : str) -> str:
        """Returns description for movie with provided ID.

        Raises
        ------
        IndexError
            If no movie with `movie_id` exists in the database.
        """
        matches = self.movies_df.loc[self.movies_df['movie_id'] == movie_id, 'description']
        if matches.empty:
            # Same exception type as the bare `.values[0]` lookup, with a readable message.
            raise IndexError(f'no movie with id {movie_id!r} in the database')
        return matches.values[0]
    

## Random recommendations

In [7]:
import random

class RandomRS(RecSysBase):
    """
    System that recommends movies randomly.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(RandomRS, self).__init__(movies_df)

    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> Union[List[str], pd.DataFrame]:
        """
        Recommends n random movies.
        Movies from `user_history` do not appear in the recommendations.

        When fewer than `n` unseen movies are available, all of them are
        returned (in random order) instead of raising.

        Returns
        -------
        Union[List[str], pd.DataFrame]
            A list of movie ids, or the sampled rows when `return_df` is True.
        """
        candidates = self.get_candidate_movies(user_history)
        # `sample` raises when asked for more rows than exist, so cap the request.
        recommendations = candidates.sample(min(n, len(candidates)))
        return recommendations if return_df else recommendations.movie_id.to_list()
    


In [8]:
# Smoke test: three random recommendations for a user with an empty history,
# returned as full rows. No seed is set, so the output differs between runs.
rnd_rs = RandomRS(movie_db)

rnd_rs.recommend([], 3, return_df=True)

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
1869,225944-vlci,Vlci,Dvě znepřátelené smečky vlkodlaků zápasí o mal...,"[Akční, Fantasy, Horor, Thriller]","[Kanada, USA, Německo]",2006,"[Jason Behr, Elias Koteas, Rhona Mitra, James ...",1.53125,96
2369,8624-1941,1941,"V roce 1941, šest dní po přepadení Pearl Harbo...","[Komedie, Válečný]",[USA],1979,"[Dan Aykroyd, Ned Beatty, John Belushi, Steven...",2.9,110
2027,236532-gabriel-andel-pomsty,Gabriel - Anděl pomsty,Dříve bylo v nebi hned sedm archandělů. Dříve....,"[Akční, Fantasy, Horor, Thriller]",[Austrálie],2007,"[Andy Whitfield, Dwaine Stevenson, Samantha No...",2.03,100


## Recommendation based on movie popularity

In [9]:

class PopularityRS(RecSysBase):
    """
    System recommends top n most popular movies a user has not rated yet.

    Popularity is the `num_ratings` column of the movie database.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(PopularityRS, self).__init__(movies_df)

    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> Union[List[str], pd.DataFrame]:
        """
        Recommends n most rated movies a user has not yet seen.

        Returns
        -------
        Union[List[str], pd.DataFrame]
            A list of movie ids, or the matching rows when `return_df` is
            True. Both are ordered by ascending `num_ratings`, i.e. the most
            rated movie comes last.
        """
        candidates = self.get_candidate_movies(user_history)
        # Movies without any rating have NaN `num_ratings`; sort them to the
        # front so `tail(n)` never picks them as "most popular".
        recommendations = candidates.sort_values(by='num_ratings', na_position='first').tail(n)
        # The original `movie_id[0]` was a label lookup that returned a single id (or raised KeyError).
        return recommendations if return_df else recommendations.movie_id.to_list()
    

In [10]:
# Demo: the three most rated movies for a user who has already rated Avatar.
history = ['228329-avatar']

rs = PopularityRS(movie_db)

rs.recommend(history, 3, return_df=True)

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
8670,227786-interstellar,Interstellar,Příběh se odehrává v nepříliš vzdálené budoucn...,"[Sci-Fi, Dobrodružný, Drama]","[USA, Velká Británie, Kanada]",2014,"[Matthew McConaughey, Anne Hathaway, Jessica C...",4.072779,3174
8671,223734-temny-rytir,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770
8672,254156-pocatek,Počátek,Dom Cobb (Leonardo DiCaprio) je velmi zkušený ...,"[Akční, Sci-Fi, Thriller, Mysteriózní, Dobrodr...","[USA, Velká Británie]",2010,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",4.368,4000


## TF-IDF Model

In [11]:
import stop_words
from nltk.tokenize import word_tokenize
from multiprocessing import get_context, Pool
from gensim.corpora import Dictionary
from gensim.matutils import cossim
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from gensim.utils import simple_preprocess
from tqdm import tqdm


class TfidfRS(RecSysBase):
    """
    System recommends unseen movies using TF-IDF

    Every movie is represented by its title and description. A query built
    from the descriptions of the user's recently rated movies is compared
    with all movies through a gensim similarity index.
    """
    def __init__(self, movies_df : pd.DataFrame):
        """Builds the dictionary, the TF-IDF model and the similarity index.

        The documents are processed by worker pools. The workers call the
        classmethods below, which read the dictionary / model from the class
        attributes `DICTIONARY` and `TFIDF_MODEL`; these are therefore set
        temporarily while the pools run and removed afterwards.
        """
        super(TfidfRS, self).__init__(movies_df)
        cls = self.__class__

        # Document == movie title & description (a missing description counts as empty).
        documents = list(
            movies_df.title.astype(str) + ' ' + movies_df.description.fillna('').astype(str)
        )

        try:
            # Build Dictionary
            with get_context('fork').Pool(None) as pool:
                doc_tokens = pool.imap(cls._document_to_tokens, documents)
                doc_tokens = tqdm(doc_tokens, desc='Building the dictionary', total=len(documents))
                self.dictionary = Dictionary(doc_tokens)
                cls.DICTIONARY = self.dictionary

            # Build TF-IDF model
            with get_context('fork').Pool(None) as pool:
                doc_bows = pool.imap(cls._document_to_bag_of_words, documents)
                doc_bows = tqdm(doc_bows, desc='Building the TF-IDF model', total=len(documents))
                self.tfidf_model = TfidfModel(doc_bows)
                cls.TFIDF_MODEL = self.tfidf_model

            # Build the index
            with get_context('fork').Pool(None) as pool:
                doc_vectors = pool.imap(cls._document_to_tfidf_vector, documents)
                doc_vectors = tqdm(doc_vectors, desc='Building the TF-IDF index', total=len(documents))
                self.index = SparseMatrixSimilarity(doc_vectors, num_docs=len(documents), num_terms=len(self.dictionary))
        finally:
            # Always drop the temporary class-level references, even when building fails.
            for attr in ('DICTIONARY', 'TFIDF_MODEL'):
                if attr in cls.__dict__:
                    delattr(cls, attr)

        # Position in the index -> movie id
        self.index_to_movie_id = dict(enumerate(self.movies_df.movie_id.to_list()))

    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> Iterator[Tuple[str, float]]:
        """Recommends unseen movies using tfidf vectors and cosine similarity.

        This is a generator: it lazily yields every unseen movie, best match
        first, and the caller decides how many to consume.

        TODO: Is it better to yield movies one by one or return a list / DF?
        TODO: solve because `n` and `return_df` are currently not used

        Parameters
        ----------
        user_history : List[str]
            A list of movie IDs

        Yields
        ------
        Tuple[str, float]
            Movie id and its similarity to the query, in descending order
            of similarity.

        Raises
        ------
        IndexError
            If one of the last 50 ids in `user_history` is not in the database.
        """
        # Take last k movies, join them into a single description and create a query.
        k = 50 # TODO: should this be a customizable parameter? or should it be tuned?
        last_k_movies = user_history[-k:]
        descriptions = (self.get_description_with_id(mid) for mid in last_k_movies)
        # Skip missing (non-string) descriptions instead of failing in `join`.
        query = ' '.join(text for text in descriptions if isinstance(text, str))

        # Use the instance's own dictionary and model directly. Publishing them
        # on the class from inside a generator leaked them whenever the caller
        # stopped iterating early and broke interleaved generators.
        query_bow = self.dictionary.doc2bow(self._document_to_tokens(query))
        query_vector = self.tfidf_model[query_bow]
        similarities = enumerate(self.index[query_vector])
        similarities = sorted(similarities, key=lambda item: item[1], reverse=True)

        seen = set(user_history)
        for idx, sim in similarities:
            movie_id = self.index_to_movie_id[idx]
            if movie_id not in seen:
                yield movie_id, sim

    @classmethod
    def _document_to_tokens(cls, document: str) -> List[str]:
        """
        Filters stop words and applies stemming on the given string.

        The text is split on whitespace, words found in the Czech stop-word
        list are dropped, surrounding punctuation is stripped from the rest
        and each remaining word is stemmed with `cz_stem`.

        Parameters
        ----------
        document : str
            A simple string containing text to preprocess.
        """
        # NOTE(review): stop words are matched case-sensitively and before the
        # punctuation is stripped. The previous (unused) NLTK tokenisation of
        # the lowercased text was removed; switching to it would change the vectors.
        czech_stop_words = set(stop_words.get_stop_words('czech'))  # built once per document
        filtered = [w.strip(",.:'?!()[]{}<>_") for w in document.split() if w not in czech_stop_words]
        return [cz_stem(word) for word in filtered]

    @classmethod
    def _document_to_bag_of_words(cls, document: str) -> List[Tuple[int, int]]:
        """Converts a document to a bag of words using the class-level `DICTIONARY`."""
        return cls.DICTIONARY.doc2bow(cls._document_to_tokens(document))

    @classmethod
    def _document_to_tfidf_vector(cls, document: str) -> List[Tuple[int, float]]:
        """Converts a document to a TF-IDF vector using the class-level `DICTIONARY` and `TFIDF_MODEL`."""
        return cls.TFIDF_MODEL[cls._document_to_bag_of_words(document)]

    @classmethod
    def _tokens_to_bag_of_words(cls, tokens: List[str]) -> List[Tuple[int, int]]:
        """Converts already tokenised text to a bag of words using the class-level `DICTIONARY`."""
        return cls.DICTIONARY.doc2bow(tokens)

    @classmethod
    def _bow_to_tfidf_vector(cls, bow: List[Tuple[int, int]]) -> List[Tuple[int, float]]:
        """Converts a bag of words to a TF-IDF vector using the class-level `TFIDF_MODEL`."""
        return cls.TFIDF_MODEL[bow]

In [12]:
# Build the TF-IDF recommender over the whole movie database. The constructor
# makes three passes over the documents (dictionary, model, index), one
# progress bar each.
rs_tfidf = TfidfRS(movie_db)

Building the dictionary: 100%|███████████████████████████████████████████████████████████████| 8674/8674 [00:05<00:00, 1629.15it/s]
Building the TF-IDF model: 100%|█████████████████████████████████████████████████████████████| 8674/8674 [00:06<00:00, 1314.13it/s]
Building the TF-IDF index: 100%|█████████████████████████████████████████████████████████████| 8674/8674 [00:08<00:00, 1047.15it/s]


In [13]:
history = ['1644-kmotr', '1645-kmotr-ii', '9499-matrix', '342231-lego-batman']

# `recommend` is a generator over all unseen movies, so stop manually after
# printing the 16 best matches (i runs from 0 to 15).
for i, (movie_id, sim)  in enumerate(rs_tfidf.recommend(history)):
    print(f'{i+1}. {movie_id} ({sim:.4f})')
    if i == 15: break

1. 1646-kmotr-iii (0.2877)
2. 1643-kmotr-1-2-3 (0.1815)
3. 392898-lego-r-batman-film (0.1460)
4. 9498-matrix-revolutions (0.1383)
5. 266796-liga-spravedlivych-krize-na-dvou-zemich (0.1189)
6. 223734-temny-rytir (0.1188)
7. 8254-spinave-ulice (0.1153)
8. 1069-batman (0.1152)
9. 18798-valachiho-svedectvi (0.1091)
10. 234816-liga-spravedlnosti (0.1056)
11. 6444-nelitostny-souboj (0.1037)
12. 60899-batman-vs-joker (0.1011)
13. 182290-marie-antoinetta (0.1002)
14. 234081-opravnene-vrazdy (0.0995)
15. 319515-batman-navrat-temneho-rytire-cast-2 (0.0981)
16. 8216-batman-navzdy (0.0970)


## LSA Model

In [None]:
#TODO

---
# Playground

In [None]:
# Build a tiny corpus for experiments: the last (up to) three rows each for
# ids containing 'matrix', titles containing 'Kmotr' and movies crediting
# Christopher Nolan.
df_matrix = movie_db[movie_db['movie_id'].str.contains('matrix')].tail(3)
df_kmotr = movie_db[movie_db['title'].str.contains('Kmotr')].tail(3)
# A row matches when {'Christopher Nolan'} is a subset of its creators list.
df_nolan = movie_db[movie_db.creators.map(set(['Christopher Nolan']).issubset)].tail(3)

test_df = pd.concat([df_matrix, df_kmotr, df_nolan])
test_df

In [None]:
# TF-IDF recommender trained only on the small playground corpus.
rs_test2 = TfidfRS(test_df)

In [None]:
# Rank the playground corpus against '1644-kmotr'.
# NOTE(review): this raises IndexError if that id is not part of `test_df` - confirm it is.
for movie, sim in rs_test2.recommend(['1644-kmotr']):
    print(f'{sim:.4f}: {movie}')

In [None]:
# Rank the playground corpus against '9499-matrix'.
# NOTE(review): this raises IndexError if that id is not part of `test_df` - confirm it is.
for movie, sim in rs_test2.recommend(['9499-matrix']):
    print(f'{sim:.4f}: {movie}')

In [None]:
#!pip install pyspark
#!pip install spark-nlp

In [None]:
import sparknlp
from sparknlp.pretrained import LemmatizerModel

# Load the pretrained model named "lemma" for language "cs"; it reads the
# "token" column and writes a "lemma" column.
# NOTE(review): no Spark session is started in this notebook before this call - confirm whether `sparknlp.start()` is needed first.
lemmatizer = LemmatizerModel.pretrained("lemma", "cs")\
                .setInputCols(["token"])\
                .setOutputCol("lemma")

In [None]:
# !pip install simplemma

In [None]:
import simplemma

# Experiment: lemmatise two related word forms first and stem the lemma
# afterwards, printing each stage.
# NOTE(review): `simplemma.load_data` presumably belongs to an older simplemma API - confirm against the installed version.
langdata = simplemma.load_data('cs')

for w in ['žebráci', 'žebral']:
    lemma = simplemma.lemmatize(w, langdata)
    stem = cz_stem(lemma)
    print(f'{w} -> {lemma} -> {stem}')

In [None]:
# Number of movies per genre: `explode` yields one row per (movie, genre)
# pair, which is then counted and ordered from the most to the least frequent genre.
genre_popularity = movie_db.genres.explode().value_counts().sort_values(ascending=False)
genre_popularity

In [None]:
import wordcloud
import matplotlib.pyplot as plt 

# Genre frequencies as <genre, count> tuples, most frequent first.
item_count_list_genres = analyse_freq(movies['genres'])
# Unzip the tuples into two parallel lists: genre names and their counts.
genres, genre_counts = [list(t) for t in zip(*item_count_list_genres)]

# Draw a word cloud in which each genre is weighted by its count.
wc = wordcloud.WordCloud(background_color='white')
wc.generate_from_frequencies(dict(zip(genres, genre_counts)))

plt.imshow(wc)