In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import collections
import random
from tqdm import tqdm


from typing import Any, Dict, List, Tuple, Union
from pathlib import Path

# from recommend.utils import PROJ_ROOT
from pathlib import Path
PROJ_ROOT =  "../../../"

# Utility functions

In [2]:
def creators2list(creators : str, top_n_actors : int = 3) -> List[str]:
    """Returns a list of `top_n_actors`, director, and a composer."""
    result = []
    
    if 'Hrají' in creators:
        result = creators['Hrají'][:top_n_actors]
        result = list(map(lambda x: x[1], result))

    if 'Režie' in creators:
        result += [creators['Režie'][0][1]]
    if 'Hudba' in creators:
        result += [creators['Hudba'][0][1]]
    return result


def analyse_freq(list_of_lists : List[List[Any]]) -> List[Tuple[Any, int]]:
    """
    Counts occurences of items in a list of lists
    and returns them in descending order as tuples <item, count>."""

    flat_list = [item for sublist in list_of_lists for item in sublist]
    counter = collections.Counter(flat_list)
    return counter.most_common(len(counter))


def get_value(df : pd.DataFrame, movie_id : str, col : str) -> Any:
    """Retrieves a value in a cell specified by `col` for movie with ID `movie_id`"""
    return df.loc[df['movie_id'] == m_id][col].values[0]

### Czech stemmer

In [3]:
#! /usr/bin/env python3.1
''' Czech stemmer
Copyright © 2010 Luís Gomes <luismsgomes@gmail.com>.

Ported from the Java implementation available at:
    http://members.unine.ch/jacques.savoy/clef/index.html

'''
import re
import sys



def cz_stem(word, aggressive=False):
    if not re.match("^\\w+$", word):
        return word
    if not word.islower() and not word.istitle() and not word.isupper():
        #print("warning: skipping word with mixed case: {}".format(word),
              #file=sys.stderr)
        return word
    s = word.lower() # all our pattern matching is done in lowercase
    s = _remove_case(s)
    s = _remove_possessives(s)
    if aggressive:
        s = _remove_comparative(s)
        s = _remove_diminutive(s)
        s = _remove_augmentative(s)
        s = _remove_derivational(s)
    if word.isupper():
        return s.upper()
    if word.istitle():
        return s.title()
    return s

def _remove_case(word):
    if len(word) > 7 and word.endswith("atech"):
        return word[:-5]
    if len(word) > 6:
        if word.endswith("ětem"):
            return _palatalise(word[:-3])
        if word.endswith("atům"):
            return word[:-4]
    if len(word) > 5:
        if word[-3:] in {"ech", "ich", "ích", "ého", "ěmi", "emi", "ému",
                         "ete", "eti", "iho", "ího", "ími", "imu"}:
            return _palatalise(word[:-2])
        if word[-3:] in {"ách", "ata", "aty", "ých", "ama", "ami",
                         "ové", "ovi", "ými"}:
            return word[:-3]
    if len(word) > 4:
        if word.endswith("em"):
            return _palatalise(word[:-1])
        if word[-2:] in {"es", "ém", "ím"}:
            return _palatalise(word[:-2])
        if word[-2:] in {"ům", "at", "ám", "os", "us", "ým", "mi", "ou"}:
            return word[:-2]
    if len(word) > 3:
        if word[-1] in "eiíě":
            return _palatalise(word)
        if word[-1] in "uyůaoáéý":
            return word[:-1]
    return word

def _remove_possessives(word):
    if len(word) > 5:
        if word[-2:] in {"ov", "ův"}:
            return word[:-2]
        if word.endswith("in"):
            return _palatalise(word[:-1])
    return word

def _remove_comparative(word):
    if len(word) > 5:
        if word[-3:] in {"ejš", "ějš"}:
            return _palatalise(word[:-2])
    return word

def _remove_diminutive(word):
    if len(word) > 7 and word.endswith("oušek"):
        return word[:-5]
    if len(word) > 6:
        if word[-4:] in {"eček", "éček", "iček", "íček", "enek", "ének",
                         "inek", "ínek"}:
            return _palatalise(word[:-3])
        if word[-4:] in {"áček", "aček", "oček", "uček", "anek", "onek",
                         "unek", "ánek"}:
            return _palatalise(word[:-4])
    if len(word) > 5:
        if word[-3:] in {"ečk", "éčk", "ičk", "íčk", "enk", "énk",
                         "ink", "ínk"}:
            return _palatalise(word[:-3])
        if word[-3:] in {"áčk", "ačk", "očk", "učk", "ank", "onk",
                         "unk", "átk", "ánk", "ušk"}:
            return word[:-3]
    if len(word) > 4:
        if word[-2:] in {"ek", "ék", "ík", "ik"}:
            return _palatalise(word[:-1])
        if word[-2:] in {"ák", "ak", "ok", "uk"}:
            return word[:-1]
    if len(word) > 3 and word[-1] == "k":
        return word[:-1]
    return word

def _remove_augmentative(word):
    if len(word) > 6 and word.endswith("ajzn"):
        return word[:-4]
    if len(word) > 5 and word[-3:] in {"izn", "isk"}:
        return _palatalise(word[:-2])
    if len(word) > 4 and word.endswith("ák"):
        return word[:-2]
    return word

def _remove_derivational(word):
    if len(word) > 8 and word.endswith("obinec"):
        return word[:-6]
    if len(word) > 7:
        if word.endswith("ionář"):
            return _palatalise(word[:-4])
        if word[-5:] in {"ovisk", "ovstv", "ovišt", "ovník"}:
            return word[:-5]
    if len(word) > 6:
        if word[-4:] in {"ásek", "loun", "nost", "teln", "ovec", "ovík",
                         "ovtv", "ovin", "štin"}:
            return word[:-4]
        if word[-4:] in {"enic", "inec", "itel"}:
            return _palatalise(word[:-3])
    if len(word) > 5:
        if word.endswith("árn"):
            return word[:-3]
        if word[-3:] in {"ěnk", "ián", "ist", "isk", "išt", "itb", "írn"}:
            return _palatalise(word[:-2])
        if word[-3:] in {"och", "ost", "ovn", "oun", "out", "ouš",
                         "ušk", "kyn", "čan", "kář", "néř", "ník",
                         "ctv", "stv"}:
            return word[:-3]
    if len(word) > 4:
        if word[-2:] in {"áč", "ač", "án", "an", "ář", "as"}:
            return word[:-2]
        if word[-2:] in {"ec", "en", "ěn", "éř", "íř", "ic", "in", "ín",
                         "it", "iv"}:
            return _palatalise(word[:-1])
        if word[-2:] in {"ob", "ot", "ov", "oň", "ul", "yn", "čk", "čn",
                         "dl", "nk", "tv", "tk", "vk"}:
            return word[:-2]
    if len(word) > 3 and word[-1] in "cčklnt":
        return word[:-1]
    return word

def _palatalise(word):
    if word[-2:] in {"ci", "ce", "či", "če"}:
        return word[:-2] + "k"

    if word[-2:] in {"zi", "ze", "ži", "že"}:
        return word[:-2] + "h"

    if word[-3:] in {"čtě", "čti", "čtí"}:
        return word[:-3] + "ck"

    if word[-3:] in {"ště", "šti", "ští"}:
        return word[:-3] + "sk"
    return word[:-1]

"""
if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ("light", "aggressive"):
        sys.exit("usage: {} light|aggressive".format(sys.argv[0]))
    aggressive = sys.argv[1] == "aggressive"
    for line in sys.stdin:
        print(*[cz_stem(word, aggressive=aggressive)
                for word in line.split()])
"""
pass

# Load datasets

In [4]:
movies = pd.read_pickle(PROJ_ROOT + 'data/movies.pkl')
ratings = pd.read_pickle(PROJ_ROOT + 'data/ratings.pkl')


# Drop irrelevant columns
movies = movies.drop(columns=['kind', 'length', 'poster', 'foreign_titles'])

# Convert creators column to list of creators
movies['creators'] = movies['creators'].apply(creators2list)
#movies.reset_index(inplace=True) # causes trouble with join

display(movies.head(3))
ratings.head(3)

Unnamed: 0_level_0,title,description,genres,countries,year,creators
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
230421-houbicky,Houbičky,"Partička amerických teenagerů, která si vyrazi...","[Horor, Thriller]","[Irsko, Velká Británie, Dánsko]",2007,"[Lindsey Haun, Jack Huston, Max Kasch, Paddy B..."
10789-prvni-liga,První liga,V nejvyšší lize získávají hráči baseballu boha...,"[Komedie, Sportovní]",[USA],1989,"[Tom Berenger, Charlie Sheen, Corbin Bernsen, ..."
235032-yes-man,Yes Man,"Carl Allen je zatrpklý bankovní úředník, který...","[Komedie, Romantický]",[USA],2008,"[Jim Carrey, Zooey Deschanel, Bradley Cooper, ..."


Unnamed: 0,username,movie_id,stars,date,comment
0,kinghome,230421-houbicky,5.0,2011-11-13,Hodnocení některých šašků tady opravdu necháp...
1,SimonShot,230421-houbicky,5.0,2012-12-19,Tento snímek je zajímavý už jenom tím že se n...
2,blackend,230421-houbicky,5.0,2010-09-04,Pohoda a vzhledem k dobré atmosféře a nízkému...


In [5]:
# get mean rating of each film 
avg_ratings = ratings[['movie_id', 'stars']].groupby(['movie_id']).mean()
avg_ratings.rename(columns={'stars': 'avg_stars'}, inplace=True)

# get rating count of each film
count_ratings = ratings[['movie_id', 'stars']].groupby('movie_id').count()
count_ratings.rename(columns={'stars': 'num_ratings'}, inplace=True)
count_ratings

# join with the movies database
movie_db = movies.join(avg_ratings).join(count_ratings).sort_values(by='num_ratings')
movie_db = movie_db.reset_index()
movie_db

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
0,75613-hercule-poirot_498504-serie-9,Hercule Poirot,Malý belgický detektiv Hercule Poirot (David S...,"[Krimi, Drama, Mysteriózní, Thriller]",[Velká Británie],(1989–2013),"[David Suchet, Hugh Fraser, Philip Jackson, Ed...",5.000000,1
1,613789-obeti,Oběti,Na televizní obrazovku se vrací cyklus o lide...,[Drama],[Česko],(1999–2008),"[Zbyněk Fric, Karel Zima, Libor Žídek, Petr Sl...",5.000000,1
2,75613-hercule-poirot_498507-serie-12,Hercule Poirot,Malý belgický detektiv Hercule Poirot (David S...,"[Krimi, Drama, Mysteriózní, Thriller]",[Velká Británie],(1989–2013),"[David Suchet, Hugh Fraser, Philip Jackson, Ed...",4.500000,2
3,350930-krtek,Krtek,,"[Animovaný, Dobrodružný]","[Česko, Finsko]",2011,"[Zdeněk Miler, Wiliam Bukový]",4.500000,2
4,33863-bajaja,Bajaja,Jiří Trnka natočil v roce 1950 volně podle poh...,"[Animovaný, Loutkový, Pohádka]",[Československo],1950,"[Jiří Trnka, Václav Trojan]",4.285714,14
...,...,...,...,...,...,...,...,...,...
8669,232938-hobit-neocekavana-cesta,Hobit: Neočekávaná cesta,Film sleduje cestu hlavní postavy Bilbo Pytlík...,"[Dobrodružný, Fantasy]","[USA, Nový Zéland]",2012,"[Martin Freeman, Ian McKellen, Richard Armitag...",3.953150,3159
8670,227786-interstellar,Interstellar,Příběh se odehrává v nepříliš vzdálené budoucn...,"[Sci-Fi, Dobrodružný, Drama]","[USA, Velká Británie, Kanada]",2014,"[Matthew McConaughey, Anne Hathaway, Jessica C...",4.072779,3174
8671,223734-temny-rytir,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770
8672,254156-pocatek,Počátek,Dom Cobb (Leonardo DiCaprio) je velmi zkušený ...,"[Akční, Sci-Fi, Thriller, Mysteriózní, Dobrodr...","[USA, Velká Británie]",2010,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",4.368000,4000



# Recommendation Systems
---
## Base class

In [6]:
import abc

class RecSysBase(abc.ABC):
    """
    Recommendation System Base class.
    
    The system should store a database of available movies.
    
    """
    def __init__(self, movies_df : pd.DataFrame):
        self.movies_df = movies_df
    
    @property
    def name(self):
        return self.__class__.__name__

    @abc.abstractmethod
    def recommend(self, user_history : List[str], n : int = None) -> pd.DataFrame:
        """
        Generates recommendations based on user's history of rated movies.
        
        Parameters
        ----------
        user_history : List[str]
            A list of movie ids a user has rated so far.
            
        n : int
            The number of recommendations to return.
            Default value None returns all movies from database.
        
        Returns
        -------
        pd.DataFrame
            movie_id ~ movie ID
            score_<class_name> ~ score of each movie
        """
        ...

    def get_candidate_movies(self, user_history : List[str]) -> pd.DataFrame:
        """Returns a list of candidates by filtering movies a user has already seen."""
        df = pd.concat([self.movies_df, self._ids2df(user_history, preserve_order=False)])
        # convert to str because drop_duplicates does not work on dfs containing structures
        return df.loc[df.astype(str).drop_duplicates(keep=False).index]

    def _ids2df(self, movie_ids : List[str], preserve_order : bool = True) -> pd.DataFrame:
        """For a list of movie ids returns a dataframe compatible with `self.movies_df`"""        
        if not preserve_order or len(movie_ids) == 0:
            # faster variant but sorted as in self.movies_df 
            mask = self.movies_df['movie_id'].isin(movie_ids)
            return self.movies_df.loc[mask]

        df_list = []
        for mid in movie_ids:
            df_list.append(self.movies_df[self.movies_df['movie_id'] == mid])
        return pd.concat(df_list)


## Random recommendations

In [7]:
import random

class RandomRS(RecSysBase):
    """
    System that recommends movies randomly.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(RandomRS, self).__init__(movies_df)

    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> List[str]:
        """
        Recommends n random movies.
        Movies from `user_history` do not appear in the recommendations.
        """
        recommendations = self.get_candidate_movies(user_history).sample(n)
        return recommendations if return_df else recommendations.movie_id
    


In [8]:
rnd_rs = RandomRS(movie_db)

rnd_rs.recommend([], 3, return_df=True)

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
8361,182363-million-dollar-baby,Million Dollar Baby,Mladá žena Maggie Fitzgeraldová osloví trenéra...,"[Sportovní, Drama]",[USA],2004,"[Clint Eastwood, Hilary Swank, Morgan Freeman,...",4.499584,1201
4564,2435-rostaci,Rošťáci,Parta dětí na cestě za pokladem pirátů. Dobrod...,"[Dobrodružný, Rodinný, Komedie]",[USA],1985,"[Sean Astin, Josh Brolin, Jeff Cohen, Richard ...",3.75,200
4021,244311-the-belko-experiment,Experiment Belko,Běžný den v kanceláři se promění v děsivý boj ...,"[Akční, Horor, Thriller]","[USA, Kolumbie]",2016,"[Tony Goldwyn, Michael Rooker, David Dastmalch...",3.245614,171


## Recommendation based on movie popularity

In [9]:

class PopularityRS(RecSysBase):
    """
    System recommends top n most popular movies a user has not rated yet.
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(PopularityRS, self).__init__(movies_df)

    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> List[str]:
        """
        Recommends n most rated movies a user has not yet seen.
        """
        recommendations = self.get_candidate_movies(user_history).sort_values(by='num_ratings').tail(n)
        return recommendations if return_df else recommendations.movie_id[0]
    

In [10]:
history = ['228329-avatar']

rs = PopularityRS(movie_db)

rs.recommend(history, 3, return_df=True)

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
8670,227786-interstellar,Interstellar,Příběh se odehrává v nepříliš vzdálené budoucn...,"[Sci-Fi, Dobrodružný, Drama]","[USA, Velká Británie, Kanada]",2014,"[Matthew McConaughey, Anne Hathaway, Jessica C...",4.072779,3174
8671,223734-temny-rytir,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770
8672,254156-pocatek,Počátek,Dom Cobb (Leonardo DiCaprio) je velmi zkušený ...,"[Akční, Sci-Fi, Thriller, Mysteriózní, Dobrodr...","[USA, Velká Británie]",2010,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",4.368,4000


## TF-IDF Model

In [11]:
import stop_words
from nltk.tokenize import word_tokenize
from multiprocessing import get_context, Pool
from gensim.corpora import Dictionary
from gensim.matutils import cossim
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from gensim.utils import simple_preprocess
from tqdm import tqdm


class TfidfRS(RecSysBase):
    """
    System recommends unseen movies using TF-IDF
    """
    def __init__(self, movies_df : pd.DataFrame):
        super(TfidfRS, self).__init__(movies_df)
        
        # Document == movie title & description
        documents = list(movies_df.title.astype(str) + ' ' + movies_df.description)
        
        # Build Dictionary
        with get_context('fork').Pool(None) as pool:
            doc_tokens = pool.imap(self.__class__._document_to_tokens, documents)
            doc_tokens = tqdm(doc_tokens, desc='Building the dictionary', total=len(documents))
            self.dictionary = Dictionary(doc_tokens)
            self.__class__.DICTIONARY = self.dictionary
        
        # Build TF-IDF model
        with get_context('fork').Pool(None) as pool:
            #doc_bows = pool.imap(self.__class__._tokens_to_bag_of_words, self.__class__.doc_tokens)
            doc_bows = pool.imap(self.__class__._document_to_bag_of_words, documents)
            doc_bows = tqdm(doc_bows, desc='Building the TF-IDF model', total=len(documents))
            self.tfidf_model = TfidfModel(doc_bows)
            self.__class__.TFIDF_MODEL = self.tfidf_model

        # Build the index
        with get_context('fork').Pool(None) as pool:    
            #doc_vectors = pool.imap(self.__class__._bow_to_tfidf_vector, self.doc_bows)
            doc_vectors = pool.imap(self.__class__._document_to_tfidf_vector, documents)
            doc_vectors = tqdm(doc_vectors, desc='Building the TF-IDF index', total=len(documents))
            self.index = SparseMatrixSimilarity(doc_vectors, num_docs=len(documents), num_terms=len(self.dictionary))
            
        del self.__class__.DICTIONARY
        del self.__class__.TFIDF_MODEL 

        self.index_to_movie_id = dict(enumerate(self.movies_df.movie_id.to_list()))

    
    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> List[str]:
        """Recommends unseen movies using tfidf vectors and cosine similarity.
        
        TODO: Is it better to yield movies one by one or return a list / DF?
        TODO: solve because `n` and `return_df` are currently not used
        
        Parameters
        ----------
        user_history : List[str]
            A list of movie IDs
        """
        # Take last k movies, join them into a single description and create a query.
        k = 50 # TODO: should this be a customizable parameter? or should it be tuned?
        last_k_movies = user_history[-k:]
        query = ' '.join([self.get_description_with_id(mid) for mid in last_k_movies])
        
        self.__class__.DICTIONARY = self.dictionary
        self.__class__.TFIDF_MODEL = self.tfidf_model

        query_vector = self.__class__._document_to_tfidf_vector(query)
        similarities = enumerate(self.index[query_vector])
        similarities = sorted(similarities, key=lambda item: item[1], reverse=True)

        for idx, sim in similarities:
            movie_id = self.index_to_movie_id[idx]
            if movie_id not in user_history:
                yield movie_id, sim
        
        del self.__class__.DICTIONARY
        del self.__class__.TFIDF_MODEL

    @classmethod
    def _document_to_tokens(cls, document: str) -> List[str]:
        """
        Filters stop words and applies stemming on the given string.

        Parameters
        ----------
        text : str
            A simple string containing text to preprocess.
        """
        tokens = word_tokenize(document.lower(), language='czech')
        filtered = [w.strip(",.:'?!()[]{}<>_") for w in document.split() if w not in set(stop_words.get_stop_words('czech'))]
        res = [cz_stem(word) for word in filtered]
        return res
    
    @classmethod
    def _document_to_bag_of_words(cls, document: str) -> List[Tuple[int, int]]:
        return cls.DICTIONARY.doc2bow(cls._document_to_tokens(document))
    
    @classmethod
    def _document_to_tfidf_vector(cls, document: str) -> List[Tuple[int, float]]:
        return cls.TFIDF_MODEL[cls._document_to_bag_of_words(document)]

    @classmethod
    def _tokens_to_bag_of_words(cls, tokens: List[str]) -> List[Tuple[int, int]]:
        return cls.DICTIONARY.doc2bow(tokens)
    
    @classmethod
    def _bow_to_tfidf_vector(cls, bow: List[Tuple[int, int]]) -> List[Tuple[int, float]]:
        return cls.TFIDF_MODEL[bow]

In [12]:
rs_tfidf = TfidfRS(movie_db)

Building the dictionary: 100%|██████████| 8674/8674 [00:06<00:00, 1253.31it/s]
Building the TF-IDF model: 100%|██████████| 8674/8674 [00:05<00:00, 1484.43it/s]
Building the TF-IDF index: 100%|██████████| 8674/8674 [00:07<00:00, 1087.89it/s]


In [13]:
history = ['1644-kmotr', '1645-kmotr-ii', '9499-matrix', '342231-lego-batman']

for i, (movie_id, sim)  in enumerate(rs_tfidf.recommend(history)):
    print(f'{i+1}. {movie_id} ({sim:.4f})')
    if i == 15: break

AttributeError: 'TfidfRS' object has no attribute 'get_description_with_id'

## LSA Model

In [None]:
#TODO

## Embeedding approaches

### Headline embedding
generate headlines and put it to the root_dir/data/headlines.csv file

In [None]:
import sys
sys.path
sys.path.append('../../../../../Summarization') # Adam's meta repo

from transformers import GPT2LMHeadModel
# from utils import add_special_tokens, generate_one_summary_fast

In [None]:
# load model
model_path = "../../../../../Summarization/models_ext/noconotr3in1_tohead_eos_2021-10-18-11_26_58/checkpoint-181005"  # Adam's meta repo

model = GPT2LMHeadModel.from_pretrained(model_path)

# put model into eval mode and on device
model.eval()
device = 'cuda' # 'cpu' alternatively
model.to(device)

# load and set tokenizer
tokenizer_path = "../../../../../gpt2czech/tokenizer/hf/model50257_LBF/"  # Adam's meta repo
tokenizer = add_special_tokens(tokenizer_path)
tokenizer.model_max_length = 1024

In [None]:
%%time
# for each description get a headline
hdf = movie_db[["movie_id", "description"]]
headlines = []
for i in range(len(movie_db["description"]):
    input_seq = hdf.iloc[i]["description"]
    headlines.append(generate_one_summary_fast(input_seq, tokenizer, model, top_k=50, top_p=0.5, device=device, eos_stopping=True))

hdf = hdf.head(len(headlines))[["movie_id"]]
hdf["headline"] = headlines
hdf.to_csv("../../../data/headlines.csv")

generate embeddings to the headlines using Czert cased B

In [None]:
from transformers import AutoTokenizer, AutoModelForPreTraining
import torch

In [None]:
tokenizer = AutoTokenizer.from_pretrained("UWB-AIR/Czert-B-base-cased")
model = AutoModelForPreTraining.from_pretrained("UWB-AIR/Czert-B-base-cased")

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
None

In [None]:
df_headlines = pd.read_csv("../../../data/headlines.csv")
df_headlines.head(4)
df_headlines.drop(columns=["Unnamed: 0.1", "Unnamed: 0", "embedding"], inplace=True)

In [None]:
%%time
def generate_sentence_embedding(sentence, model, tokenizer):
    # convert one headline to BERT input and generate embeddings
    marked_text = "[CLS] " + str(sentence) + " [SEP]"

    # Tokenize our sentence with the BERT tokenizer.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the 22 tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)

    # Convert to pytorh and cut the tensor to max len: 512 
    tokens_tensor = torch.tensor([indexed_tokens])[:, :512]
    segments_tensors = torch.tensor([segments_ids])[:, :512]
    
    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors, output_hidden_states=True)
        hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    
    return sentence_embedding

In [None]:
%%time
sent_embed = generate_sentence_embedding(df_headlines.headline.iloc[5], model, tokenizer)
sent_embed

In [None]:
df_headlines.loc[df_headlines.movie_id.str.contains("kmotr")]

In [227]:
# %%time
# for each film in headlines generate embeddings and save to headlines.csv
embeds = []
for h in tqdm(df_headlines.headline):
    embeds.append(generate_sentence_embedding(h, model, tokenizer).tolist())
df_headlines["embedding"] = embeds
df_headlines.to_csv("../../../data/headlines_test.csv")


  0%|          | 0/8674 [00:00<?, ?it/s][A
  0%|          | 1/8674 [00:00<19:07,  7.56it/s][A
  0%|          | 3/8674 [00:00<17:52,  8.09it/s][A
  0%|          | 5/8674 [00:00<17:02,  8.48it/s][A
  0%|          | 6/8674 [00:00<17:38,  8.19it/s][A
  0%|          | 7/8674 [00:00<18:06,  7.98it/s][A
  0%|          | 8/8674 [00:00<18:18,  7.89it/s][A
  0%|          | 9/8674 [00:01<19:34,  7.38it/s][A
  0%|          | 10/8674 [00:01<19:28,  7.42it/s][A
  0%|          | 11/8674 [00:01<36:30,  3.95it/s][A
  0%|          | 12/8674 [00:01<31:18,  4.61it/s][A
  0%|          | 13/8674 [00:02<28:31,  5.06it/s][A
  0%|          | 14/8674 [00:02<24:36,  5.86it/s][A
  0%|          | 15/8674 [00:02<23:52,  6.04it/s][A
  0%|          | 16/8674 [00:02<22:20,  6.46it/s][A
  0%|          | 17/8674 [00:02<21:58,  6.56it/s][A
  0%|          | 18/8674 [00:02<22:25,  6.43it/s][A
  0%|          | 19/8674 [00:02<21:31,  6.70it/s][A
  0%|          | 20/8674 [00:02<19:47,  7.29it/s][A
  0%|   

KeyboardInterrupt: 

In [236]:
import ast

class EmbedResSys(RecSysBase):
    """
    System recommends unseen movies using embeddings
    
    csv_embed_path ... string path to a dataframe mapping movie_id to embedding
    """
    def __init__(self, movies_df : pd.DataFrame, csv_embed_path : str = None):        
        self.movies_df = movies_df
        if csv_embed_path is None:
#             super(EmbedResSys, self).__init__(movies_df)
            self.device = "cuda"
            self.prepare_model_tokenizer()           
            self.df_embed = self.generate_embeddings()
        else:
            self.df_embed = pd.read_csv(csv_embed_path)
            self.df_embed.embedding = self.df_embed.embedding.apply(ast.literal_eval)
            
        # pick those rows of the df_embed that correspond to the movies_df
        self.df_embed = pd.merge(movies_df, self.df_embed, how='inner')[["movie_id", "embedding"]]
    
        # target matrix of all embeddings for getting the similarity 
        self.target = torch.tensor(list(self.df_embed.embedding))
        
        self.index_to_movie_id = dict(enumerate(movies_df.movie_id.to_list()))
    
    
    def recommend(self, user_history : List[str], n : int = 10, return_df : bool = False) -> List[str]:
        """Recommends unseen movies using embeddings from headlines.

        Parameters
        ----------
        user_history : List[str]
            A list of movie IDs

        n : int
            Recommends n movies, if set to an integer.
            Default value None returns all the movies.
        """
        # Take last k movies, join them into a single description and create a query.
        k = 50 # TODO: should this be a customizable parameter? or should it be tuned?
        last_k_movies = user_history[-k:]
        # mean of the embeddings of th last k movies
        query = torch.mean(torch.tensor([self.df_embed.loc[self.df_embed.movie_id == m]["embedding"].iloc[0] for m in user_history]), dim=0)
        
        similarities = list(enumerate(self.get_similarities(query)))
        similarities = sorted(similarities, key=lambda item: item[1].item(), reverse=True)

        result_ids, scores = [], []
        for idx, sim in similarities:
            movie_id = self.index_to_movie_id[idx]
            if movie_id not in user_history:
                result_ids.append(movie_id)
                scores.append(sim.item())
            
            if len(result_ids) == n:
                break

        return pd.DataFrame({
            'movie_id': result_ids,
            f'score_{self.name}': scores
        })
    
    
    def generate_embeddings(self, save_dir=f"../../../data"):
        df_embed = self.generate_descriptors()
        
        embeds = []
        for h in tqdm(df_embed.descriptor):
            embeds.append(self.generate_one_embedding(h))
        df_embed["embedding"] = embeds
        
        if save_dir:
            df_embed.to_csv(save_dir+f"/{self.name}_embeddings.csv")
        
        return df_embed
    
    
    def generate_one_embedding(self, sentence):
        # convert one descriptor to BERT input and generate embeddings
        marked_text = "[CLS] " + str(sentence) + " [SEP]"

        # Tokenize our sentence with the BERT tokenizer.
        tokenized_text = self.tokenizer.tokenize(marked_text)

        # Map the token strings to their vocabulary indeces.
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        # Mark each of the 22 tokens as belonging to sentence "1".
        segments_ids = [1] * len(tokenized_text)

        # Convert to pytorh and cut the tensor to max len: 512 
        tokens_tensor = torch.tensor([indexed_tokens])[:, :512].to(self.device)
        segments_tensors = torch.tensor([segments_ids])[:, :512].to(self.device)

        # Run the text through BERT, and collect all of the hidden states produced
        # from all 12 layers. 
        with torch.no_grad():
            outputs = self.model(tokens_tensor, segments_tensors, output_hidden_states=True)
            hidden_states = outputs[2]

        # Concatenate the tensors for all layers. We use `stack` here to
        # create a new dimension in the tensor.
        token_embeddings = torch.stack(hidden_states, dim=0)

        # Remove dimension 1, the "batches".
        token_embeddings = torch.squeeze(token_embeddings, dim=1)

        # Swap dimensions 0 and 1.
        token_embeddings = token_embeddings.permute(1,0,2)

        # `token_vecs` is a tensor with shape [22 x 768]
        token_vecs = hidden_states[-2][0]

        # Calculate the average of all 22 token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)

        return sentence_embedding.tolist()

    
    def get_similarities(self, query):
        return torch.nn.CosineSimilarity(dim=-1)(query, self.target)
    
    
    def prepare_model_tokenizer(self):
        self.model = AutoModelForPreTraining.from_pretrained("UWB-AIR/Czert-B-base-cased")
        self.tokenizer = AutoTokenizer.from_pretrained("UWB-AIR/Czert-B-base-cased")
        device = 'cuda' # 'cpu' alternatively
        self.model.to(self.device)
        self.model.eval()
    
    # conerts string representation of list to a torch tensor
    def string_to_torch(self, string):
        return torch.tensor([float(x) for x in string[1:-1].split(",")])

    
    def batch(self, iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]
    
    @abc.abstractmethod
    def generate_descriptors(self):
        pass

In [1]:
from tqdm import tqdm
import sys
sys.path
# sys.path.append('../../../../../Summarization')
# from utils import add_special_tokens, generate_one_summary_fast

# from utils import add_special_tokens, generate_one_summary_fast       #TODO
from transformers import AutoTokenizer, AutoModelForPreTraining
import torch

class HeadlineResSys(EmbedResSys):
    """
    System recommends unseen movies using embeddings
    
    csv_embed_path ... string path to a dataframe mapping movie_id to embedding
    """
    def __init__(self, movies_df : pd.DataFrame, csv_embed_path : str = None):  
     
        super(HeadlineResSys, self).__init__(movies_df, csv_embed_path=csv_embed_path)        

        
    def generate_descriptors(self):
        # load model
        model = AutoModelForCausalLM.from_pretrained("MU-NLPC/CzeGPT-2")

        # put model into eval mode and on device
        model.eval()
        device = 'cuda' # 'cpu' alternatively
        model.to(device)

        # load and set tokenizer
        tokenizer = AutoTokenizer.from_pretrained("MU-NLPC/CzeGPT-2")
        special_tokens = {'sep_token':'<|sep|>'}
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.model_max_length = 1024
        
        # for each description get a descriptor
        descriptors = []
        for i in tqdm(range(len(self.movies_df["description"]))):
            input_seq = movies_df.iloc[i]["description"]
            descriptors.append(generate_one_summary_fast(input_seq, tokenizer, model, top_k=50, top_p=0.5, device=device, eos_stopping=True))

        hdf = movies_df.head(len(descriptors))[["movie_id"]]
        hdf["descriptor"] = descriptors
        return hdf

NameError: name 'EmbedResSys' is not defined

### First sentence embedding

This recommender takes the name of the movie and the first sentence, concatenates it and creates the embedding.

In [238]:
import torch
from nltk.tokenize import sent_tokenize

class FirstSentResSys(EmbedResSys):
    """
    System recommends unseen movies using embeddings
    
    csv_embed_path ... string path to a dataframe mapping movie_id to embedding
    """
    def __init__(self, movies_df : pd.DataFrame, csv_embed_path : str = None):  
     
        super(FirstSentResSys, self).__init__(movies_df, csv_embed_path=csv_embed_path)        

        
    def generate_descriptors(self):        
        # for each description get a descriptor
        first_sent = self.movies_df.description.apply(lambda x: " ".join(sent_tokenize(x)[:1]))
        descriptors = list(self.movies_df.title.astype(str) + ' ' + first_sent)

        hdf = self.movies_df[["movie_id"]]
        hdf["descriptor"] = descriptors
        return hdf

In [240]:
%%time
rcmnd = FirstSentResSys(movie_db, csv_embed_path="../../../data/FirstSentResSys_embeddings.csv")

CPU times: user 22.5 s, sys: 230 ms, total: 22.8 s
Wall time: 22.8 s


In [None]:
rcmnd = HeadlineResSys(movie_db, "../../../data/HeadlineResSys_embeddings.csv")

In [221]:
user_history = ["308348-pribeh-kmotra", "1645-kmotr-ii"]

In [241]:
rcmnd.recommend(user_history, 6)

Unnamed: 0,movie_id,score_FirstSentResSys
0,12392-magnum-force,0.950929
1,7093-podezreni,0.949556
2,5317-cintamani-podvodnik,0.947838
3,5395-mechanicky-pomeranc,0.945152
4,271458-zeme-bez-zakona,0.943781
5,5914-tenkrat-v-americe,0.943566


In [199]:
frst_df = pd.read_csv("../../../data/FirstSentResSys_embeddings.csv")

In [209]:
%%time
target = torch.tensor(list(frst_df.embedding.apply(ast.literal_eval)))

CPU times: user 22.8 s, sys: 168 ms, total: 22.9 s
Wall time: 22.9 s


In [210]:
target

tensor([[ 0.2351,  0.1156,  0.6456,  ..., -0.1213, -0.3023, -0.1100],
        [ 0.2682,  0.1230,  0.1604,  ..., -0.3904, -0.5052,  0.1072],
        [ 0.2351,  0.1156,  0.6456,  ..., -0.1213, -0.3023, -0.1100],
        ...,
        [ 0.4401, -0.0586, -0.1574,  ...,  0.0682, -0.3331, -0.2376],
        [ 0.2497,  0.0128, -0.1463,  ..., -0.2329, -0.4552, -0.1454],
        [ 0.1440,  0.1955, -0.6020,  ..., -0.0032, -0.5883, -0.1310]])

In [235]:
%%time
[movie_db.loc[movie_db.movie_id == m]["movie_id"].iloc[0] for m in user_history]
?list(pd.merge(movie_db, pd.DataFrame({"movie_id": user_history}), how='inner')[["movie_id"]]["movie_id"])

Object `list(pd.merge(movie_db, pd.DataFrame({"movie_id": user_history}), how='inner')[["movie_id"]]["movie_id"])` not found.
CPU times: user 7.08 ms, sys: 327 µs, total: 7.4 ms
Wall time: 6.41 ms


In [233]:
# try to generate with BERT using batches
tokenizer = AutoTokenizer.from_pretrained("UWB-AIR/Czert-B-base-cased")
model = AutoModelForPreTraining.from_pretrained("UWB-AIR/Czert-B-base-cased")
model.to("cuda")
model.eval()
None

In [168]:
%%time
device = "cuda"
model.to(device)
tokens_tensors = torch.stack([torch.randint(0,10000, (512,))] *1).to(device)
print(tokens_tensors.shape)
segments_tensors = torch.tensor([512*[1]]*1).to(device)
print(segments_tensors.shape)
with torch.no_grad():
    outputs = model(tokens_tensors, segments_tensors, output_hidden_states=True)
    hidden_states = outputs[2]

torch.Size([1, 512])
torch.Size([1, 512])
CPU times: user 82.9 ms, sys: 9.13 ms, total: 92 ms
Wall time: 86.7 ms


In [165]:
hidden_states[0].shape

torch.Size([32, 512, 768])

In [111]:
    def generate_one_embedding(self, sentence):
        # convert one descriptor to BERT input and generate embeddings
        marked_text = "[CLS] " + str(sentence) + " [SEP]"

        # Tokenize our sentence with the BERT tokenizer.
        tokenized_text = self.tokenizer.tokenize(marked_text)

        # Map the token strings to their vocabulary indeces.
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        # Mark each of the 22 tokens as belonging to sentence "1".
        segments_ids = [1] * len(tokenized_text)

        # Convert to pytorh and cut the tensor to max len: 512 
        tokens_tensor = torch.tensor([indexed_tokens])[:, :512]
        segments_tensors = torch.tensor([segments_ids])[:, :512]

        # Run the text through BERT, and collect all of the hidden states produced
        # from all 12 layers. 
        with torch.no_grad():
            outputs = self.model(tokens_tensor, segments_tensors, output_hidden_states=True)
            hidden_states = outputs[2]

        # Concatenate the tensors for all layers. We use `stack` here to
        # create a new dimension in the tensor.
        token_embeddings = torch.stack(hidden_states, dim=0)

        # Remove dimension 1, the "batches".
        token_embeddings = torch.squeeze(token_embeddings, dim=1)

        # Swap dimensions 0 and 1.
        token_embeddings = token_embeddings.permute(1,0,2)

        # `token_vecs` is a tensor with shape [22 x 768]
        token_vecs = hidden_states[-2][0]

        # Calculate the average of all 22 token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)

        return sentence_embedding.tolist()

Unnamed: 0,movie_id
0,75613-hercule-poirot_498504-serie-9
1,613789-obeti
2,75613-hercule-poirot_498507-serie-12
3,350930-krtek
4,33863-bajaja
...,...
8669,232938-hobit-neocekavana-cesta
8670,227786-interstellar
8671,223734-temny-rytir
8672,254156-pocatek


---
# Playground

In [108]:
documents

0       Hercule Poirot Malý belgický detektiv Hercule ...
1       Oběti Na televizní obrazovku se vrací cyklus  ...
2       Hercule Poirot Malý belgický detektiv Hercule ...
3                                                  Krtek 
4       Bajaja Jiří Trnka natočil v roce 1950 volně po...
                              ...                        
8669    Hobit: Neočekávaná cesta Film sleduje cestu hl...
8670    Interstellar Příběh se odehrává v nepříliš vzd...
8671    Temný rytíř Další Batmanovo dobrodružství začí...
8672    Počátek Dom Cobb (Leonardo DiCaprio) je velmi ...
8673    Avatar Avatar před námi otevírá neuvěřitelný s...
Length: 8674, dtype: object

In [193]:
df_matrix = movie_db[movie_db['movie_id'].str.contains('matrix')].tail(3)
df_kmotr = movie_db[movie_db['title'].str.contains('Kmotr')].tail(3)
df_nolan = movie_db[movie_db.creators.map(set(['Christopher Nolan']).issubset)].tail(3)

test_df = pd.concat([df_matrix, df_kmotr, df_nolan])
test_df

Unnamed: 0,movie_id,title,description,genres,countries,year,creators,avg_stars,num_ratings
8018,9498-matrix-revolutions,Matrix Revolutions,Ve výbušné závěrečné kapitole trilogie Matrix ...,"[Akční, Sci-Fi]","[USA, Austrálie]",2003,"[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",3.435927,874
8093,9497-matrix-reloaded,Matrix Reloaded,Druhé pokračování trilogie nás opět zavede do...,"[Akční, Sci-Fi]","[USA, Austrálie]",2003,"[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",3.713519,932
8628,9499-matrix,The Matrix,"Za vším hledej Matrix. Zdál se vám někdy sen, ...","[Akční, Sci-Fi]",[USA],1999,"[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",4.685115,2096
7675,1646-kmotr-iii,Kmotr III,Člověk financí a politiky. Al Pacino v hlavní ...,"[Drama, Krimi]",[USA],1990,"[Al Pacino, Diane Keaton, Talia Shire, Francis...",4.309144,689
8083,1645-kmotr-ii,Kmotr II,Jsme větší než U. S. Steel. Al Pacino a Robert...,"[Drama, Krimi]",[USA],1974,"[Al Pacino, Robert Duvall, Diane Keaton, Franc...",4.490811,925
8614,1644-kmotr,Kmotr,Kmotr je příběhem newyorské mafiánské rodiny C...,"[Drama, Krimi]",[USA],1972,"[Marlon Brando, Al Pacino, James Caan, Francis...",4.594678,1954
8670,227786-interstellar,Interstellar,Příběh se odehrává v nepříliš vzdálené budoucn...,"[Sci-Fi, Dobrodružný, Drama]","[USA, Velká Británie, Kanada]",2014,"[Matthew McConaughey, Anne Hathaway, Jessica C...",4.072779,3174
8671,223734-temny-rytir,Temný rytíř,Další Batmanovo dobrodružství začíná. Jeho pro...,"[Akční, Drama, Krimi, Thriller]","[USA, Velká Británie]",2008,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",4.559151,3770
8672,254156-pocatek,Počátek,Dom Cobb (Leonardo DiCaprio) je velmi zkušený ...,"[Akční, Sci-Fi, Thriller, Mysteriózní, Dobrodr...","[USA, Velká Británie]",2010,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",4.368,4000


In [262]:
rs_test2 = HeadlineResSys(test_df, "../../../data/headlines_list.csv")

In [263]:
for movie, sim in rs_test2.recommend(['1644-kmotr']):
    print(f'{sim:.4f}: {movie}')

KeyError: 8614

In [166]:
for movie, sim in rs_test2.recommend(['9499-matrix']):
    print(f'{sim:.4f}: {movie}')

0.1377: 9498-matrix-revolutions
0.0734: 9497-matrix-reloaded
0.0241: 1646-kmotr-iii
0.0179: 1645-kmotr-ii
0.0156: 254156-pocatek
0.0144: 223734-temny-rytir
0.0138: 227786-interstellar
0.0126: 1644-kmotr


In [167]:
#!pip install pyspark
#!pip install spark-nlp

In [None]:
import sparknlp
from sparknlp.pretrained import LemmatizerModel

lemmatizer = LemmatizerModel.pretrained("lemma", "cs")\
                .setInputCols(["token"])\
                .setOutputCol("lemma")

In [None]:
# !pip install simplemma

In [None]:
import simplemma

langdata = simplemma.load_data('cs')

for w in ['žebráci', 'žebral']:
    lemma = simplemma.lemmatize(w, langdata)
    stem = cz_stem(lemma)
    print(f'{w} -> {lemma} -> {stem}')

In [None]:
genre_popularity = movie_db.genres.explode().value_counts().sort_values(ascending=False)
genre_popularity

In [None]:
import wordcloud
import matplotlib.pyplot as plt 

item_count_list_genres = analyse_freq(movies['genres'])
genres, genre_counts = [list(t) for t in zip(*item_count_list_genres)]

wc = wordcloud.WordCloud(background_color='white')
wc.generate_from_frequencies(dict(zip(genres, genre_counts)))

plt.imshow(wc)

In [250]:
df_headlines2 = pd.read_csv("../../../data/headlines_list.csv")

In [251]:
df_headlines2

Unnamed: 0.1,Unnamed: 0,movie_id,headline,embedding
0,0,75613-hercule-poirot_498504-serie-9,Scotland Yard - nový detektiv?,"[-0.5625672340393066, -0.47168660163879395, 0...."
1,1,613789-obeti,Česká televize,"[0.1335097700357437, 0.41292598843574524, 0.61..."
2,2,75613-hercule-poirot_498507-serie-12,Scotland Yard v novém kabátě,"[-0.7068173885345459, -0.2617079019546509, 0.3..."
3,3,350930-krtek,že jsem se narodil,"[0.46434155106544495, 0.07884159684181213, 0.0..."
4,4,33863-bajaja,Česká pohádka Bajaja se vrátila do Prahy,"[0.06880209594964981, 0.19066403806209564, 0.7..."
...,...,...,...,...
8669,8669,232938-hobit-neocekavana-cesta,Easter eggs: Bilbo Pytlík,"[-0.050285954028367996, 0.13156619668006897, 0..."
8670,8670,227786-interstellar,Ulice 1169-1708: Červí díra,"[0.585819661617279, 0.2834097743034363, 0.4016..."
8671,8671,223734-temny-rytir,Ordinace v růžové zahradě 39: Poslední kapka,"[0.2733287811279297, -0.38519376516342163, 1.0..."
8672,8672,254156-pocatek,Ukradl myšlenky a teď se mu nabízí možnost vyk...,"[0.0461638942360878, 0.14224864542484283, 0.08..."


In [248]:

a = "[-3.2247e-01, -1.6037e-01, -2.7747e-02,  2.2188e-01,  2.2052e-01,\n        -3.9928e-01, -3.4213e-01, -4.7073e-01,  3.7822e-01, -5.2162e-01,\n         2.2698e-02, -2.6662e-01,  9.6309e-02, -7.1487e-01, -4.7563e-01,\n         1.8311e-01,  1.3808e-02,  6.4929e-01,  5.8477e-02,  1.0346e-01,\n        -6.8578e-01,  6.7290e-01, -5.1883e-01, -1.9852e-01,  2.9400e-02,\n         4.0206e-01,  9.8511e-02, -2.7132e-01,  2.0048e-01, -7.0647e-02,\n         5.6653e-01,  5.1893e-01, -2.5024e-01,  4.2888e-01, -5.9884e-03,\n        -3.1353e-01,  7.6915e-01,  7.7918e-01,  1.9785e-01, -6.7552e-01,\n         4.6639e-02,  1.0574e+00, -9.5063e-01, -1.4573e-01, -1.6618e-01,\n        -7.8475e-02, -1.4015e-01, -2.4147e+00, -5.7294e-01,  9.1124e-01,\n        -4.8365e-01, -3.2236e-01, -2.8448e-01, -1.1636e-02, -9.7421e-01,\n        -5.9065e-01,  1.6998e-01, -6.0986e-01, -1.5445e-01, -2.2779e-01,\n         4.7322e-01, -2.0147e-01,  5.8252e-01, -6.5007e-02, -2.8424e-01,\n        -3.0399e-01, -1.9859e-01]"
[float(x) for x in a[1:-1].split(",")]

['-3.2247e-01',
 ' -1.6037e-01',
 ' -2.7747e-02',
 '  2.2188e-01',
 '  2.2052e-01',
 '\n        -3.9928e-01',
 ' -3.4213e-01',
 ' -4.7073e-01',
 '  3.7822e-01',
 ' -5.2162e-01',
 '\n         2.2698e-02',
 ' -2.6662e-01',
 '  9.6309e-02',
 ' -7.1487e-01',
 ' -4.7563e-01',
 '\n         1.8311e-01',
 '  1.3808e-02',
 '  6.4929e-01',
 '  5.8477e-02',
 '  1.0346e-01',
 '\n        -6.8578e-01',
 '  6.7290e-01',
 ' -5.1883e-01',
 ' -1.9852e-01',
 '  2.9400e-02',
 '\n         4.0206e-01',
 '  9.8511e-02',
 ' -2.7132e-01',
 '  2.0048e-01',
 ' -7.0647e-02',
 '\n         5.6653e-01',
 '  5.1893e-01',
 ' -2.5024e-01',
 '  4.2888e-01',
 ' -5.9884e-03',
 '\n        -3.1353e-01',
 '  7.6915e-01',
 '  7.7918e-01',
 '  1.9785e-01',
 ' -6.7552e-01',
 '\n         4.6639e-02',
 '  1.0574e+00',
 ' -9.5063e-01',
 ' -1.4573e-01',
 ' -1.6618e-01',
 '\n        -7.8475e-02',
 ' -1.4015e-01',
 ' -2.4147e+00',
 ' -5.7294e-01',
 '  9.1124e-01',
 '\n        -4.8365e-01',
 ' -3.2236e-01',
 ' -2.8448e-01',
 ' -1.1636