In [52]:
import numpy as np
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import recmetrics

from tqdm.notebook import tqdm

# load data

In [2]:
moviesid=json.load(open('data/movies.json'))
movies = pd.read_csv("data/ml-latest/movies.csv").set_index('movieId')
columns=movies.loc[moviesid]['title'].to_list()

In [3]:
pivot = np.load('data/pivot.npy')
_, test_pivot, _, _ = train_test_split(pivot, range(len(pivot)), random_state=23)
del(pivot)

# load model

In [4]:
cols = json.load(open("model/columns.json"))
corrdf = pd.DataFrame(np.load("model/corr.npy"), columns=cols, index=cols)
corrdf=corrdf[~corrdf.index.duplicated(keep='first')]

In [5]:
def get_recommendation(movie_name, user_rating):
    sim = corrdf[movie_name]*(user_rating-2.5)
    return sim.sort_values(ascending=False)

get_recommendation("Avengers: Infinity War - Part I (2018)", 4)

Avengers: Infinity War - Part I (2018)     1.500000
Avengers: Infinity War - Part II (2019)    1.168674
Thor: Ragnarok (2017)                      1.154720
Guardians of the Galaxy 2 (2017)           1.016170
Untitled Spider-Man Reboot (2017)          1.015677
                                             ...   
Dead Man Walking (1995)                   -0.157351
Shakespeare in Love (1998)                -0.160158
Get Shorty (1995)                         -0.168311
Four Weddings and a Funeral (1994)        -0.176492
Dances with Wolves (1990)                 -0.190805
Name: Avengers: Infinity War - Part I (2018), Length: 7613, dtype: float64

In [13]:
def get_multirec(tuples):
    return pd.concat([get_recommendation(movie_name, user_rating) for movie_name, user_rating in tuples if user_rating!= 0], axis=1).sum(1).sort_values(ascending=False).index.to_list()

get_multirec([("Avengers: Infinity War - Part I (2018)", 4), ('Untitled Spider-Man Reboot (2017)', 1)])

['Avengers: Infinity War - Part I (2018)',
 'Avengers: Infinity War - Part II (2019)',
 'Deadpool 2 (2018)',
 'Joker (2019)',
 'Thor: Ragnarok (2017)',
 'Interstellar (2014)',
 'Wolf of Wall Street, The (2013)',
 'Parasite (2019)',
 'The Martian (2015)',
 'Inception (2010)',
 'Blade Runner 2049 (2017)',
 'Knives Out (2019)',
 'Green Book (2018)',
 'Dune (2021)',
 '1917 (2019)',
 'Django Unchained (2012)',
 'Ex Machina (2015)',
 'Arrival (2016)',
 'Big Short, The (2015)',
 'Dark Knight Rises, The (2012)',
 'Ford v. Ferrari (2019)',
 'Shutter Island (2010)',
 'Guardians of the Galaxy (2014)',
 'Tenet (2020)',
 'Dark Knight, The (2008)',
 'Inglourious Basterds (2009)',
 'A Quiet Place (2018)',
 'Once Upon a Time in Hollywood (2019)',
 'Edge of Tomorrow (2014)',
 'Mad Max: Fury Road (2015)',
 'Whiplash (2014)',
 'Deadpool (2016)',
 'Avengers, The (2012)',
 'John Wick: Chapter 3 – Parabellum (2019)',
 'The Gentlemen (2020)',
 'The Imitation Game (2014)',
 'Bohemian Rhapsody (2018)',
 'The H

# evaluation

In [43]:
testdf=pd.DataFrame(test_pivot, columns=columns)

In [44]:
X_cols, y_cols, _, _ = train_test_split(columns, columns, test_size=0.5)

In [50]:
X=testdf.iloc[:100][X_cols].to_dict('records')
y=testdf.iloc[:100][y_cols].to_dict('records')

  X=testdf.iloc[:100][X_cols].to_dict('records')


In [53]:
def predict(records):
    return [get_multirec(tuple(record.items())) for record in tqdm(records)]

def prep(records):
    return [sorted(record.keys(), key=record.get, reverse=True) for record in tqdm(records)]

h = predict(X)
truth = prep(y)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [57]:
recmetrics.recommender_recall(truth, h)

0.5001999999999999

In [55]:
recmetrics.mark(truth, h, k=100)

Help on module recmetrics.metrics in recmetrics:

NAME
    recmetrics.metrics

FUNCTIONS
    catalog_coverage(predicted: List[list], catalog: list, k: int) -> float
        Computes the catalog coverage for k lists of recommendations
        Parameters
        ----------
        predicted : a list of lists
            Ordered predictions
            example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
        catalog: list
            A list of all unique items in the training data
            example: ['A', 'B', 'C', 'X', 'Y', Z]
        k: integer
            The number of observed recommendation lists
            which randomly choosed in our offline setup
        Returns
        ----------
        catalog_coverage:
            The catalog coverage of the recommendations as a percent
            rounded to 2 decimal places
        ----------    
        Metric Defintion:
        Ge, M., Delgado-Battenfeld, C., & Jannach, D. (2010, September).
        Beyond accuracy: evaluating recommender s