In [1]:
from recsys.models.retrieval import DeepRetriever
from recsys.external_datasets import Movielens_1M
from recsys.datasets import InteractionsDataset
from pytorch_lightning import Trainer
from recsys.models.scoring import NCF
import pandas as pd
import torch
import numpy as np
from recsys.layers import retrieve_nearest_neighbors

In [2]:
data = Movielens_1M()
ratings, users, movies = data.load()

# Data processing

In [3]:
#Preprocess users
users['gender'], uniques = pd.factorize(users['gender'])
users['occupation'], uniques = pd.factorize(users['occupation'])
users['zip'], uniques = pd.factorize(users['zip'])
##Set category dtype
users['gender'] = users.gender.astype('category')
users['occupation'] = users.occupation.astype('category')
users['zip'] = users.zip.astype('category')

#Preprocess movies, 
##categories to index
movies['title'], uniques = pd.factorize(movies['title'])
movies['genres'], uniques = pd.factorize(movies['genres'])
##Set category dtype
movies['title'] = movies.title.astype('category')
movies['genres'] = movies.genres.astype('category')

#Make all ratings an implicit interaction
ratings["rating"] = ratings["rating"].apply(lambda x: 1 if x>=3 else 0)

In [4]:
dataset = InteractionsDataset(ratings, users, movies, item_id="movie_id", interaction_id="rating", sample_negatives=3)

# Retrieval Step

In [5]:
retriever = DeepRetriever(dataset.data_schema)

In [6]:
retriever.fit(dataset=dataset, num_epochs=1)

Epoch: 0/1, Loss: 24.83: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.19s/it]


In [7]:
item_alias, item_representations = retriever.generate_item_representations(dataset)
user_alias, user_representations = retriever.generate_user_representations(dataset)

  items_features = torch.tensor(items_features)


In [8]:
nearest_neighbors_idx = retrieve_nearest_neighbors(candidates=item_representations, query=user_representations[0]).tolist()
retrieved_items = item_alias[nearest_neighbors_idx]
retrieved_items

tensor([  46, 3244, 3650, 1602, 3273, 3035, 3612, 1432, 1289, 1496,  376, 2983,
         805, 3714])

# Scoring step

In [9]:
scorer = NCF(dataset.data_schema)

In [10]:
scorer.fit(dataset=dataset, num_epochs=1)

Epoch: 0/1, Loss: 0.38: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:29<00:00, 29.72s/it]


In [11]:
user = user_alias[:10].tolist()
items = retrieved_items[:10].tolist()

user_features = dataset.get_user_features(user)
item_features = dataset.get_item_features(items)

pairs = torch.tensor([user, items]).T
user_features = torch.tensor(user_features)
items_features = torch.tensor(item_features)

scorer.predict(pairs, user_features, items_features)

tensor([[-1.0200],
        [-0.7300],
        [-4.9884],
        [-3.0428],
        [-0.4006],
        [ 0.0354],
        [-2.2178],
        [-1.5980],
        [-0.5185],
        [-2.1092]], grad_fn=<AddmmBackward0>)

In [12]:


import numpy as np

a = np.array([1,2,3,np.nan, 5])
b = np.array([1,2,3,np.nan, 5, np.nan])

In [14]:
set(b)

{nan, 1.0, 2.0, 3.0, nan, 5.0}

In [15]:
b[~np.isnan(b)]

array([1., 2., 3., 5.])

In [16]:
d = {1:1, 2:2}
np.vectorize(d.get)(a)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [17]:
# filter out nans 

TypeError: set expected at most 1 argument, got 2