In [11]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import re
import numpy as np
import tensorflow_recommenders as tfrs
import tensorflow as tf

In [2]:
# Tokenizer and encoder weights are pulled from the same LaBSE checkpoint;
# the movies table (movieId, title, genres) is read from the local CSV.
labse_checkpoint = "sentence-transformers/LaBSE"
tokenizer = AutoTokenizer.from_pretrained(labse_checkpoint)
model = AutoModel.from_pretrained(labse_checkpoint)
data = pd.read_csv("ml-25m/movies.csv")

Some weights of the model checkpoint at sentence-transformers/LaBSE were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def remove_pars(x):
    """Return ``str(x)`` with every ``(`` and ``)`` character deleted.

    The argument is stringified first, so non-string values are accepted.
    """
    return re.sub('[()]', "", str(x))

# Parenthesis-free title for every row of the movies table, in row order.
titles = list(map(remove_pars, data['title']))

def remove_pipes(x):
    """Return ``str(x)`` with every ``|`` replaced by a single space.

    The argument is stringified first, so non-string values are accepted.
    Consecutive pipes each become their own space.
    """
    # A literal replace needs no regex escaping. The previous pattern '\|' was
    # a non-raw string holding an invalid escape sequence, which Python warns
    # about at compile time (and is slated to become an error).
    return str(x).replace("|", " ")

# Space-separated genre string for every row of the movies table, in row order.
genres = list(map(remove_pipes, data['genres']))

def remove_nulls(a, b, i):
    """Join ``a[i]`` and ``b[i]`` with a space and drop the no-genre placeholder.

    Args:
        a: Indexable collection of strings (the first half of the result).
        b: Indexable collection of strings (the second half of the result).
        i: Position to read from both ``a`` and ``b``.

    Returns:
        ``a[i] + " " + b[i]`` with every occurrence of the literal text
        ``(no genres listed)`` removed. The joining space is left in place,
        so a row whose second half is only the placeholder ends with a
        trailing space.
    """
    string_m = a[i] + " " + b[i]
    # Literal replace instead of a regex: the previous pattern was a non-raw
    # string with invalid escape sequences (backslash-parenthesis), which
    # Python warns about at compile time.
    return string_m.replace("(no genres listed)", "")

# One "<title> <genres>" string per movie, built position by position.
input_string = [remove_nulls(titles, genres, row) for row, _ in enumerate(genres)]

In [6]:
# Run the encoder on the first CUDA device when one is available, else on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Embed every input string, one string per forward pass. Each list entry is
# the normalised pooler output for a single string and keeps its leading
# batch dimension (a later cell indexes [0] on every entry).
# NOTE(review): batching several strings per call would be much faster, but
# would change the shape of the list entries that the later cell relies on.
embeddings_list = []
for idx, text in enumerate(input_string):
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=64, return_tensors='pt').to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    embeddings_list.append(embeddings)
    # Progress marker every 10,000 strings. The counter has a real name
    # because `_` is IPython's "last output" variable and assigning to it
    # disables that feature for the rest of the session.
    if idx % 10000 == 0:
        print(str(idx))

0
10000
20000
30000
40000
50000
60000


In [7]:
# Bring each stored embedding to the CPU, take its first row and convert it
# to a NumPy array (despite the name, the list holds arrays, not tensors).
embeddings_list_tensors = [vec.cpu()[0].numpy() for vec in embeddings_list]

# Stack the arrays into one table, one row per embedding, and save it to disk.
embeddings = pd.DataFrame(np.vstack(embeddings_list_tensors))
embeddings.to_csv("data.csv")

In [12]:
# Convert the embeddings table into a float32 TensorFlow tensor; this is the
# item matrix handed to the ScaNN layer for indexing.
item_tensor = tf.convert_to_tensor(embeddings, dtype=tf.float32)


In [21]:
%%time
# Number of neighbours returned per query: the rounded square root of the
# number of indexed rows.
neighbour_count = round(np.sqrt(len(item_tensor)))

# Build the ScaNN retrieval layer and index every row of item_tensor.
scann = tfrs.layers.factorized_top_k.ScaNN(
    num_leaves=1000,
    num_leaves_to_search=100,
    k=neighbour_count,
)
scann.index(item_tensor)

print("All done!")

All done!
CPU times: user 1min 53s, sys: 1.88 s, total: 1min 55s
Wall time: 31.4 s


In [23]:
# Embed a free-text query with the same tokenizer settings and model that
# were used for the movie strings.
test = "Horror films with zombies"
encoded_input = tokenizer(test, padding=True, truncation=True, max_length=64, return_tensors='pt').to(device)
# Inference only: no gradients are needed.
with torch.no_grad():
    model_output = model(**encoded_input)
query = model_output.pooler_output
# Normalise the query vector itself. The previous code passed `embeddings`
# here, which threw away the query embedding computed on the line above and
# depended on whatever `embeddings` happened to hold from an earlier cell.
query = torch.nn.functional.normalize(query)

In [25]:
# Copy the query embedding to host memory as a NumPy array, then look up its
# nearest neighbours in the ScaNN index.
query_vector = np.array(query.cpu())
test_case = scann(query_vector)

In [26]:
# The second element of the ScaNN result is used as positional row indices
# into the movies table; show the first nine matches for the single query.
neighbour_rows = test_case[1].numpy()[0]
data.iloc[neighbour_rows][0:9]

Unnamed: 0,movieId,title,genres
11068,47980,Bio Zombie (Sun faa sau si) (1998),Comedy|Horror
13822,71535,Zombieland (2009),Action|Comedy|Horror
46049,171651,Redneck Zombies (1989),Horror
23643,118810,Zombie Women of Satan (2009),Comedy|Horror
45150,169738,Zombie Wars (2006),Horror
55180,191327,Teenage Zombies (1960),Horror|Sci-Fi
41540,161912,Zombie Night (2003),Comedy|Horror|Sci-Fi
23642,118808,Zombie Reanimation (2009),Action|Comedy|Horror
14427,75404,ZMD: Zombies of Mass Destruction (2009),Comedy|Horror
