In [1]:

# Attach Google Drive under /content/gdrive/ so the project folder on Drive is
# reachable from this Colab runtime.
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
# Work from the project folder on the mounted drive: the dataset and cache paths
# configured below are relative and resolve against this directory.
%cd /content/gdrive/MyDrive/Anime_Recommender/Greta_CE

/content/gdrive/MyDrive/Anime_Recommender/Greta_CE


In [3]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import CrossEncoder
from pathlib import Path
import csv
import pickle
import time
import sys
import pandas as pd
import torch
import torch.nn.functional as F
# Run tensor work on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# Path to the anime CSV that is read when no embedding cache exists yet
# (relative to the current working directory).
dataset_path = "./data/animes.csv"


In [6]:
# Files used to cache the encoded corpus between runs:
# the pickled embeddings and the pickled DataFrame they were computed from.
embedding_cache_path = 'anime-embeddings.pkl'
df_cache_path = 'df-anime.pkl'
embedding_cache_path  # bare expression: displayed as the cell output

'anime-embeddings.pkl'

In [7]:
# Bi-encoder (SentenceTransformer) that embeds the input request; cosine similarity against the
# synopsis embeddings is then used to keep the top num_candidates most similar synopses.
bi_encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
num_candidates = 500  # number of bi-encoder hits handed on to the cross-encoder for re-ranking
max_corpus_size = 20000  # upper bound used when slicing the cached corpus embeddings

In [8]:
# Mean pooling helper (attention-mask aware) - currently NOT used by the pipeline below.
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings along dimension 1, counting only positions
    whose attention mask is non-zero.

    :param model_output: indexable model output; element 0 holds the token embeddings
    :param attention_mask: mask tensor with one entry per token (1 = keep, 0 = padding)
    :return: tensor of mask-weighted mean embeddings
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_count

In [9]:
# Load the synopsis DataFrame and its bi-encoder embeddings from the on-disk cache
# when it is complete; otherwise build both from the CSV and write the cache.
cached_file = Path(embedding_cache_path)
df_cached_file = Path(df_cache_path)

# Both pickles are read on the cached path, so both must exist for the cache to be usable.
if cached_file.exists() and df_cached_file.exists():
    print("Load pre-computed embeddings from disc")
    df = pd.read_pickle(df_cache_path)
    # NOTE: unpickling can execute arbitrary code. Only load cache files that this
    # notebook wrote itself, never files from an untrusted source.
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
    # The cache may have been written on another device; bring it onto the active one.
    corpus_embeddings = cache_data['embeddings'].to(device)

else:
    # Keep one row per distinct, non-missing synopsis and renumber the rows so that
    # positional row numbers line up with the order of the encoded sentences.
    df = (
        pd.read_csv(dataset_path)
        .dropna(subset=['synopsis'])
        .drop_duplicates(subset='synopsis', keep='first')
        .reset_index(drop=True)
    )

    # Unused alternative kept for reference: embed with the raw HuggingFace model
    # and pool with mean_pooling instead of using SentenceTransformer.encode.
    #
    #   tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    #   model_test = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    #   encoded_input = tokenizer(df['synopsis'].tolist(), padding=True, truncation=True, return_tensors='pt')
    #   with torch.no_grad():
    #       model_output = model_test(**encoded_input)
    #   # Perform pooling. In this case, mean pooling.
    #   corpus_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    print("Encoding synopses")
    all_sentences = df['synopsis'].tolist()
    corpus_embeddings = bi_encoder_model.encode(all_sentences, show_progress_bar=True, convert_to_tensor=True)

    print("Store file on disc")
    df.to_pickle(df_cache_path)
    with open(embedding_cache_path, "wb") as fOut:
        # Store a CPU copy: a tensor pickled from GPU memory cannot be unpickled
        # on a runtime that has no GPU.
        pickle.dump({'sentences': all_sentences, 'embeddings': corpus_embeddings.cpu()}, fOut)

# Apply the corpus cap the same way on both paths so that DataFrame rows,
# sentences and embeddings always stay aligned by position.
df = df.iloc[:max_corpus_size]
corpus_sentences = df['synopsis'].tolist()
corpus_embeddings = corpus_embeddings[:max_corpus_size]

# A cache written from a different dataset would silently map hits to the wrong titles.
if len(corpus_sentences) != len(corpus_embeddings):
    raise ValueError(
        "Cached data is inconsistent: {} synopses vs {} embeddings. "
        "Delete '{}' and '{}' and re-run this cell.".format(
            len(corpus_sentences), len(corpus_embeddings), embedding_cache_path, df_cache_path
        )
    )


Load pre-computed embeddings from disc


In [10]:
# Number of rows (one per synopsis) in the corpus DataFrame.
print(len(df))

15191


In [11]:
# Sanity check: the number of DataFrame rows and the number of embeddings should match.
print(f"{len(df)} sentences / {len(corpus_embeddings)} embeddings configured")

15191 sentences / 15191 embeddings configured


In [12]:
# The num_candidates synopses kept by the bi-encoder are used as input to the CrossEncoder.
# A CrossEncoder receives both texts together (input request, synopsis) and outputs a similarity score.
CE_model = 'cross-encoder/stsb-roberta-base'
cross_encoder_model = CrossEncoder(CE_model)

In [13]:
def cos_sim(a, b):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.

    :param a: tensor, or any value accepted by torch.tensor(); a 1-D input is
        treated as a single row
    :param b: tensor, or any value accepted by torch.tensor(); a 1-D input is
        treated as a single row
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j]), located on the device of `a`
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    # torch.mm needs both operands on one device. Follow `a` instead of a global
    # device so CPU inputs also work on a machine where a GPU is available.
    b = b.to(a.device)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))

In [None]:
def _print_top_hits(hits, score_key, limit=5):
    """
    Print the score and anime title of the first `limit` entries of `hits`.

    :param hits: list of hit dicts carrying 'corpus_id' and `score_key`
    :param score_key: dict key of the score to display
    :param limit: number of leading hits to print
    """
    for hit in hits[:limit]:
        print("\t{:.3f}\t{}".format(hit[score_key], df['title'].iloc[hit['corpus_id']]))


# Interactive retrieve-and-re-rank loop.
# Submitting an empty request (or interrupting / closing the prompt) ends the loop.
while True:
    try:
        inp_request = input("Please enter a request: ")
    except (EOFError, KeyboardInterrupt):
        break
    if not inp_request.strip():
        break
    print("Input request:", inp_request)

    # Stage 1: embed the request with the bi-encoder and keep the num_candidates
    # most similar synopses. semantic_search performs the similarity computation
    # followed by the top-k filtering.
    start_time = time.perf_counter()
    question_embedding = bi_encoder_model.encode(inp_request, convert_to_tensor=True)
    encoder_res = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_candidates)[0]

    print("Cosine-Similarity search took {:.3f} seconds".format(time.perf_counter() - start_time))
    print("Top 5 hits with cosine-similarity:")
    _print_top_hits(encoder_res, 'score')

    # Stage 2: score every (request, candidate synopsis) pair with the cross-encoder.
    start_time = time.perf_counter()
    sentence_pairs = [[inp_request, corpus_sentences[hit['corpus_id']]] for hit in encoder_res]
    ce_scores = cross_encoder_model.predict(sentence_pairs)

    for hit, ce_score in zip(encoder_res, ce_scores):
        hit['cross-encoder_score'] = ce_score

    # Re-rank the candidates by cross-encoder score, best first.
    encoder_res = sorted(encoder_res, key=lambda hit: hit['cross-encoder_score'], reverse=True)
    print("\nRe-ranking with Cross-Encoder took {:.3f} seconds".format(time.perf_counter() - start_time))
    print("Top 5 hits with CrossEncoder:")
    _print_top_hits(encoder_res, 'cross-encoder_score')
    print("\n\n========\n")

Please enter a request: Historical fiction about vikings and revenge
Input request: Historical fiction about vikings and revenge
Cosine-Similarity search took 0.504 seconds
Top 5 hits with cosine-similarity:
	0.633	Vinland Saga
	0.459	Himekishi Angelica
	0.446	Sword Art Online: Alicization - War of Underworld 2nd Season
	0.432	Shadow Skill: Kurudaryuu Kousatsuhou no Himitsu
	0.429	Doupo Cangqiong 2nd Season Specials

Re-ranking with Cross-Encoder took 7.346 seconds
Top 5 hits with CrossEncoder:
	0.565	Kami nomi zo Shiru Sekai
	0.534	Tales of Phantasia The Animation
	0.510	Chuan Shu Zijiu Zhinan
	0.508	Feng Yun Jue
	0.490	Arslan Senki



