In [2]:
import sys
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel

In [4]:
import sys

# Make the parent directory importable so that `src` (imported in a later
# cell) resolves. The guard keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry to sys.path each time.
# NOTE: ".." is resolved against the kernel's working directory.
if ".." not in sys.path:
    sys.path.append("..")

In [88]:
from src.model import TitleDescriptionHead

In [13]:
# Tokenizer for BERT (paper: https://arxiv.org/pdf/1810.04805.pdf).
# A plain BertModel backbone is not instantiated in this cell; the model used
# for scoring is loaded from a checkpoint file in a later cell.
# NOTE(review): presumably this matches the tokenizer the checkpoint was
# trained with - confirm.
BERT_CHECKPOINT_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Load the saved model object from the checkpoint file.
# map_location="cpu" makes the load independent of the GPU the checkpoint was
# saved from; without it torch.load tries to restore tensors onto their
# original device and fails on hosts lacking that device. The next cell moves
# the model to the selected device.
# SECURITY: torch.load unpickles arbitrary objects - only load checkpoints
# from a trusted source.
# NOTE(review): unpickling a whole model object presumably requires its class
# to be importable at load time - confirm.
model = torch.load("../scripts/checkpoint/097_0.7962.pth", map_location="cpu")

In [36]:
# Pick the compute device. GPU index 2 is preferred (as before), but only if
# the host actually exposes that many GPUs; otherwise fall back to the first
# GPU, and to the CPU when CUDA is unavailable. Previously "cuda:2" was chosen
# whenever any GPU was present, which crashes on hosts with fewer than three.
PREFERRED_GPU_INDEX = 2

if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = f"cuda:{gpu_index}"
else:
    device = "cpu"

print(f"Using {device} device")

# Switch to evaluation mode and move the model to the selected device.
model = model.eval().to(device)

Using cuda:2 device


In [55]:
# Read the seed/recommendation table; the first CSV column becomes the index.
# The scoring loop below reads the columns "seed title", "seed description",
# "recommended title" and "recommended description".
wines_csv_path = "../data/wines_seeds_complete.csv"
df = pd.read_csv(wines_csv_path, index_col=0)

In [None]:
# Head used below to turn two feature vectors into a similarity score
# (title-title and description-description comparisons).
# NOTE(review): constructed with default arguments and never moved to `device`
# or put in eval mode - presumably it holds no parameters; confirm.
td_head = TitleDescriptionHead()

In [None]:
def _encode_pair(title, description):
    """Tokenize one (title, description) text pair and run it through `model`.

    Uses the notebook-level `tokenizer`, `model` and `device`. Returns the
    model's output unchanged; the scoring loop reads its "f_t", "f_d" and
    "cos_sim" entries.
    """
    tokens = tokenizer(
        [title], [description], return_special_tokens_mask=True, return_token_type_ids=True,
        padding=True, truncation=True, return_attention_mask=True, return_tensors="pt"
    )
    # Call the module itself rather than .forward() so registered hooks run.
    return model(
        input_ids=tokens["input_ids"].to(device),
        attn_mask=tokens["attention_mask"].to(device),
        special_tokens=tokens["special_tokens_mask"].to(device),
        token_types=tokens["token_type_ids"].to(device),
    )


# Score every distinct seed against every row of `df`.
#   ratings[s][c] - summed similarity of seed s and candidate row c
#   targets[s][c] - True when candidate row c has the same seed title as seed s
prev_seed = ""
ratings, targets = [], []

with torch.no_grad():
    # A candidate's own title/description features do not depend on the seed,
    # so they are computed once per row instead of once per (seed, row) pair.
    reco_feats = []
    for _, reco in df.iterrows():
        reco_out = _encode_pair(reco["recommended title"], reco["recommended description"])
        reco_feats.append((reco_out["f_t"], reco_out["f_d"]))

    for i, seed in df.iterrows():
        # Skip rows repeating the previous row's seed title.
        # NOTE(review): this de-duplicates only when rows sharing a seed title
        # are adjacent in `df` - confirm the CSV is grouped by seed.
        if seed["seed title"] == prev_seed:
            continue

        prev_seed = seed["seed title"]
        seed_t, seed_d = seed["seed title"], seed["seed description"]

        # Seed features are invariant across the inner loop: compute them once.
        seed_out = _encode_pair(seed_t, seed_d)
        seed_ft, seed_fd = seed_out["f_t"], seed_out["f_d"]

        rating, target = [], []

        for (_, reco), (reco_ft, reco_fd) in zip(df.iterrows(), reco_feats):
            reco_t, reco_d = reco["recommended title"], reco["recommended description"]

            # Cross-field similarities: seed title vs. candidate description,
            # and candidate title vs. seed description.
            td_sim = _encode_pair(seed_t, reco_d)["cos_sim"]
            dt_sim = _encode_pair(reco_t, seed_d)["cos_sim"]

            # Same-field similarities computed by the head from the features.
            tt_sim = td_head.forward(seed_ft, reco_ft)
            dd_sim = td_head.forward(seed_fd, reco_fd)

            total = sum([td_sim, dt_sim, tt_sim, dd_sim])
            label = seed["seed title"] == reco["seed title"]

            # Store a plain Python float so the result lists do not keep
            # device tensors alive; torch.tensor(ratings) builds the same
            # matrix from them.
            rating.append(float(total))
            target.append(label)

        ratings.append(rating)
        targets.append(target)

        # `i` is the DataFrame index label of the seed row, not a seed counter.
        if i > 0 and i % 10 == 0:
            print(f"Row {i} processed...")

In [97]:
# Turn the nested Python lists from the scoring loop into tensors:
# `r` holds the scores and `y` the labels, one row per processed seed.
r, y = torch.tensor(ratings), torch.tensor(targets)

In [98]:
# Order each row's scores from highest to lowest (sorting runs over the last
# dimension), then apply the same per-row ordering to the labels.
r_sort, idxs = r.sort(descending=True)
y_sort = y.gather(1, idxs)

In [99]:
# Cut-off rank: keep the labels of the k best-scored candidates in each row.
k = 10
top_k = y_sort[:, 0:k]

In [100]:
# Number of positive labels found inside the top-k window, summed over all
# rows, divided by the number of candidate columns in `y`.
# NOTE(review): this reads as a recall@k only if every candidate is a positive
# for exactly one row of `y` - confirm.
top_k.sum().item() / y.size(1)

0.5189873417721519

In [103]:
# Persist the score matrix and the label matrix to disk.
for _tensor, _path in ((r, "../data/wine_scores.pt"), (y, "../data/wine_labels.pt")):
    torch.save(_tensor, _path)

In [94]:
# Write the DataFrame to a pickle file.
# NOTE(review): the file name mentions "embeds", but no visible cell adds
# embedding columns to `df`, and this cell's execution count (94) predates the
# cells above - presumably it depended on cells since removed; confirm.
df.to_pickle("../data/wines_seeds_embeds.pickle")