## Experiment with encoder

In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder weights published as 'intfloat/e5-small'.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small')
model = AutoModel.from_pretrained('intfloat/e5-small')

In [3]:
# Encode and immediately decode a prefixed query to see which special tokens
# the tokenizer adds around the text.
input_text = 'query: how much protein should a female eat'
tokenizer.decode(tokenizer(input_text)['input_ids'])

'[CLS] query : how much protein should a female eat [SEP]'

## Load data

In [4]:
def concate_all(example):
    """Add an ``input_text`` list to ``example`` and return the same mapping.

    ``input_text`` holds the query prefixed with ``'query: '``, followed by
    the positive relation and then every negative relation, each prefixed
    with ``'relation: '``. ``example`` is updated in place.
    """
    texts = ['query: ' + example['query']]
    # Positive relation first, negatives after it, all with the same prefix.
    for relation in [example['positive']] + example['negatives']:
        texts.append('relation: ' + relation)
    example['input_text'] = texts
    return example

In [5]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small')
def tokenize(example):
    """Tokenize ``example['input_text']`` with the module-level ``tokenizer``.

    Every text is padded or truncated to exactly 32 tokens and the result is
    requested as PyTorch tensors.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 32,
    }
    return tokenizer(example['input_text'], **encode_options)

In [6]:
from datasets import load_dataset

# Read the JSON-lines training file and take the 'train' split of the result.
train = load_dataset('json', data_files='data/retrieval/train_.jsonl')['train']

# Build `input_text` for every example and drop the original columns.
train = train.map(concate_all, remove_columns=train.column_names)

# Tokenize `input_text`, then drop it so only the tokenizer outputs remain.
train = train.map(tokenize, remove_columns=train.column_names)

Found cached dataset json (/home/sakamoto/.cache/huggingface/datasets/json/default-1f8abf35be282deb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/sakamoto/.cache/huggingface/datasets/json/default-1f8abf35be282deb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-fd7c7dca12058bd8.arrow
Loading cached processed dataset at /home/sakamoto/.cache/huggingface/datasets/json/default-1f8abf35be282deb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-ded0bab52891bbeb.arrow


In [7]:
import torch

In [8]:
from dataclasses import dataclass
from collections import defaultdict
from transformers import PreTrainedTokenizerBase

@dataclass
class Collator:
    """Collate tokenized examples into stacked batch tensors.

    Each feature is a mapping from field name to that example's values
    (nested lists or tensors). For every field, the per-example values are
    converted to tensors and stacked along a new leading batch dimension, so
    all examples must have the same shape for a given field.
    """

    # Not referenced by __call__; kept so existing `Collator(tokenizer=...)`
    # call sites keep working.
    tokenizer: PreTrainedTokenizerBase

    def __call__(self, features):
        """Stack ``features`` field by field.

        Args:
            features: iterable of mappings ``{field_name: values}``.

        Returns:
            A plain ``dict`` mapping each field name to a tensor whose first
            dimension indexes the examples. ``'attention_mask'`` is returned
            as a boolean tensor. An empty ``features`` yields an empty dict.
        """
        per_field = defaultdict(list)
        for item in features:
            for key, value in item.items():
                # as_tensor converts lists/arrays and passes existing tensors
                # through, avoiding the copy warning torch.tensor() emits
                # when handed a tensor.
                value = torch.as_tensor(value)
                if key == 'attention_mask':
                    value = value.bool()
                per_field[key].append(value)
        # Return a plain dict rather than the defaultdict accumulator, so that
        # looking up a missing field raises KeyError instead of silently
        # inserting an empty list into the batch.
        return {key: torch.stack(values, dim=0) for key, values in per_field.items()}

In [9]:
from torch.utils.data import DataLoader
# Batches of two examples, shuffled, assembled by the custom Collator.
loader = DataLoader(train, batch_size=2, shuffle=True, collate_fn=Collator(tokenizer=tokenizer))

In [10]:
# Draw a single batch to inspect what the collator produces.
batch = next(iter(loader))
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
# Recorded run: (2, 17, 32) — batch size 2 and max_length 32; the middle axis
# is presumably the query plus its relations per example — TODO confirm.
batch['input_ids'].shape

torch.Size([2, 17, 32])

## Test Model

In [12]:
import lightning.pytorch as pl
from scorer.encoder import LitSentenceEncoder


In [13]:
# NOTE: rebinds `model` (previously the bare AutoModel) to the Lightning wrapper.
model = LitSentenceEncoder('intfloat/e5-small')

In [14]:
model.model.state_dict().keys()

odict_keys(['embeddings.position_ids', 'embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.laye

In [None]:
# fast_dev_run=True runs the loop on a single batch, with logging and
# checkpointing suppressed (see the recorded trainer output).
trainer = pl.Trainer(accelerator='cpu', fast_dev_run=True)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [None]:
# Fit the Lightning model on the loader built from the tokenized training set.
trainer.fit(model, loader)

  rank_zero_warn(

  | Name    | Type       | Params
---------------------------------------
0 | model   | BertModel  | 33.4 M
1 | loss_fn | NTXentLoss | 0     
---------------------------------------
33.4 M    Trainable params
0         Non-trainable params
33.4 M    Total params
133.440   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


## Compare similarity

In [2]:
from transformers import AutoModel, AutoTokenizer

# Reload the plain Hugging Face encoder and tokenizer (rebinds `model` and `tokenizer`).
model = AutoModel.from_pretrained('intfloat/e5-small')
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small')
# Two near-identical sentences whose embeddings are compared in the cells that follow.
query = 'The Eiffel Tower is in Paris'
positive = 'The Eiffel Tower is in France'
text_pairs = [query, positive]
# padding=True pads the sentences to a common length so they form one tensor.
tokenized = tokenizer(text_pairs,padding=True, return_tensors='pt')
print(tokenized['input_ids'].shape)

torch.Size([2, 10])


In [3]:
# Run the encoder; last_hidden_state has shape [2, 10, 384] in the recorded
# run, i.e. one 384-dimensional vector per token position.
embeddings = model(**tokenized)
embeddings.last_hidden_state.shape

torch.Size([2, 10, 384])

In [4]:
def average_pool(last_hidden_states, attention_mask):
    """Mean-pool token vectors into one embedding per sequence.

    Positions where ``attention_mask`` is zero are zeroed out before summing
    along dim 1, and the sum is divided by the number of attended positions.

    Ref: huggingface.co/intfloat/e5-large
    """
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    attended = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(~attended, 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

In [5]:
# Collapse the token vectors into one embedding per sentence, ignoring padded positions.
pooled = average_pool(embeddings.last_hidden_state, tokenized['attention_mask'])

In [11]:
import torch.nn.functional as F

# Cosine similarity between the two pooled sentence embeddings.
similarity = F.cosine_similarity(pooled[0], pooled[1], dim=-1)
similarity

tensor(0.9843, grad_fn=<SumBackward1>)

In [9]:
similarity.item()

0.9842543601989746

## Load the lightning model and convert it to huggingface model

In [1]:
from scorer.encoder import LitSentenceEncoder

In [2]:
# NOTE(review): `lit_model` is rebound by the checkpoint-loading cell that
# follows — confirm this fresh instance is still needed.
lit_model = LitSentenceEncoder('intfloat/e5-base')

In [3]:
# Load the model from a Lightning checkpoint.
lit_model = LitSentenceEncoder.load_from_checkpoint('~/Downloads/srtk/scorer-ce/lightning_logs/version_0/checkpoints/epoch=61-step=88784.ckpt',
                                                    model_name_or_path='intfloat/e5-base')

In [4]:
lit_model.save_huggingface_model('artifacts/scorer-ce-apr11/')

In [5]:
from transformers import AutoModel, AutoTokenizer
# NOTE(review): the preceding cell saved to 'artifacts/scorer-ce-apr11/', but
# this loads from 'artifacts/scorer/' — confirm which directory is intended.
model = AutoModel.from_pretrained('artifacts/scorer/')

## Adding duplicate nodes to a pyvis graph

When we add a duplicate node to a pyvis graph, does the latest configuration (e.g. color) overwrite the previous one?
Do properties that are not specified again (e.g. label) keep their previous values?

## Convert the Freebase training data to our format

### WebQSP

In [1]:
freebase_train = '/home/sakamoto/Documents/Projects/subgraph-ruc/tmp/retriever/e2e_train_data.csv'

In [7]:
import csv
import srsly

# Convert the retriever CSV to JSONL: column 0 is the query, column 1 the
# positive, and every remaining column a negative.
freebase_samples = []
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed correctly.
with open(freebase_train, newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing with IndexError on row[0].
            continue
        sample = {
            'query': row[0],
            'positive': row[1],
            'negatives': row[2:]
        }
        freebase_samples.append(sample)
srsly.write_jsonl('data/preprocess/train_freebase.jsonl', freebase_samples)
        

In [8]:
ground_path = '/home/sakamoto/Documents/Projects/subgraph-ruc/tmp/preprocessing/step0.json'

In [9]:
!head -n 1 $ground_path | jq

{
  "question": "what is the name of justin bieber brother",
  "topic_entities": [
    "m.06w2sn5"
  ],
  "answers": [
    "m.0gxnnwq"
  ]
}


In [11]:
# Re-key each grounded WebQSP record and give it a sequential 'webqsp-<n>' id.
grounds = srsly.read_jsonl(ground_path)
converted_grounds = [
    {
        'id': f'webqsp-{index}',
        'question': record['question'],
        'question_entities': record['topic_entities'],
        'answer_entities': record['answers'],
    }
    for index, record in enumerate(grounds)
]
srsly.write_jsonl('data/preprocess/ground_webqsp.jsonl', converted_grounds)

## Evaluate the RUC's result

In [1]:
test_retrieval_path = '/home/sakamoto/Documents/Projects/subgraph-ruc/tmp/reader_data/webqsp/test_simple.json'

In [None]:
!head -n 1 $test_retrieval_path | jq

In [37]:
# Count the test questions whose retrieved subgraph contains at least one
# gold answer, then report the hit rate.
# NOTE(review): assumes subgraph entities and answer `kb_id`s share the same
# identifier space — confirm against the data format.
test_retrieval = srsly.read_jsonl(test_retrieval_path)
hit = 0
not_hit = 0
for sample in test_retrieval:
    # A set makes each membership check constant-time.
    answers = {ans['kb_id'] for ans in sample['answers']}
    entities = sample['subgraph']['entities']
    # Generator form lets any() stop at the first matching entity.
    if any(entity in answers for entity in entities):
        hit += 1
    else:
        not_hit += 1
total = hit + not_hit
if total:
    print(f'{hit} / {total} = {hit / total}')
else:
    # Empty input: report it instead of raising ZeroDivisionError.
    print('0 / 0 = n/a (no samples)')

344 / 1639 = 0.20988407565588774


## Convert to RUC's format so that I can retrieve subgraphs using RUC's code

**NO NEED**

Every sample is a json object like this:
```json
{
  "id": "WebQTrn-9",
  "question": "how old is sacha baron cohen",
  "entities": [
    99399
  ],
  "answers": [
    {
      "kb_id": "1971-10-13",
      "text": null
    }
  ]
}
```

In [10]:
# Check the original format
original_path = '/home/sakamoto/Documents/Projects/subgraph-ruc/tmp/data/origin_nsm_data/webqsp/train_simple.json'
target_path = '/home/sakamoto/Documents/Projects/subgraph-ruc/tmp/data/ground/webqsp100/train_simple.json'

In [11]:
!head -n 100 $original_path > $target_path