In [46]:
import os
# Expose only GPU 0 to CUDA. This is set before `torch` is imported further
# down so the restriction is already in place when CUDA gets initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# %%
import plyvel
import pickle
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import datasets
import torch
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from pywikidata import Entity
from pywikidata.utils import get_wd_search_results

import editdistance

In [43]:
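# One-off build step, kept commented out for reference: it walks the
# 'aliases_lvldb' LevelDB store, keeps the keys starting with 'P', and writes
# the label/alias -> property id mapping to 'properties_with_aliases_to_id.json'.
# db = plyvel.DB(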
#     'aliases_lvldb',
#     create_if_missing=False,
#     lru_cache_size=1024,
#     bloom_filter_bits=256,
# )


# property_label_to_id = {}
# for key, val in tqdm(db):
#     key = key.decode()
#     if 'P' != key[0]:
#         continue
    
#     property_label_to_id[Entity(key).label] = key
    
#     val = pickle.loads(val)
#     for label in val.get('en', []):
#         property_label_to_id[label] = key
        
        
# import json
# with open('properties_with_aliases_to_id.json', 'w') as f:
#     json.dump(property_label_to_id, f)

In [62]:
import json

# Label -> property id lookup table; get_property_by_label falls back to it
# (via edit distance) when no exact property match is found.
with open('properties_with_aliases_to_id.json', 'r') as mapping_file:
    property_label_to_id = json.load(mapping_file)

In [3]:
df = pd.read_csv('./train_label_generation.csv')
# Pin the seed: without `random_state` the 80/20 split is reshuffled on every
# kernel restart, so train/validation membership is not reproducible.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Bundle the three splits; the test split is read straight from its own CSV.
ds = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(train_df),
    'validation': datasets.Dataset.from_pandas(valid_df),
    'test': datasets.Dataset.from_csv('./rubq_paper_res_sel.csv'),
})

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 8490.49it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 311.10it/s]
Generating train split: 1186 examples [00:00, 97968.42 examples/s]


In [4]:
# Load the seq2seq model and its tokenizer from the same local checkpoint
# directory so the vocabulary matches the model weights.
model_path = "./data/masha_experiment_property_labeling/t5-large-ssm/checkpoint-3000/"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [7]:
# Move the model to the GPU; the inference loops below also send their
# input tensors to CUDA before calling `generate`.
model = model.cuda()

In [5]:
label_pad_token_id = -100


def _preprocess_function(
    examples,
    ignore_pad_token_for_loss=label_pad_token_id,
):
    """Tokenize a batch of questions and, when available, their target labels.

    Args:
        examples: Batch mapping that contains a 'question' column. A
            'relation_label' column is optional.
        ignore_pad_token_for_loss: Value written over every pad-token id in
            the tokenized labels. A falsy value leaves the label ids as the
            tokenizer produced them.

    Returns:
        The tokenizer output for the questions, with an extra 'labels' entry
        when the batch has a 'relation_label' column.
    """
    # Questions and targets are both padded/truncated to 512 tokens.
    tokenize_kwargs = dict(max_length=512, padding='max_length', truncation=True)

    model_inputs = tokenizer(examples['question'], **tokenize_kwargs)

    # Batches without targets (inference-only data) get no 'labels' entry.
    if 'relation_label' not in examples:
        return model_inputs

    label_ids = tokenizer(
        text_target=examples['relation_label'],
        **tokenize_kwargs,
    )["input_ids"]

    if ignore_pad_token_for_loss:
        # Swap padding positions for the ignore value so they can be
        # excluded from the loss.
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [
                ignore_pad_token_for_loss if token_id == pad_id else token_id
                for token_id in sequence
            ]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split of the DatasetDict in batches using 8 worker processes.
# NOTE(review): the progress description says "train dataset", but the same
# text is shown for each split that gets mapped here.
ds = ds.map(
    _preprocess_function,
    batched=True,
    num_proc=8,
    desc="Running tokenizer on train dataset",
)

Running tokenizer on train dataset (num_proc=8): 100%|██████████| 18178/18178 [00:02<00:00, 6556.20 examples/s] 
Running tokenizer on train dataset (num_proc=8): 100%|██████████| 4545/4545 [00:00<00:00, 4659.84 examples/s]
Running tokenizer on train dataset (num_proc=8): 100%|██████████| 1186/1186 [00:00<00:00, 3497.99 examples/s]


In [65]:
def get_property_by_label(prop_label):
    """Resolve a generated property label to a property id.

    First looks for entities whose label matches `prop_label` exactly and
    returns the id of the first one that is a property. If none is found,
    returns the id from `property_label_to_id` whose label has the smallest
    edit distance to `prop_label` (the first such label wins on ties).

    Args:
        prop_label: Property label text to look up.

    Returns:
        The matching property id, or None when there is no exact match and
        `property_label_to_id` is empty.
    """
    # Exact lookup. A ValueError from the lookup is treated as "no (further)
    # matches"; anything collected before it is still used.
    exact_ids = []
    try:
        for entity in Entity.from_label(prop_label):
            if entity.is_property:
                exact_ids.append(entity.idx)
    except ValueError:
        pass
    if exact_ids:
        return exact_ids[0]

    # Fallback: nearest known label/alias by edit distance.
    best_id = None
    best_dist = 1e18
    for known_label, known_id in property_label_to_id.items():
        distance = editdistance.eval(prop_label, known_label)
        if distance >= best_dist:
            continue
        best_id, best_dist = known_id, distance
        if distance == 0:
            # Cannot do better than an identical label.
            break
    return best_id


In [73]:
# Generate one label per test question and resolve it to a property id.
results = []
results_generated = []
for token_ids in tqdm(ds['test']['input_ids']):
    # One question per forward pass: wrap the ids in a batch of size 1.
    batch = torch.tensor([token_ids]).cuda()
    generated_ids = model.generate(batch, max_length=512)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    raw_label = decoded[0]
    results.append(get_property_by_label(raw_label))
    results_generated.append(raw_label)

# Attach the resolved ids and the raw generated text, then export them.
test_df = ds['test'].to_pandas().assign(
    predicted=results,
    predicted_raw=results_generated,
)
export_columns = ['uid', 'predicted_raw', 'predicted']
test_df[export_columns].to_csv('predicted_properties_RuBQ_t5llargessm.csv', index=False)

100%|██████████| 1186/1186 [01:56<00:00, 10.17it/s]


# Mintaka

In [74]:
# Load the English Mintaka test split and tokenize its questions. This data
# is only used for inference here, so only the question inputs are needed.
mintaka_test_ds = datasets.load_dataset('AmazonScience/mintaka', name='en', split='test')
mintaka_test_ds = mintaka_test_ds.map(
    _preprocess_function,
    batched=True,
    num_proc=8,
    # The description used to say "train dataset", a copy-paste leftover that
    # mislabeled the progress bar for this test split.
    desc="Running tokenizer on Mintaka test dataset",
)


# Generate one label per question (batch of size 1) and resolve each
# generated label to a property id.
results = []
results_generated = []
for inp in tqdm(mintaka_test_ds['input_ids']):
    outputs = model.generate(torch.tensor([inp]).cuda(), max_length=512)
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    results.append(get_property_by_label(generated_text[0]))
    results_generated.append(generated_text[0])

# Export the question id, the raw generated label and the resolved property id.
test_df = mintaka_test_ds.to_pandas()
test_df['predicted'] = results
test_df['predicted_raw'] = results_generated
test_df[['id', 'predicted_raw', 'predicted']].to_csv('predicted_properties_MintakaFull_t5llargessm.csv', index=False)

  0%|          | 8/4000 [00:02<20:24,  3.26it/s]

100%|██████████| 4000/4000 [13:34<00:00,  4.91it/s]


# SQWD

In [75]:
# Download the answerable test split of wikidata-simplequestions into the
# working directory; the next cell reads it by file name.
!wget https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_test_answerable.txt

--2023-11-20 15:15:30--  https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_test_answerable.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 345052 (337K) [text/plain]
Saving to: ‘annotated_wd_data_test_answerable.txt’


2023-11-20 15:15:31 (2.49 MB/s) - ‘annotated_wd_data_test_answerable.txt’ saved [345052/345052]



In [84]:
# The downloaded file is tab-separated and has no header row, so the column
# names are supplied explicitly.
sqwd_test_ds = datasets.Dataset.from_pandas(pd.read_csv(
    'annotated_wd_data_test_answerable.txt',
    sep='\t',
    names=['subject', 'property', 'object', 'question']
))
sqwd_test_ds = sqwd_test_ds.map(
    _preprocess_function,
    batched=True,
    num_proc=8,
    # The description used to say "train dataset", a copy-paste leftover that
    # mislabeled the progress bar for this test file.
    desc="Running tokenizer on SQWD test dataset",
)

# Generate one label per question (batch of size 1) and resolve each
# generated label to a property id.
results = []
results_generated = []
for inp in tqdm(sqwd_test_ds['input_ids']):
    outputs = model.generate(torch.tensor([inp]).cuda(), max_length=512)
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    results.append(get_property_by_label(generated_text[0]))
    results_generated.append(generated_text[0])

# Export the question text, the raw generated label and the resolved property id.
test_df = sqwd_test_ds.to_pandas()
test_df['predicted'] = results
test_df['predicted_raw'] = results_generated
test_df[['question', 'predicted_raw', 'predicted']].to_csv('predicted_properties_annotated_wd_data_test_answerable_t5llargessm.csv', index=False)

Running tokenizer on train dataset (num_proc=8): 100%|██████████| 5622/5622 [00:00<00:00, 9231.30 examples/s] 
100%|██████████| 5622/5622 [11:18<00:00,  8.29it/s]
