In [1]:
import json
import logging
import os
from argparse import Namespace

import click
import torch
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from transformers import WEIGHTS_NAME

from luke.luke_utils.entity_vocab import MASK_TOKEN

from luke.utils import set_seed
from luke.utils.trainer import Trainer, trainer_args
from luke.model import LukeForRelationClassification
from luke.re_utils import HEAD_TOKEN, TAIL_TOKEN, convert_examples_to_features, DatasetProcessor
from transformers.tokenization_roberta import RobertaTokenizer

import numpy as np

In [2]:
from types import SimpleNamespace
# Directory that holds the model's metadata.json, which is read further down in this cell.
metadata_folder = "luke/luke_model/"

class obj(object):
    """Expose the entries of a dictionary as instance attributes.

    Every key of ``d`` becomes an attribute. A dict value is wrapped in another
    ``obj``; a list or tuple value is stored as a list whose dict elements are
    wrapped the same way (other elements, including nested lists, are kept
    untouched). Any other value is stored unchanged.
    """

    def __init__(self, d):
        def wrap(value):
            # Only dictionaries are converted; everything else passes through.
            return obj(value) if isinstance(value, dict) else value

        for key, value in d.items():
            if isinstance(value, (list, tuple)):
                converted = list(map(wrap, value))
            else:
                converted = wrap(value)
            setattr(self, key, converted)

# Pull the "model_config" section out of metadata.json and wrap it so its
# entries can be read as attributes.
metadata_path = os.path.join(metadata_folder, "metadata.json")
with open(metadata_path) as metadata_file:
    model_config = obj(json.load(metadata_file)["model_config"])

print(model_config.vocab_size)

50265


In [3]:
class params:
    """Settings bundle that the notebook passes around as ``args``.

    Args:
        model_config: configuration object stored unchanged as
            ``self.model_config``.

    Constructing an instance loads the RoBERTa tokenizer named by
    ``bert_model_name`` via ``RobertaTokenizer.from_pretrained``.
    """

    def __init__(self, model_config):
        # Dataset location and run switches.
        # NOTE(review): do_train / do_val hold flag-like strings; a non-empty
        # string is truthy, so confirm consumers compare against the string
        # rather than testing truthiness.
        self.data_dir = "luke/data/tacred/json"
        self.do_train = "--no-train"
        self.do_val = "--no-eval"

        # Batch sizes and schedule.
        self.train_batch_size = 4
        self.eval_batch_size = 128
        self.num_train_epochs = 5.0
        self.seed = 42

        # The training cell reads args.gradient_accumulation_steps and
        # args.device; without these two attributes it raises AttributeError.
        self.gradient_accumulation_steps = 1
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # -1 makes load_and_cache_examples use a RandomSampler instead of a
        # DistributedSampler.
        self.local_rank = -1

        # Model and tokenizer.
        self.bert_model_name = "roberta-large"
        self.max_mention_length = 30
        self.tokenizer = RobertaTokenizer.from_pretrained(self.bert_model_name)
        self.model_config = model_config
        # NOTE(review): the values here are floats, not tensors — presumably
        # placeholders; confirm before passing this to load_state_dict.
        self.model_weights = {"embeddings.word_embeddings.weight":0.25, "entity_embeddings.entity_embeddings.weight":0.25}
        
#         self.tokenizer = {"max_len": 512, "bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>", "init_inputs": []}


In [4]:
# Build the settings object used by the rest of the notebook, and the
# module-level logger used by load_and_cache_examples.
args = params(model_config)
logger = logging.getLogger(__name__)

# Keep the inspection as the cell's final expression: a bare expression in the
# middle of a cell is evaluated and discarded, so it was never displayed.
args.tokenizer.pad_token_id


        


In [5]:
def load_and_cache_examples(args, fold="train"):
    """Build a DataLoader over the features of one dataset split.

    Features are cached in ``args.data_dir`` under
    ``cached_<model family>_<max_mention_length>_<fold>.pkl``, where the model
    family is the part of ``args.bert_model_name`` before the first ``-``. An
    existing cache file is loaded; otherwise the features are computed with
    ``convert_examples_to_features`` and saved when ``args.local_rank`` is
    ``-1`` or ``0``.

    Args:
        args: settings object. Reads ``data_dir``, ``bert_model_name``,
            ``max_mention_length``, ``tokenizer``, ``local_rank``,
            ``eval_batch_size`` and ``train_batch_size``.
        fold: ``"train"`` or ``"dev"`` select the matching examples; any other
            value selects the test examples. Only ``"dev"`` and ``"test"`` get
            the sequential evaluation loader; every other value gets the
            sampler-based training loader.

    Returns:
        Tuple ``(dataloader, examples, features, label_list)``.
    """
    processor = DatasetProcessor()
    if fold == "train":
        examples = processor.get_train_examples(args.data_dir)
    elif fold == "dev":
        examples = processor.get_dev_examples(args.data_dir)
    else:
        examples = processor.get_test_examples(args.data_dir)

    label_list = processor.get_label_list(args.data_dir)

    cache_file = os.path.join(
        args.data_dir,
        "cached_" + "_".join((args.bert_model_name.split("-")[0], str(args.max_mention_length), fold)) + ".pkl",
    )
    if os.path.exists(cache_file):
        logger.info("Loading features from cached file %s", cache_file)
        # NOTE(review): torch.load unpickles the file, which can execute
        # arbitrary code — only load cache files this notebook wrote itself.
        features = torch.load(cache_file)
    else:
        logger.info("Creating features from dataset file")
        features = convert_examples_to_features(examples, label_list, args.tokenizer, args.max_mention_length)

        # Only the single-process run (-1) or rank 0 writes the cache.
        if args.local_rank in (-1, 0):
            torch.save(features, cache_file)

    def collate_fn(batch):
        """Pad each per-example field to a common length and stack the labels."""

        def create_padded_sequence(attr_name, padding_value):
            tensors = [torch.tensor(getattr(o, attr_name), dtype=torch.long) for o in batch]
            return torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True, padding_value=padding_value)

        return dict(
            # Word ids are padded with the tokenizer's pad id, entity position
            # ids with -1, and every other field with 0.
            word_ids=create_padded_sequence("word_ids", args.tokenizer.pad_token_id),
            word_attention_mask=create_padded_sequence("word_attention_mask", 0),
            word_segment_ids=create_padded_sequence("word_segment_ids", 0),
            entity_ids=create_padded_sequence("entity_ids", 0),
            entity_attention_mask=create_padded_sequence("entity_attention_mask", 0),
            entity_position_ids=create_padded_sequence("entity_position_ids", -1),
            entity_segment_ids=create_padded_sequence("entity_segment_ids", 0),
            label=torch.tensor([o.label for o in batch], dtype=torch.long),
        )

    if fold in ("dev", "test"):
        # Evaluation: fixed order, evaluation batch size.
        dataloader = DataLoader(features, batch_size=args.eval_batch_size, shuffle=False, collate_fn=collate_fn)
    else:
        # Training: random order in a single process, distributed sampling otherwise.
        if args.local_rank == -1:
            sampler = RandomSampler(features)
        else:
            sampler = DistributedSampler(features)
        dataloader = DataLoader(features, sampler=sampler, batch_size=args.train_batch_size, collate_fn=collate_fn)

    return dataloader, examples, features, label_list

In [6]:
# args.model_config.vocab_size += 2
# word_emb = args.model_weights["embeddings.word_embeddings.weight"]
# head_emb = word_emb[args.tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0)
# tail_emb = word_emb[args.tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0)
# args.model_weights["embeddings.word_embeddings.weight"] = torch.cat([word_emb, head_emb, tail_emb])
# args.tokenizer.add_special_tokens(dict(additional_special_tokens=[HEAD_TOKEN, TAIL_TOKEN]))

# entity_emb = args.model_weights["entity_embeddings.entity_embeddings.weight"]
# mask_emb = entity_emb[args.entity_vocab[MASK_TOKEN]].unsqueeze(0).expand(2, -1)
# args.model_config.entity_vocab_size = 3
# args.model_weights["entity_embeddings.entity_embeddings.weight"] = torch.cat([entity_emb[:1], mask_emb])


In [7]:
# The default fold is "train", so this returns the training DataLoader along
# with the raw examples, their features and the list of labels.
dataloader, examples, features, label_list = load_and_cache_examples(args)

In [8]:
# Check the container types returned by load_and_cache_examples.
type(dataloader), type(examples), type(features), type(label_list)

(torch.utils.data.dataloader.DataLoader, list, list, list)

In [9]:
# Element counts, read as the shape of a NumPy array built from each list.
np.array(examples).shape, np.array(features).shape, np.array(label_list).shape, 

((68124,), (68124,), (42,))

In [10]:
# Show the first 20 entries of the label list.
print(label_list[0:20])

['no_relation', 'org:alternate_names', 'org:city_of_headquarters', 'org:country_of_headquarters', 'org:dissolved', 'org:founded', 'org:founded_by', 'org:member_of', 'org:members', 'org:number_of_employees/members', 'org:parents', 'org:political/religious_affiliation', 'org:shareholders', 'org:stateorprovince_of_headquarters', 'org:subsidiaries', 'org:top_members/employees', 'org:website', 'per:age', 'per:alternate_names', 'per:cause_of_death']


In [11]:
# Vocabulary id that the tokenizer assigns to the "@" token.
args.tokenizer.convert_tokens_to_ids(["@"])[0]

1039

In [12]:
# Number of labels; passed to the model constructor in the next cell.
num_labels = len(label_list)
# NOTE(review): this is a plain list of floats, yet the next cell hands it to
# model.load_state_dict, which expects a mapping of parameter names to
# tensors — confirm which checkpoint is meant to be loaded here.
model_weights= [0.34,0.33,0.33]

In [13]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'obj' object has no attribute 'chunk_size_feed_forward'" —
# presumably raised while building the model from args.model_config; confirm
# the config carries every attribute the installed transformers version reads.
model = LukeForRelationClassification(args, num_labels)
# NOTE(review): load_state_dict expects a mapping of parameter names to
# tensors; the `model_weights` assigned in the preceding cell is a plain list
# of floats — confirm the intended checkpoint.
model.load_state_dict(model_weights, strict=False)
model.to(args.device)

# `train_dataloader` is not assigned anywhere in the cells shown; the training
# loader is the `dataloader` built earlier by load_and_cache_examples(args).
train_dataloader = dataloader

num_train_steps_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
num_train_steps = int(num_train_steps_per_epoch * args.num_train_epochs)

# One-element lists that step_callback updates in place.
best_dev_f1 = [-1]
best_weights = [None]
# Per-epoch dev metrics plus the index of the best epoch; step_callback writes
# to this dict, so it has to exist before training starts.
results = {}


def step_callback(model, global_step):
    """Evaluate on the dev fold at the end of each epoch and track the best weights.

    Does nothing unless ``global_step`` is a multiple of the steps per epoch
    and ``args.local_rank`` is ``0`` or ``-1``.

    Args:
        model: the model being trained; when it has a ``module`` attribute the
            state dict of ``model.module`` is the one that gets saved.
        global_step: number of optimisation steps completed so far.
    """
    if global_step % num_train_steps_per_epoch == 0 and args.local_rank in (0, -1):
        # Zero-based epoch index; the division is exact because of the check above.
        epoch = global_step // num_train_steps_per_epoch - 1
        # NOTE(review): `evaluate` is neither defined nor imported in the cells
        # shown — it must exist before the first epoch finishes.
        dev_results = evaluate(args, model, fold="dev")
        epoch_metrics = {f"dev_{k}_epoch{epoch}": v for k, v in dev_results.items()}

        # Metric logging is skipped when no experiment object is attached to args.
        experiment = getattr(args, "experiment", None)
        if experiment is not None:
            experiment.log_metrics(epoch_metrics, epoch=epoch)
        results.update(epoch_metrics)
        tqdm.write("dev: " + str(dev_results))

        if dev_results["f1"] > best_dev_f1[0]:
            # Snapshot the weights on the CPU, looking through a wrapper that
            # exposes the real model as `.module`.
            source = model.module if hasattr(model, "module") else model
            best_weights[0] = {k: v.to("cpu").clone() for k, v in source.state_dict().items()}
            best_dev_f1[0] = dev_results["f1"]
            results["best_epoch"] = epoch

        # Switch back to training mode after the evaluation.
        model.train()


trainer = Trainer(
    args, model=model, dataloader=train_dataloader, num_train_steps=num_train_steps, step_callback=step_callback
)
trainer.train()


AttributeError: 'obj' object has no attribute 'chunk_size_feed_forward'

In [28]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
# NOTE(review): in the cells shown, `writer` is only referenced from
# commented-out code — confirm it is still needed.
writer = SummaryWriter('runs/pretrain_model')

In [32]:
# modelA = TheModelAClass(*args, **kwargs)
# Load the checkpoint file onto the CPU and display it. The recorded output is
# an OrderedDict of parameter names to tensors (a state dict), not a module.
# NOTE(review): this rebinds `model`, which the training cell bound to a
# LukeForRelationClassification instance. torch.load also unpickles the file,
# so only load checkpoints from a trusted source.
model = torch.load("luke/luke_model/luke.bin",map_location=torch.device('cpu'))
model
# model = torch.load("C:/prabhu/edu/code/w266/Luke/model/luke_20200528.tar")
# writer.add_graph(model)
# writer.close()

OrderedDict([('encoder.layer.0.attention.self.query.weight',
              tensor([[-0.0029,  0.0352,  0.0007,  ...,  0.0023,  0.0595, -0.0426],
                      [-0.0248,  0.0529, -0.0145,  ..., -0.0303, -0.0143,  0.0116],
                      [ 0.0061,  0.0708, -0.0336,  ...,  0.0807,  0.0115, -0.0131],
                      ...,
                      [-0.0589,  0.0206, -0.0426,  ..., -0.0298,  0.0041,  0.0700],
                      [ 0.0421,  0.0225, -0.0608,  ..., -0.0552, -0.0157,  0.0173],
                      [-0.0184, -0.0457, -0.0103,  ...,  0.0474,  0.0225, -0.0182]])),
             ('encoder.layer.0.attention.self.query.bias',
              tensor([ 0.3121,  0.0556, -0.0751,  ..., -0.0704, -0.0500, -0.0664])),
             ('encoder.layer.0.attention.self.key.weight',
              tensor([[-0.0043, -0.0184, -0.0136,  ..., -0.0037,  0.0096, -0.0156],
                      [-0.0238, -0.0002,  0.0253,  ...,  0.0403,  0.0436, -0.0195],
                      [-0.0264, -0