In [1]:
!pip install transformers==4.24 cache_decorator pytorch_lightning==1.6.3 torchmetrics==0.7.0



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys
sys.path.append('/content/drive/MyDrive/history')

In [4]:
import os
import json
import torch
import re
import spacy
from tqdm.notebook import tqdm
from paragraph_models_utils import (
    load_data, preprocess_data
)
from paragraph_models import (
    MultiTaskLearningModel, WikiDataModule,
)
from event_models_utils import (
    get_rams_data_dict, load_rams_data,
    get_event_names_dict, load_ontology,
    template2tokens
)
from event_models import (
    EventGenModelWrapper, ArgumentModelWrapper,
    RAMSEventGenDataModule, RAMSArgumentDataModule
)
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    BartModel, BartTokenizer,
    BertModel, BertTokenizer, BertTokenizerFast
)

seed_everything(42, workers=True)

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


42

In [5]:
evt2sent, sent2evt, _, _ = get_event_names_dict()
docs, dicts = load_rams_data()
evt2idx = dicts["evt2idx"]
ontology_dict = load_ontology()

In [6]:
paragraph_tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
evt_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
evt_tokenizer.add_tokens([" <arg>", " <trg>", " <evt>"])
bart1 = BartModel.from_pretrained("facebook/bart-base")
bart2 = BartModel.from_pretrained("facebook/bart-base")
bart_tokenizer1 = BartTokenizer.from_pretrained("facebook/bart-base")
bart_tokenizer2 = BartTokenizer.from_pretrained("facebook/bart-base")

In [7]:
with open(
    "/content/drive/MyDrive/history/datasets/wiki/wiki_dataset.json",
    encoding="utf-8"
) as f_in:
    data = json.load(f_in)

In [8]:
paragraph_model = MultiTaskLearningModel().load_from_checkpoint(
    "/content/drive/MyDrive/history/checkpoints/mtl/epoch=9-step=6920.ckpt"
)
evt_model = EventGenModelWrapper(
    bart=bart1,
    bart_tokenizer=bart_tokenizer1
).load_from_checkpoint(
    "/content/drive/MyDrive/history/checkpoints/event_gen/epoch=1-step=14658.ckpt",
    bart=bart1,
    bart_tokenizer=bart_tokenizer1
)

arg_model = ArgumentModelWrapper(
    bart=bart2,
    bart_tokenizer=bart_tokenizer2
).load_from_checkpoint(
    "/content/drive/MyDrive/history/checkpoints/argument/epoch=1-step=14658.ckpt",
    bart=bart2,
    bart_tokenizer=bart_tokenizer2
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  stream(template_mgs % msg_args)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.w

In [9]:
paragraph_model.to("cuda")
evt_model.to("cuda")
arg_model.to("cuda")

ArgumentModelWrapper(
  (model): ArgumentModel(
    (transformer): BartModel(
      (shared): Embedding(50267, 768)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50267, 768)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
        (layers): ModuleList(
          (0-5): 6 x BartEncoderLayer(
            (self_attn): BartAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (final_layer

In [10]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
texts, tags, labels = load_data(data, nlp)

In [11]:
text = texts[0]
print(text)

['Adolf', 'Hitler', '(', 'German', ':', '[', 'ˈaːdɔlf', 'ˈhɪtlɐ', ']', ';', '20', 'April', '1889', '\xa0', '–', '30', 'April', '1945', ')', 'was', 'an', 'Austrian', '-', 'born', 'German', 'politician', 'who', 'was', 'the', 'dictator', 'of', 'Germany', 'from', '1933', 'until', 'his', 'suicide', 'in', '1945', '.', 'He', 'rose', 'to', 'power', 'as', 'the', 'leader', 'of', 'the', 'Nazi', 'Party', ',', 'becoming', 'the', 'chancellor', 'in', '1933', 'and', 'then', 'taking', 'the', 'title', 'of', 'Führer', 'und', 'Reichskanzler', 'in', '1934', '.', 'During', 'his', 'dictatorship', ',', 'he', 'initiated', 'World', 'War', 'II', 'in', 'Europe', 'by', 'invading', 'Poland', 'on', '1', '\xa0', 'September', '1939', '.', 'He', 'was', 'closely', 'involved', 'in', 'military', 'operations', 'throughout', 'the', 'war', 'and', 'was', 'central', 'to', 'the', 'perpetration', 'of', 'the', 'Holocaust', ',', 'the', 'genocide', 'of', 'about', 'six', 'million', 'Jews', 'and', 'millions', 'of', 'other', 'victims'

In [12]:
text_encoding = paragraph_tokenizer(
    text,
    is_split_into_words=True,
    padding=True,
    truncation=True
)
text_encoding

{'input_ids': [101, 12670, 7579, 113, 1528, 131, 164, 384, 1161, 28306, 1181, 28276, 9654, 384, 1324, 28283, 26414, 28274, 166, 132, 1406, 1364, 5825, 782, 1476, 1364, 2481, 114, 1108, 1126, 5488, 118, 1255, 1528, 2931, 1150, 1108, 1103, 26400, 1104, 1860, 1121, 3698, 1235, 1117, 5680, 1107, 2481, 119, 1124, 3152, 1106, 1540, 1112, 1103, 2301, 1104, 1103, 5755, 1786, 117, 2479, 1103, 15046, 1107, 3698, 1105, 1173, 1781, 1103, 1641, 1104, 143, 24322, 5576, 14994, 9289, 19411, 2879, 1107, 3729, 119, 1507, 1117, 21737, 117, 1119, 7087, 1291, 1414, 1563, 1107, 1980, 1118, 19185, 2870, 1113, 122, 1347, 3061, 119, 1124, 1108, 4099, 2017, 1107, 1764, 2500, 2032, 1103, 1594, 1105, 1108, 2129, 1106, 1103, 1679, 12924, 6108, 1104, 1103, 12066, 117, 1103, 19643, 1104, 1164, 1565, 1550, 4384, 1105, 9215, 1104, 1168, 5256, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [13]:
input_ids = torch.tensor([text_encoding['input_ids']]).to("cuda")
attention_mask = torch.tensor([text_encoding['attention_mask']]).to("cuda")

In [14]:
result = paragraph_model.predict_paragraph_class(input_ids, attention_mask)
result

tensor([0.9361], device='cuda:0', grad_fn=<SelectBackward0>)

In [15]:
encodings, tokens_labels, labels, tag2idx, idx2tag = preprocess_data(
    texts.tolist(), tags.tolist(),
    labels.tolist(), paragraph_tokenizer, padding="max_length"
)

Adjusting tags to encodings: 0it [00:00, ?it/s]

In [16]:
print(encodings[0].ids)

[101, 12670, 7579, 113, 1528, 131, 164, 384, 1161, 28306, 1181, 28276, 9654, 384, 1324, 28283, 26414, 28274, 166, 132, 1406, 1364, 5825, 782, 1476, 1364, 2481, 114, 1108, 1126, 5488, 118, 1255, 1528, 2931, 1150, 1108, 1103, 26400, 1104, 1860, 1121, 3698, 1235, 1117, 5680, 1107, 2481, 119, 1124, 3152, 1106, 1540, 1112, 1103, 2301, 1104, 1103, 5755, 1786, 117, 2479, 1103, 15046, 1107, 3698, 1105, 1173, 1781, 1103, 1641, 1104, 143, 24322, 5576, 14994, 9289, 19411, 2879, 1107, 3729, 119, 1507, 1117, 21737, 117, 1119, 7087, 1291, 1414, 1563, 1107, 1980, 1118, 19185, 2870, 1113, 122, 1347, 3061, 119, 1124, 1108, 4099, 2017, 1107, 1764, 2500, 2032, 1103, 1594, 1105, 1108, 2129, 1106, 1103, 1679, 12924, 6108, 1104, 1103, 12066, 117, 1103, 19643, 1104, 1164, 1565, 1550, 4384, 1105, 9215, 1104, 1168, 5256, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
input_ids = torch.tensor([text_encoding['input_ids']]).to("cuda")
attention_mask = torch.tensor([text_encoding['attention_mask']]).to("cuda")

In [18]:
result = paragraph_model.predict_paragraph_class(input_ids, attention_mask)
result

tensor([0.9262], device='cuda:0', grad_fn=<SelectBackward0>)

In [20]:
evt_template_in = "This document is about <evt>"

res = {
    "results": []
}

for idx, paragraph in tqdm(
    enumerate(data["paragraphs"]),
    total=len(data["paragraphs"]),
    desc="Processing paragraph",
    leave=False
):
    paragraph_text = texts[idx]
    paragraph_encoding = paragraph_tokenizer(
        paragraph_text,
        is_split_into_words=True,
        padding=True,
        truncation=True
    )
    paragraph_input_ids = torch.tensor([paragraph_encoding['input_ids']]).to("cuda")
    paragraph_attention_mask = torch.tensor([paragraph_encoding['attention_mask']]).to("cuda")
    result = paragraph_model.predict_paragraph_class(paragraph_input_ids, paragraph_attention_mask)

    if result >= 0.5:
        text = paragraph["clean_content"]
        #print("text: ", text)

        context = evt_tokenizer.tokenize(
            text,
            add_prefix_space=True
        )

        if context == []:
            continue

        evt_in = evt_tokenizer.encode_plus(
            evt_template_in,
            context,
            add_special_tokens=True,
            add_prefix_space=True,
            max_length=424,
            truncation="only_second",
            padding="max_length"
        )

        evt_input_ids = torch.tensor([evt_in["input_ids"]]).to("cuda")

        evt_res = evt_model.model.generate(
            input_ids=evt_input_ids,
            do_sample=True,
            top_k=20,
            top_p=0.95,
            max_length=30,
            num_return_sequences=1,
            num_beams=1
        )

        predicted_evt_sent = evt_tokenizer.decode(
            evt_res[0],
            skip_special_tokens=True
        )

        re_combine_whitespace = re.compile(r"\s+")
        predicted_evt_sent = re_combine_whitespace.sub(" ", predicted_evt_sent).strip()
        predicted_evt_sent = re.sub("This document is about ", "", predicted_evt_sent)

        if predicted_evt_sent in sent2evt.keys():
            predicted_evt = sent2evt[predicted_evt_sent]
            template = ontology_dict[predicted_evt]["template"]
        else:
            print("No event found: ", predicted_evt_sent)
            continue

        template_in = template2tokens(template, evt_tokenizer)

        arg_in = evt_tokenizer.encode_plus(
            template_in,
            context,
            add_special_tokens=True,
            add_prefix_space=True,
            max_length=424,
            truncation="only_second",
            padding="max_length"
        )

        arg_input_ids = torch.tensor([arg_in["input_ids"]]).to("cuda")

        arg_res = arg_model.model.generate(
            input_ids=arg_input_ids,
            do_sample=True,
            top_k=20,
            top_p=0.95,
            max_length=30,
            num_return_sequences=1,
            num_beams=1
        )

        arg_sent = evt_tokenizer.decode(
            arg_res[0],
            skip_special_tokens=True
        )

        d = {
            "text": text,
            "predicted_evt": predicted_evt_sent,
            "predicted_args": arg_sent
        }

        res["results"].append(d)

        if idx % 100 == 0:
            with open(
                "/content/drive/MyDrive/history/results/wiki_dataset/predictions.json",
                "w",
                encoding="utf-8"
            ) as f:
                json.dump(res, f, indent=4)


with open(
    "/content/drive/MyDrive/history/results//wiki_dataset/predictions.json",
    "w",
    encoding="utf-8"
) as f:
    json.dump(res, f, indent=4)

Processing paragraph:   0%|          | 0/12876 [00:00<?, ?it/s]

No event found:  This person being hired
No event found:  This is about invasion in a conflict
No event found:  This is about meeting for a funeral or vigil
No event found:  This is about correspondence about a creation
No event found:  This is about a transfer of ownership
No event found:  This is about a purchase with a transfer of money
No event found:  This is about meeting for a negotiation
No event found:  This is about the building of an artifact
No event found:  This is about an injury
No event found:  This is about correspondence about a request or advise
No event found:  This is about yielding in a conflict
No event found:  This is about a person fallen during transportation
No event found:  This is about meeting for commitment or promise or intent expression
No event found:  This broadcast of a gift granted or aid provided in person
No event found:  This broadcast of a public statement in person
No event found:  This is about the broadcast of a judicial process
No event foun