In [1]:
import os
import json
import pandas as pd
import numpy as np
import re
import random

from tqdm.auto import tqdm

from constants import *

In [2]:
I2B2_PATH = os.path.join(ROOT_PATH, "data", "i2b2_n2c2")

In [3]:
completed_sentences = []
labels = []

for filename in os.listdir(os.path.join(I2B2_PATH, "train", "partners", "rel")):
    
    filename = filename.split(".")[0]
    
    with open(os.path.join(I2B2_PATH, "train", "partners", "rel", filename + ".rel"), "r") as f:
        rel = f.read().split("\n")

    with open(os.path.join(I2B2_PATH, "train", "partners", "txt", filename + ".txt"), "r") as f:
        lines = f.read().split("\n")

    for single_rel in rel:
    
        if len(single_rel) == 0:
            continue
    
        first_word = re.search(r'c="(.+)"', single_rel.split("||")[0])[1]
        first_span = re.findall(r'[0-9]+:[0-9]+', single_rel.split("||")[0])

        first_line_num1 = int(first_span[0].split(":")[0]) - 1
        first_line_num2 = int(first_span[1].split(":")[0]) - 1

        second_word = re.search(r'c="(.+)"', single_rel.split("||")[-1])[1]
        second_span = re.findall(r'[0-9]+:[0-9]+', single_rel.split("||")[-1])

        second_line_num1 = int(second_span[0].split(":")[0]) - 1
        second_line_num2 = int(second_span[1].split(":")[0]) - 1

        r_type = re.search(r'r="(.+?)"', single_rel)[1]
    
        needed_lines = sorted(set([first_line_num1, first_line_num2, second_line_num1, second_line_num2]))
    
        text_completed = ""
        for line_id in needed_lines:
            text_completed += lines[line_id] + "\n"
        text_completed = text_completed.strip()

        text_completed = re.sub(first_word, f"<e1>{first_word}</e1>", text_completed, flags=re.I)
        text_completed = re.sub(second_word, f"<e2>{second_word}</e2>", text_completed, flags=re.I)
    
        text_completed = "[CLS] " + text_completed.strip() + " [SEP]"
    
        completed_sentences.append(text_completed)
        labels.append(r_type)

In [6]:
label2id = {}
cnt = 1
for i in list(set(labels)):
    label2id[i] = cnt
    cnt += 1

label_ids = []
for l in labels:
    label_ids.append(label2id[l])

In [8]:
sentences_full_list = []
labels_full_list = []

with open(os.path.join(ROOT_PATH, "data", "2021AB_SN", "SRSTRE1")) as f:
    umls_relations_df = pd.read_csv(f, delimiter='|', names=["FirstTUI", "RelationTUI", "EndTUI"], index_col=False)

with open(os.path.join(ROOT_PATH, "data", "2021AB_SN", "SRDEF"), "r") as f:
    def_df = pd.read_csv(f, delimiter="|", header=None)
relation_def_df = def_df.loc[def_df[0] == "RL"].reset_index(drop=True)

for filename in tqdm(os.listdir(RELATIONS_PATH)[:50]):

    with open(os.path.join(DATA_CLEAN_PATH, filename.split("_")[0] + ".txt"), "r") as f:
        text = f.read()

    with open(os.path.join(ENTITIES_PATH, filename.split("_")[0] + ".csv"), "r") as f:
        entities_df = pd.read_csv(f).drop("Unnamed: 0", axis=1)

    with open(os.path.join(RELATIONS_PATH, filename), "r") as f:
        relations_df = pd.read_csv(f).drop("Unnamed: 0", axis=1)

    sentences = []

    cursor = 0
    last_end = 0

    for word, start_char, end_char in list(entities_df[["Word", "StartChar", "EndChar"]].itertuples(index=False, name=None)):
        dot = re.search("\.", text[cursor:start_char])
        if dot:
            sentences.append({"Text": text[last_end:cursor + dot.span()[0]],
                              "StartChar": last_end,
                              "EndChar": cursor + dot.span()[0]})
            last_end = cursor + dot.span()[1]
        else:
            pass
        cursor = end_char
    sentences_df = pd.DataFrame(sentences)

    for first_id, second_id, first_word, second_word, first_tui, second_tui in list(relations_df[["First", "End", "FirstWord", "EndWord", "FirstTUI", "EndTUI"]].itertuples(index=False, name=None)):
        sent_id = entities_df.iloc[first_id]["Sentence"]
        if sent_id not in sentences_df.index:
            continue
        sent_text = sentences_df.iloc[sent_id]["Text"]
        sent_start = sentences_df.iloc[sent_id]["StartChar"]
        sent_end = sentences_df.iloc[sent_id]["EndChar"]

        first_start_char = entities_df.iloc[first_id]["StartChar"]
        first_end_char = entities_df.iloc[first_id]["EndChar"]
        second_start_char = entities_df.iloc[second_id]["StartChar"]
        second_end_char = entities_df.iloc[second_id]["EndChar"]

        sentence_full = "[CLS] " + sent_text[:first_start_char - sent_start].strip() + \
                        " <e1>" + str(first_word) + "</e1>" + \
                        sent_text[first_end_char - sent_start:second_start_char - sent_start] + \
                        "<e2>" + str(second_word) + "</e2> " + \
                        sent_text[second_end_char - sent_start:].strip() + " [SEP]"
        sentences_full_list.append(sentence_full)

        if first_tui and second_tui:
            possible_labels = umls_relations_df["RelationTUI"].loc[umls_relations_df["FirstTUI"]
                                                                   == first_tui].loc[umls_relations_df["EndTUI"] == second_tui]
            if len(possible_labels) > 0:
                label_TUI = random.choice(possible_labels.values)
                label = int(relation_def_df.loc[relation_def_df[1] == label_TUI].index[0] + 1)
            else:
                label = 0
        else:
            label = 0
        labels_full_list.append(label)

  0%|          | 0/50 [00:00<?, ?it/s]

In [20]:
NUM_SUPERVISED = 1000
NUM_UNSUPERVISED = 4000

list_sentences = []
list_labels = []

supervised_samples = random.sample(list(enumerate(completed_sentences)), NUM_SUPERVISED)
for i, s in supervised_samples:
    list_sentences.append(s)
    list_labels.append(label_ids[i])
    
unsupervised_samples = random.sample(list(enumerate(sentences_full_list)), NUM_UNSUPERVISED)
for i, s in unsupervised_samples:
    list_sentences.append(s)
    list_labels.append(0)

In [23]:
import json

json_object_sentences = json.dumps(list_sentences)
json_object_labels = json.dumps(list_labels)

with open("train_sentence.json", "w") as f:
    f.write(json_object_sentences)

with open("train_label_id.json", "w") as f:
    f.write(json_object_labels)

In [24]:
import torch
from torch import nn
from transformers import BertPreTrainedModel, BertModel, BertForSequenceClassification
from torch.nn import CrossEntropyLoss, MSELoss
import math


class BertForSequenceClassificationUserDefined(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(2 * config.hidden_size, config.hidden_size)
        self.classifier_2 = nn.Linear(config.hidden_size, self.config.num_labels)
        self.init_weights()
        self.output_emebedding = None

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                head_mask=None, inputs_embeds=None, labels=None, e1_pos=None, e2_pos=None, w=None):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )  # sequence_output, pooled_output, (hidden_states), (attentions)

        e_pos_outputs = []
        sequence_output = outputs[0]
        for i in range(0, len(e1_pos)):
            e1_pos_output_i = sequence_output[i, e1_pos[i].item(), :]
            e2_pos_output_i = sequence_output[i, e2_pos[i].item(), :]
            e_pos_output_i = torch.cat((e1_pos_output_i, e2_pos_output_i), dim=0)
            e_pos_outputs.append(e_pos_output_i)
        e_pos_output = torch.stack(e_pos_outputs)
        self.output_emebedding = e_pos_output  # e1&e2 cancat output

        e_pos_output = self.dropout(e_pos_output)
        hidden = self.classifier(e_pos_output)
        logits = self.classifier_2(hidden)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = 0
                for i in range(len(w)):
                    loss += math.exp(w[i] - 1) * loss_fct(logits[i].view(-1, self.num_labels), labels[i].view(-1))
                    #loss += w[i] * loss_fct(logits[i].view(-1, self.num_labels), labels[i].view(-1))
                loss = loss / len(w)
            outputs = (loss, ) + outputs + (self.output_emebedding,)

        return outputs  # (loss), logits, (hidden_states), (attentions), (self.output_emebedding)


# f_theta1
class RelationClassification(BertForSequenceClassificationUserDefined):
    def __init__(self, config):
        super().__init__(config)


# g_theta2
class LabelGeneration(BertForSequenceClassificationUserDefined):
    def __init__(self, config):
        super().__init__(config)

In [62]:
NUM_LABELS = 9  # TACRED:42, SemEval:19, AL: 49, i2b2: 9 (as of yet)
CUDA = 0

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(CUDA))
    os.environ['CUDA_VISIBLE_DEVICES'] = str(CUDA)
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

modelf1 = RelationClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=NUM_LABELS,  # The number of output labels--2 for binary classification.
    # You can increase this for multi-class tasks.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

modelf1 = nn.DataParallel(modelf1)
modelf1.to(device)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1660 Ti


Some weights of the model checkpoint at bert-base-uncased were not used when initializing RelationClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing RelationClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RelationClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RelationClassification were not initialized from the model checkpoint at bert-base-uncased and are n

DataParallel(
  (module): RelationClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
    

In [65]:
modelf1.load_state_dict(torch.load(os.path.join(ROOT_PATH, "data", "checkpoint.pt"))["model_state_dict"])

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 5.81 GiB total capacity; 4.51 GiB already allocated; 12.06 MiB free; 4.87 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF