In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import logging as log
log.basicConfig(level=log.DEBUG)

In [2]:
import sys
sys.path.append('../')
from baselines.utils import *
import os

In [3]:
import torch

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
cfg['model_name'] = 'distilbert-base-uncased'
cfg['batch_size'] = 32
cfg

{'DIR': '../dataset/',
 'train_path': 'train.json',
 'test_path': 'test.json',
 'dev_path': 'dev.json',
 'model_name': 'distilbert-base-uncased',
 'max_length': 512,
 'models_save_dir': '/scratch/shu7bh/contract_nli/models',
 'dataset_dir': '/scratch/shu7bh/contract_nli/dataset',
 'batch_size': 32}

In [5]:
# create dir if not exists
from pathlib import Path
Path(cfg["models_save_dir"]).mkdir(parents=True, exist_ok=True)
Path(cfg["dataset_dir"]).mkdir(parents=True, exist_ok=True)

In [6]:
# tokenizer = AutoTokenizer.from_pretrained(cfg['model_name'], use_fast=True)
# bert = AutoModelForMaskedLM.from_pretrained(cfg['model_name'])

# tokenizer.save_pretrained(cfg['models_save_dir'])
# bert.save_pretrained(cfg['models_save_dir'])

In [7]:
tokenizer = AutoTokenizer.from_pretrained(cfg['models_save_dir'], use_fast=True)
bert = AutoModelForMaskedLM.from_pretrained(cfg['models_save_dir'])

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp4dikyt7h
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp4dikyt7h/_remote_module_non_scriptable.py


In [10]:
from icecream import ic

In [11]:
from torch.utils.data import Dataset
import torch

class NLIDataset(Dataset):
    def __init__(self, data, tokenizer, hypothesis, max_len = 475):
        self.data = data
        self.tokenizer = tokenizer
        self.hypothesis = hypothesis
        self.max_len = max_len
        self.spans = []
        self.label_dict = get_labels()

        for i, doc in enumerate(self.data['documents']):
            for j, span in enumerate(doc['spans']):
                self.spans.append({
                    'doc_id': i,
                    'span_id': j,
                    'text': doc['text'][span[0]:span[1]]
                })

        self.data_points = []
        self.span_label = []
        self.nli_label = []

        for key, val in self.hypothesis.items():
            for span in self.spans:
                self.span_label.append(int(span['span_id'] in self.data['documents'][span['doc_id']]['annotation_sets'][0]['annotations'][key]['spans']))

                self.data_points.append({ 'hypotheis': val, 'premise': span['text'] })

                self.nli_label.append(self.label_dict[self.data['documents'][span['doc_id']]['annotation_sets'][0]['annotations'][key]['choice']])

                if self.nli_label[-1] != self.label_dict['NotMentioned'] and self.span_label[-1] == 0:
                    self.nli_label[-1] = self.label_dict['Ignore']


        self.tokenized_data = self.tokenizer(
            [data_point['hypotheis'] for data_point in self.data_points],
            [data_point['premise'] for data_point in self.data_points],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        self.sep_indices = torch.where(self.tokenized_data['input_ids'] == self.tokenizer.sep_token_id)[1]

        self.sep_indices = self.sep_indices[::2]

    def __len__(self):
        return len(self.data_points)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_data['input_ids'][idx],
            'attention_mask': self.tokenized_data['attention_mask'][idx],
            'token_type_ids': self.tokenized_data['token_type_ids'][idx],
            'span_label': torch.tensor(self.span_label[idx], dtype=torch.long),
            'nli_label': torch.tensor(self.nli_label[idx], dtype=torch.long),
            'sep_indices': self.sep_indices[idx]
        }
            # self.tokenized_data['input_ids'][idx], \
            # self.tokenized_data['attention_mask'][idx], \
            # self.tokenized_data['token_type_ids'][idx], \
            # torch.tensor(self.span_label[idx], dtype=torch.long), \
            # torch.tensor(self.nli_label[idx], dtype=torch.long), \
            # self.sep_indices[idx]

In [12]:
train_data = load_data(os.path.join(cfg['DIR'], cfg['train_path']))
dev_data = load_data(os.path.join(cfg['DIR'], cfg['dev_path']))
test_data = load_data(os.path.join(cfg['DIR'], cfg['test_path']))

hypothesis = get_hypothesis(train_data)

train_dataset = NLIDataset(train_data, tokenizer, hypothesis)
dev_dataset = NLIDataset(dev_data, tokenizer, hypothesis)
test_dataset = NLIDataset(test_data, tokenizer, hypothesis)

# save the datasets
torch.save(train_dataset, os.path.join(cfg['dataset_dir'], 'train_dataset.pt'))
torch.save(dev_dataset, os.path.join(cfg['dataset_dir'], 'dev_dataset.pt'))
torch.save(test_dataset, os.path.join(cfg['dataset_dir'], 'test_dataset.pt'))

In [14]:
# load the datasets
# train_dataset = torch.load(os.path.join(cfg['dataset_dir'], 'train_dataset.pt'))
# dev_dataset = torch.load(os.path.join(cfg['dataset_dir'], 'dev_dataset.pt'))
# test_dataset = torch.load(os.path.join(cfg['dataset_dir'], 'test_dataset.pt'))

In [15]:
train_dataset[0]

(tensor([  101,  4909,  2283,  4618,  2025,  7901,  3992,  2151,  5200,  2029,
          7861, 23684,  5860, 10483,  2075,  2283,  1005,  1055, 18777,  2592,
          1012,   102,  2512,  1011, 19380,  1998, 18777,  3012,  3820,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [16]:
from tqdm import tqdm
import numpy as np

In [17]:
from torch import nn
# Create contract nli class for huggingface trainer which uses bert model and adds two heads on top of it
class ContractNLI(nn.Module):
    def __init__(self, bert, num_labels, ignore_index):
        super().__init__()
        self.bert = bert
        self.embedding_dim = self.bert.config.hidden_size
        self.num_labels = num_labels
        self.labmda = 1
        self.nli_criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
        self.span_criterion = nn.BCELoss()

        self.span_classifier = nn.Sequential(
            nn.Linear(self.embedding_dim, self.embedding_dim // 2),
            nn.ReLU(),
            nn.Linear(self.embedding_dim // 2, 1),
            nn.Sigmoid()
        )
        self.nli_classifier = nn.Sequential(
            nn.Linear(self.embedding_dim, self.embedding_dim // 2),
            nn.ReLU(),
            nn.Linear(self.embedding_dim // 2, self.num_labels)
        )

    def forward(self, input_ids, attention_mask, token_type_ids, sep_indices):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)[-1]

        span_logits = self.span_classifier(torch.gather(outputs, 1, sep_indices.unsqueeze(1).expand(-1, outputs.shape[-1]).unsqueeze(1)).squeeze(1))

        nli_logits = self.nli_classifier(outputs[:, 0, :])

        return span_logits, nli_logits

In [18]:
from transformers import Trainer

class ContractNLITrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        nli_label = inputs.pop('nli_label')
        span_label = inputs.pop('span_label')

        outputs = model(**inputs)
        span_logits, nli_logits = outputs[0], outputs[1]

        span_loss = self.model.span_criterion(span_logits.view(-1), span_label.float())
        nli_loss = self.model.nli_criterion(nli_logits, nli_label)

        loss = span_loss + self.model.labmda * nli_loss

        return (loss, outputs) if return_outputs else loss

In [19]:
import wandb

wandb.init(project="contract-nli", entity="contract-nli-db")

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home2/shu7bh/Courses/ANLP/Project/Contract-NLI/source_code, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home2/shu7bh/Courses/ANLP/Project/Contract-NLI/source_code, universal_newlines=False, shell=None, istream=None)
DEBUG:wandb.docker.auth:Trying paths: ['/home2/shu7bh/.docker/config.json', '/home2/shu7bh/.dockercfg']
DEBUG:wandb.docker.auth:No config file found
DEBUG:sentry_sdk.errors:[Tracing] Create new propagation context: {'trace_id': 'a44804a7fd864a69819cbb5c6193d7f5', 'span_id': 'a11d290d619541c7', 'parent_span_id': None, 'dynamic_sampling_context': None}
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.wandb.ai:443
DEBUG:urllib3.connectionpool:https://api.wandb.ai:443 "POST /graphql HTTP/1.1" 200 1704
DEB

In [20]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=cfg['batch_size'],  # batch size per device during training
    per_device_eval_batch_size=cfg['batch_size'],   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=100,
    save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    fp16=True,
    learning_rate=1e-3,
    dataloader_num_workers=4,
    run_name='1',
    report_to='wandb'    
)

In [21]:
trainer = ContractNLITrainer(
    model=ContractNLI(bert, len(get_labels()), ignore_index=get_labels()['Ignore']).to(DEVICE),
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset             # evaluation dataset
)

In [22]:
trainer.train()

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home2/shu7bh/miniforge3/envs/nli/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
           ^^^^^^^^^^^^^^^^^^^^
  File "/home2/shu7bh/miniforge3/envs/nli/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home2/shu7bh/miniforge3/envs/nli/lib/python3.11/site-packages/transformers/trainer_utils.py", line 707, in __call__
    return self.data_collator(features)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home2/shu7bh/miniforge3/envs/nli/lib/python3.11/site-packages/transformers/data/data_collator.py", line 70, in default_data_collator
    return torch_default_data_collator(features)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home2/shu7bh/miniforge3/envs/nli/lib/python3.11/site-packages/transformers/data/data_collator.py", line 109, in torch_default_data_collator
    features = [vars(f) for f in features]
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home2/shu7bh/miniforge3/envs/nli/lib/python3.11/site-packages/transformers/data/data_collator.py", line 109, in <listcomp>
    features = [vars(f) for f in features]
                ^^^^^^^
TypeError: vars() argument must have __dict__ attribute
