In [1]:
from nlinec import get_positive_data, get_all_types, get_granularity, construct_hypothesis, get_type, combine_premise_hypothesis, get_models_dir, get_negative_data, combine_positive_negative_data
from nlinec.predict import predict_type
import torch
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Level of the type hierarchy this notebook trains and evaluates on; it is passed to
# `get_type` and names the `type_{GRANULARITY}` column (e.g. /location/country -> country).
GRANULARITY = 2
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Training split: positive and negative examples are read from the same source file
train_file = "augmented_train.json"
positive_data = get_positive_data(train_file, explode=True)
negative_data = get_negative_data(train_file, random_state=42)

# Development split: only positive examples are loaded
dev_data = get_positive_data("g_dev.json", explode=True)

Loading augmented_train.json: 793487it [00:12, 64709.47it/s] 


Loading negative data from /home/psaegert/Projects/nli-nec/src/nlinec/../../data/derived/negative_data/augmented_train.json_42.csv...


Loading g_dev.json: 2202it [00:00, 216870.35it/s]


In [4]:
# Merge the positive and negative examples into a single training frame.
# NOTE(review): `frac=0.5` presumably controls the positive/negative mix — confirm in nlinec.
data = combine_positive_negative_data(positive_data, negative_data, frac=0.5, random_state=42)
# Only `data` is used from here on, so drop the names of the two source frames
del positive_data, negative_data

In [5]:
def _add_type_and_hypothesis(frame):
    """Attach the granularity-level type and the NLI hypothesis to `frame`.

    Steps, applied in this order:
      1. add a `type_{GRANULARITY}` column derived from `full_type` via `get_type`
         (the column is added to `frame` itself),
      2. keep only the rows where that type is not missing,
      3. drop rows that repeat the same (mention_span, sentence, type) triple,
      4. add a `hypothesis` column built by `construct_hypothesis`.

    Returns the filtered frame.
    """
    type_column = f'type_{GRANULARITY}'

    frame[type_column] = frame['full_type'].apply(lambda full_type: get_type(full_type, GRANULARITY))

    # Rows whose full type yields no type at this granularity are discarded
    kept = frame[frame[type_column].notna()]
    kept = kept.drop_duplicates(subset=['mention_span', 'sentence', type_column])

    kept["hypothesis"] = kept.apply(lambda row: construct_hypothesis(row["mention_span"], row[type_column]), axis=1)
    return kept


# The training and development frames go through exactly the same preparation
data = _add_type_and_hypothesis(data)
dev_data = _add_type_and_hypothesis(dev_data)

data

Unnamed: 0,full_type,mention_span,sentence,granularity,label,type_2,hypothesis
2,/location/country,We,We did not do anything at that time.,2,2,country,We is a country.
5,/organization/company,antibody,`` We don't know the effect of our antibody on...,2,1,company,antibody is a company.
6,/other/product,Lisbon,The Visigoths of Spain were defeated when the ...,2,1,product,Lisbon is a product.
8,/other/product,non food crops or inedible waste products,Cellulosic ethanol production uses non food cr...,2,2,product,non food crops or inedible waste products is a...
10,/location/geography/body_of_water,traditional games,In caffeehouses around you could see people sm...,3,1,geography,traditional games is a geography.
...,...,...,...,...,...,...,...
1864998,/person/artist,transfer,It marked the first peaceful transfer of power...,2,1,artist,transfer is a artist.
1864999,/other/internet,American,"Right now, the American populace is spending a...",2,1,internet,American is a internet.
1865001,/other/event/holiday,American,"Right now, the American populace is spending a...",3,1,event,American is a event.
1865002,/location/structure,American,"Right now, the American populace is spending a...",2,2,structure,American is a structure.


In [6]:
# gran_types[k] holds the types whose granularity is exactly k + 1
gran_types = []
for level in (1, 2, 3):
    all_types = get_all_types(granularity=level)
    all_types['granularity'] = all_types['full_type'].apply(get_granularity)
    # Keep only the types that sit exactly at this level
    at_this_level = all_types['granularity'] == level
    gran_types.append(all_types[at_this_level])

In [7]:
from nlinec import get_models_dir

In [8]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint
pretrained_checkpoint = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_checkpoint)
# Move the weights to the selected device before training
model = model.to(DEVICE)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Make the data usable by the model.
# Premise and hypothesis are merged into one input string by `combine_premise_hypothesis`
# (the input is of the form: sentence</s><s>hypothesis).

# Shuffle the training rows with a fixed seed so that Restart & Run All reproduces
# the same order (every other random step in this notebook also uses 42).
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Only these columns are handed to the datasets. preserve_index=False keeps the pandas
# index from being stored as an extra `__index_level_0__` column.
model_columns = ["sentence", "hypothesis", "label"]
train_dataset = Dataset.from_pandas(data.loc[:, model_columns], preserve_index=False)
dev_dataset = Dataset.from_pandas(dev_data.loc[:, model_columns], preserve_index=False)


def tokenize_function(examples: dict) -> dict:
    """Tokenize one batch of examples for the sequence classifier.

    Args:
        examples: A batch as passed by `Dataset.map(..., batched=True)`; the
            "sentence" and "hypothesis" entries are read as parallel lists.

    Returns:
        The tokenizer output for the combined premise/hypothesis strings, padded
        to `model.config.max_position_embeddings` tokens.
    """
    input_text = [combine_premise_hypothesis(sentence, hypothesis) for sentence, hypothesis in zip(examples["sentence"], examples["hypothesis"])]
    # NOTE(review): no truncation is requested, so an input longer than the padding
    # length is passed through at full length — confirm this cannot happen in the data.
    return tokenizer(input_text, max_length=model.config.max_position_embeddings, padding="max_length", return_tensors="pt")


tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)

                                                                      

In [10]:
# Directory (inside the project's models dir) for checkpoints and the final model;
# the granularity is part of the name so runs at different levels do not collide
output_dir = os.path.join(get_models_dir(), f'nlinec-{GRANULARITY}')

In [11]:
def compute_metrics(eval_pred):
    """Report type-prediction accuracy on the dev set for the `Trainer`.

    Args:
        eval_pred: Supplied by the `Trainer` when it calls this function. It is
            not used: accuracy is measured by running `predict_type` over
            `dev_data` with the current `model`.

    Returns:
        A dict with a single "accuracy" entry: the fraction of dev rows whose
        predicted type equals the gold `type_{GRANULARITY}` value.
    """
    # gran_types[k] holds the types of granularity k + 1, so the index follows
    # GRANULARITY and the candidates always match the gold column compared below.
    candidate_types = list(gran_types[GRANULARITY - 1]['type'])

    with torch.no_grad():
        predictions = predict_type(model, tokenizer, list(dev_data['sentence']), list(dev_data['mention_span']), candidate_types, return_str=True, verbose=True)

    # Compute the score once and reuse it for both the log line and the result
    accuracy = float((dev_data[f'type_{GRANULARITY}'] == predictions).mean())
    print(accuracy)
    return {"accuracy": accuracy}

In [12]:
# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Together with the per-device batch size of 8 this accumulates 4 batches per optimizer step
    gradient_accumulation_steps=4,
    save_steps=3000,
    evaluation_strategy="steps",
    # Without an explicit value `eval_steps` falls back to `logging_steps` (10), so the
    # full dev evaluation would run every 10 optimizer steps. Evaluate on the same
    # schedule as checkpointing instead.
    eval_steps=3000,
)

# The Trainer fine-tunes `model` in place; `compute_metrics` is called at every evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# Fine-tune `model` on the tokenized training set; the dev set is evaluated on the
# schedule configured in `training_args`
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis, sentence. If hypothesis, sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 813973
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 25436
  Number of trainable parameters = 355362819


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis, __index_level_0__, sentence. If hypothesis, __index_level_0__, sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 866
  Batch size = 8
***** Running Evaluation *****
  Num examples = 866
  Batch size = 8
Predicting types: 100%|█████████▉| 865/866 [00:48<00:00, 21.76it/s]

In [14]:
# Write the fine-tuned model to `output_dir` so it can be loaded again later
trainer.save_model(output_dir)

Saving model checkpoint to /home/psaegert/Projects/nli-nec/src/nlinec/../../models/nlinec-positive-2
Configuration saved in /home/psaegert/Projects/nli-nec/src/nlinec/../../models/nlinec-positive-2/config.json
Model weights saved in /home/psaegert/Projects/nli-nec/src/nlinec/../../models/nlinec-positive-2/pytorch_model.bin


In [15]:
# Predict a type for every dev mention with the fine-tuned model.
# gran_types[k] holds the types of granularity k + 1, so indexing with GRANULARITY - 1
# keeps the candidate types in line with the `type_{GRANULARITY}` gold column.
with torch.no_grad():
    dev_predictions = predict_type(model, tokenizer, list(dev_data['sentence']), list(dev_data['mention_span']), list(gran_types[GRANULARITY - 1]['type']), return_str=True, verbose=True)

Predicting types: 100%|██████████| 866/866 [00:50<00:00, 17.16it/s]


In [16]:
# Store the post-training predictions next to the gold types for inspection.
# NOTE(review): the saved output also shows a `prediction_before` column that no cell
# in this notebook creates — it presumably comes from an earlier kernel session; confirm.
dev_data['prediction_after'] = dev_predictions
dev_data

Unnamed: 0,mention_span,full_type,sentence,granularity,label,type_2,hypothesis,prediction_before,prediction_after
3,Japan,/location/country,Japan's wholesale prices in September rose 3.3...,2,2,country,Japan is a country.,country,country
5,the Bank of Japan,/location/structure,Japan's wholesale prices in September rose 3.3...,2,2,structure,the Bank of Japan is a structure.,title,company
7,the Bank of Japan,/organization/government,Japan's wholesale prices in September rose 3.3...,2,2,government,the Bank of Japan is a government.,title,company
15,the Bank,/location/structure,Japan's wholesale prices in September rose 3.3...,2,2,structure,the Bank is a structure.,event,company
18,Japan 's,/location/country,Japan 's wholesale prices in September rose 3...,2,2,country,Japan 's is a country.,product,country
...,...,...,...,...,...,...,...,...,...
3264,Europe,/location/geography,There were no major Eurobond or foreign bond o...,2,2,geography,Europe is a geography.,title,country
3267,$ 150 million,/other/currency,"$ 150 million of 9 % debentures due Oct. 15, ...",2,2,currency,$ 150 million is a currency.,product,currency
3275,the Treasury 's,/organization/government,"The non-callable issue, which can be put back ...",2,2,government,the Treasury 's is a government.,product,company
3278,the Treasury 's,/organization/government,"The issue, which is puttable back to the compa...",2,2,government,the Treasury 's is a government.,product,government


In [17]:
# A dev row counts as correct when the predicted type equals the gold type
is_correct = dev_data[f'type_{GRANULARITY}'] == dev_data['prediction_after']
print(f"Sample accuracy after training: {is_correct.mean()}")

Sample accuracy after training: 0.5958429561200924
