In [1]:
from nlinec.data.load import get_positive_data
from nlinec.data.preprocessing import construct_hypothesis, get_type

In [2]:
# Load the positive examples from augmented_train.json into `data`.
# The recorded progress bar below shows 793487 items being iterated.
# NOTE(review): what counts as "positive" is decided inside nlinec.data.load — confirm there.
data = get_positive_data("augmented_train.json")

793487it [00:10, 77172.11it/s] 


In [3]:
# Build the positive NLI examples in a single chain. Every step returns a new
# frame, so no column is ever assigned onto a filtered slice (which triggers
# pandas' SettingWithCopyWarning) and re-running this cell gives the same result.
data = (
    data
    # Add the basic type (get_type is called with granularity argument 2)
    .assign(fixed_granularity_type=lambda df: df['type'].apply(lambda x: get_type(x, 2)))
    # Remove the rows with type None or "other"
    .loc[lambda df: df['fixed_granularity_type'].notna() & (df['fixed_granularity_type'] != 'other')]
    # Remove duplicates
    .drop_duplicates(subset=['mention_span', 'sentence', 'fixed_granularity_type'])
    .assign(
        # Construct the hypothesis
        hypothesis=lambda df: df.apply(
            lambda row: construct_hypothesis(row["mention_span"], row["fixed_granularity_type"]),
            axis=1,
        ),
        # Mark all rows as entailed
        # NOTE(review): assumes class index 2 is "entailment" for the model used later — confirm against its id2label.
        label=2,
    )
)

data

Unnamed: 0,type,mention_span,sentence,fixed_granularity_type,hypothesis,label
2,/location/country,We,We did not do anything at that time.,country,We is a country.,2
5,/other/scientific,antibody,`` We don't know the effect of our antibody on...,scientific,antibody is a scientific.,2
6,/location/city,Lisbon,The Visigoths of Spain were defeated when the ...,city,Lisbon is a city.,2
8,/other/product,non food crops or inedible waste products,Cellulosic ethanol production uses non food cr...,product,non food crops or inedible waste products is a...,2
10,/other/event/sports_event,traditional games,In caffeehouses around you could see people sm...,event,traditional games is a event.,2
...,...,...,...,...,...,...
1864991,/location/city,Baghdad,Journalists held a candle light vigil in Baghd...,city,Baghdad is a city.,2
1864993,/other/event/sports_event,The game,"The game features 25 songs by Van Halen, 3 gu...",event,The game is a event.,2
1864997,/other/health/treatment,transfer,It marked the first peaceful transfer of power...,health,transfer is a health.,2
1864999,/location/country,American,"Right now, the American populace is spending a...",country,American is a country.,2


In [4]:
# Candidate label set: every distinct fixed-granularity type present in `data`.
# NOTE(review): the recorded output lists both 'geography' and 'geograpy'; the
# latter looks like a typo in the source type labels and is treated as a separate
# class here — confirm and normalise upstream if so.
possible_types = data['fixed_granularity_type'].unique()
possible_types

array(['country', 'scientific', 'city', 'product', 'event', 'art',
       'artist', 'political_figure', 'military', 'company', 'structure',
       'title', 'language', 'sports_team', 'music', 'coach',
       'living_thing', 'body_part', 'park', 'currency', 'geography',
       'health', 'political_party', 'government', 'supernatural',
       'transit', 'education', 'food', 'athlete', 'award',
       'religious_leader', 'religion', 'legal', 'internet', 'celestial',
       'heritage', 'doctor', 'sports_and_leisure', 'sports_league',
       'stock_exchange', 'geograpy'], dtype=object)

In [5]:
# Split the data into train and validation
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable across runs.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are loaded from the same checkpoint so that the
# vocabulary and the model weights match; the id is named once to keep them in sync.
MODEL_NAME = "roberta-large-mnli"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
from nlinec.predict import predict_type

In [8]:
# Draw a small validation sample for a quick accuracy check.
# The seed makes the sample (and therefore the accuracy reported from it)
# reproducible on Restart & Run All; it matches the seed used for the train/val split.
val_sample = val_data.sample(100, random_state=42).reset_index(drop=True)
val_sample

Unnamed: 0,type,mention_span,sentence,fixed_granularity_type,hypothesis,label
0,/location/structure/government,US,"Of course, the US military is going to jump at...",structure,US is a structure.,2
1,/location/structure,uncovered,"Ms. Rose, who teaches literature at Wesleyan U...",structure,uncovered is a structure.,2
2,/other/product,investment bankers,The turmoil on Wall Street may benefit some re...,product,investment bankers is a product.,2
3,/location/city,Liverpool,In 1699 Liverpool was made a parish by Act of ...,city,Liverpool is a city.,2
4,/organization/company,Peugeot,One of them splashed gasoline on the Peugeot .,company,Peugeot is a company.,2
...,...,...,...,...,...,...
95,/person/title,So,"`` So I tell you, don't worry about the things...",title,So is a title.,2
96,/location/city,London,One edition was published in London with Charl...,city,London is a city.,2
97,/person/artist/author,US journalist Helen Thomas,Noted US journalist Helen Thomas told a Hebrew...,artist,US journalist Helen Thomas is a artist.,2
98,/location/city,every city in the United States,Neon signage was adopted with increasing frequ...,city,every city in the United States is a city.,2


In [9]:
# Predict one type per sampled mention, choosing among `possible_types`.
# The recorded run took about 4 minutes for 100 rows (2.43 s/it).
# NOTE(review): presumably return_str=True yields type names rather than indices
# (the 'prediction' column displayed further down holds strings) — confirm in nlinec.predict.
# NOTE(review): execution counts are out of order, so it is not certain whether
# `model` was the pretrained checkpoint or a fine-tuned one when this ran.
val_predictions = predict_type(model, tokenizer, val_sample['sentence'], val_sample['mention_span'], possible_types, return_str=True, verbose=True)

Predicting types: 100%|██████████| 100/100 [04:02<00:00,  2.43s/it]


In [10]:
# Store the predictions next to the gold labels so they can be compared row by row.
# This adds a column to `val_sample` in place, so the earlier display of
# `val_sample` (without 'prediction') no longer reflects the frame's current state.
val_sample['prediction'] = val_predictions
val_sample

Unnamed: 0,type,mention_span,sentence,fixed_granularity_type,hypothesis,label,prediction
0,/location/structure/government,US,"Of course, the US military is going to jump at...",structure,US is a structure.,2,military
1,/location/structure,uncovered,"Ms. Rose, who teaches literature at Wesleyan U...",structure,uncovered is a structure.,2,title
2,/other/product,investment bankers,The turmoil on Wall Street may benefit some re...,product,investment bankers is a product.,2,title
3,/location/city,Liverpool,In 1699 Liverpool was made a parish by Act of ...,city,Liverpool is a city.,2,legal
4,/organization/company,Peugeot,One of them splashed gasoline on the Peugeot .,company,Peugeot is a company.,2,product
...,...,...,...,...,...,...,...
95,/person/title,So,"`` So I tell you, don't worry about the things...",title,So is a title.,2,living_thing
96,/location/city,London,One edition was published in London with Charl...,city,London is a city.,2,title
97,/person/artist/author,US journalist Helen Thomas,Noted US journalist Helen Thomas told a Hebrew...,artist,US journalist Helen Thomas is a artist.,2,product
98,/location/city,every city in the United States,Neon signage was adopted with increasing frequ...,city,every city in the United States is a city.,2,city


In [11]:
# Fraction of sampled rows whose predicted type equals the gold fixed-granularity type.
sample_accuracy = (val_sample['fixed_granularity_type'] == val_sample['prediction']).mean()
print(f"Sample accuracy: {sample_accuracy}")

Sample accuracy: 0.22


In [6]:
# Make the data usable by the model
# The input is of the form: sentence</s><s>hypothesis

from datasets import Dataset

# Only the premise, the hypothesis and the label are handed to the model pipeline.
model_columns = ["sentence", "hypothesis", "label"]

# The index is reset before conversion so both datasets start from a clean 0..n-1 index.
train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True)[model_columns])
val_dataset = Dataset.from_pandas(val_data.reset_index(drop=True)[model_columns])

def tokenize_function(examples):
    """Tokenize examples as ``sentence</s><s>hypothesis``.

    Works for a single example (``examples["sentence"]`` is a ``str``) and for
    a batch as passed by ``Dataset.map(..., batched=True)`` (the columns are
    lists of strings). The previous implementation concatenated the columns
    directly, which raises ``TypeError`` for a batch (``list + str``).

    Args:
        examples: Mapping with the keys ``"sentence"`` and ``"hypothesis"``.

    Returns:
        The tokenizer output, padded and truncated to a fixed length. Plain
        Python lists are returned (no ``return_tensors``) so that ``map`` can
        store them directly and single examples do not get an extra leading
        batch dimension.
    """
    sentences = examples["sentence"]
    hypotheses = examples["hypothesis"]

    if isinstance(sentences, str):
        input_text = sentences + "</s><s>" + hypotheses
    else:
        input_text = [
            sentence + "</s><s>" + hypothesis
            for sentence, hypothesis in zip(sentences, hypotheses)
        ]

    # RoBERTa position ids start after the padding index, so two of the
    # `max_position_embeddings` slots cannot be used by real tokens
    # (514 -> 512 for roberta-large-mnli). Truncating to the full 514 could
    # index past the position-embedding table for over-long inputs.
    max_length = model.config.max_position_embeddings - 2

    return tokenizer(
        input_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

# Apply the tokenizer to both splits. With batched=True, `map` calls
# tokenize_function on batches of rows instead of one row at a time.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

                                                                      

In [8]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example and ``labels`` the gold class ids.

    Returns:
        ``{"accuracy": value}`` where ``value`` is a Python float: the share of
        examples whose highest-scoring class equals the label.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == gold).astype(np.float32)
    accuracy = hits.mean().item()
    return {"accuracy": accuracy}

In [9]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration. Checkpoints are written to ./results and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
    # 8 examples per device x 4 accumulation steps = 32 examples per optimizer
    # step on a single device (the recorded training log reports a total batch size of 32).
    gradient_accumulation_steps=4,
    save_steps=1000,
    # NOTE(review): no eval_steps is given; per the TrainingArguments docs it
    # presumably falls back to logging_steps (1000), which lines up with
    # save_steps as load_best_model_at_end requires — confirm for the installed version.
    load_best_model_at_end=True,
    evaluation_strategy="steps",
)

# The Trainer evaluates on the full tokenized validation set and reports the
# accuracy computed by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

In [15]:
# Train the model
# NOTE(review): the saved log below reports "Num Epochs = 3" and 24789
# optimization steps, whereas training_args sets num_train_epochs=1 — the
# recorded output comes from an earlier configuration; re-run to refresh it.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, hypothesis. If sentence, hypothesis are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 264426
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 24789
  Number of trainable parameters = 355362819


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, hypothesis. If sentence, hypothesis are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66107
  Batch size = 8
***** Running Evaluation *****
  Num examples = 66107
  Batch size = 8
