In [1]:
import os
import pandas as pd
import torch
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from nlinec.data.load import get_positive_data, get_all_types
from nlinec.predict import predict_probabilities
from nlinec.utils import get_results_dir

## Setup

In [3]:
# Dataset to run inference on, and the CSV file in which its predictions accumulate.
# NOTE: the file name repeats the dataset name, and its `_ho` suffix presumably marks the
# hypothesis-only setting below - keep the name in sync when changing either of them,
# because an existing file at this path is loaded and resumed from.
DATASET = 'augmented_train.json'
SAVE_PREDICTIONS_TO = os.path.join(
    get_results_dir(),
    "predictions",
    "zero-shot",
    "augmented_train_predictions_ho.csv",
)

# Inference settings: checkpoint name, hypothesis-only switch, and the number of new
# predictions between two intermediate saves
MODEL = "roberta-large-mnli"
HYPOTHESIS_ONLY = True
SAVE_EVERY = 100_000

# Prefer CUDA when torch reports it as available, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Create the output directory up front (no error if it already exists)
os.makedirs(os.path.dirname(SAVE_PREDICTIONS_TO), exist_ok=True)

## Load models & data

In [4]:
# Fetch the pretrained tokenizer and sequence classifier named by MODEL,
# then move the classifier onto DEVICE
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(DEVICE)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Load the table of types to predict. Entailment predictions are made for all of these types;
# the relevant ones are filtered out later in the analysis.
# NOTE(review): granularity=-1 presumably selects every level of the type hierarchy - confirm
# in nlinec.data.load.
all_types = get_all_types(granularity=-1)
# Display the table (it has a `full_type` and a `type` column, both used further down)
all_types

Unnamed: 0,full_type,type
0,/other,other
1,/other/body_part,body_part
2,/person/title,title
3,/person,person
4,/person/athlete,athlete
...,...,...
84,/organization/stock_exchange,stock_exchange
85,/location/structure/hotel,hotel
86,/location/transit/bridge,bridge
87,/location/transit/railway,railway


In [6]:
# Load the examples of DATASET. Despite its name, `dev_data` holds whichever file DATASET
# points at (the augmented training set in this configuration).
dev_data = get_positive_data(DATASET)
# Display the loaded frame; later cells read its `sentence` and `mention_span` columns
dev_data

793487it [00:13, 59183.44it/s] 


Unnamed: 0,full_type,mention_span,sentence
0,[/other],the history and science of aviation and spacef...,"Located in Washington, D.C., United States, it..."
1,[/other],his debut,"The seventh child of the Jackson family, he ma..."
2,"[/location/country, /location]",We,We did not do anything at that time.
3,"[/other, /other/scientific]",antibody,`` We don't know the effect of our antibody on...
4,"[/location/city, /location]",Lisbon,The Visigoths of Spain were defeated when the ...
...,...,...,...
793482,"[/other/event/sports_event, /other, /other/event]",The game,"The game features 25 songs by Van Halen, 3 gu..."
793483,"[/other, /other/health/treatment, /other/health]",transfer,It marked the first peaceful transfer of power...
793484,"[/location/country, /location, /location/struc...",American,"Right now, the American populace is spending a..."
793485,"[/other/art/film, /other, /other/art]",his films,"As a boy, he donated his Bar Mitzvah money to ..."


In [7]:
# Resume from an earlier run when a predictions file is present; otherwise start from scratch
if not os.path.exists(SAVE_PREDICTIONS_TO):
    # Nothing saved yet: empty table with one row per example and one column per full type
    predictions_df = pd.DataFrame(
        index=dev_data.index,
        columns=all_types['full_type'].tolist(),
    )
else:
    # Pick up the (possibly partial) predictions written by a previous run
    print("Loading predictions from file")
    predictions_df = pd.read_csv(SAVE_PREDICTIONS_TO, index_col=0)

Loading predictions from file


In [8]:
predictions_df

Unnamed: 0,/other,/other/body_part,/person/title,/person,/person/athlete,/other/art,/other/art/music,/other/event,/other/event/holiday,/other/religion,...,/other/award,/person/coach,/other/language/programming_language,/other/product/computer,/other/event/sports_event,/organization/stock_exchange,/location/structure/hotel,/location/transit/bridge,/location/transit/railway,/other/product/mobile_phone
0,0.297377,0.559533,0.489402,0.451403,0.339446,0.304558,0.298303,0.383638,0.236006,0.194973,...,0.190824,0.168705,0.152497,0.111945,0.129105,0.274042,0.178696,0.498410,0.145678,0.167308
1,0.325321,0.505012,0.524478,0.672370,0.384948,0.370712,0.460797,0.484568,0.240758,0.314419,...,0.314922,0.412211,0.447394,0.440228,0.388163,0.409601,0.468178,0.390132,0.400595,0.524732
2,0.574099,0.637628,0.561472,0.757327,0.395174,0.469581,0.414210,0.559988,0.411841,0.514505,...,0.315747,0.257829,0.390015,0.514119,0.268936,0.312091,0.432095,0.373146,0.388750,0.398304
3,0.506737,0.570681,0.557212,0.697848,0.235979,0.414709,0.496900,0.472794,0.309136,0.337512,...,0.319197,0.185723,0.417526,0.447633,0.413721,0.326360,0.389965,0.444946,0.383983,0.501673
4,0.437963,0.612638,0.483963,0.831401,0.222693,0.441578,0.514874,0.554305,0.383985,0.489900,...,0.361034,0.250301,0.425066,0.456192,0.391034,0.300901,0.459137,0.512782,0.412230,0.496378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793482,,,,,,,,,,,...,,,,,,,,,,
793483,,,,,,,,,,,...,,,,,,,,,,
793484,,,,,,,,,,,...,,,,,,,,,,
793485,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# A row still needs predicting as long as at least one of its type columns is missing
todo = predictions_df.isna().any(axis="columns")

# Fraction of rows that are already completely predicted
(~todo).mean()

0.176436412946904

## Predict

In [10]:
# Make predictions for the remaining rows, checkpointing to SAVE_PREDICTIONS_TO along the way.


def _save_checkpoint(frame, path):
    """Write `frame` to `path` as CSV without ever exposing a partially written file.

    The CSV is first written to a temporary sibling file and only then moved over `path`.
    If the run is interrupted in the middle of a write, the previous checkpoint is therefore
    still intact and can be resumed from, instead of being left truncated.
    """
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path)
    # Same directory => same filesystem, so the replacement happens in a single step
    os.replace(tmp_path, path)


with torch.no_grad():  # Disable gradient calculation for speed
    # Keep track of how many predictions have been made since the last save
    new_predictions_counter = 0

    # Iterate over all rows that do not have a complete set of predictions yet
    for row in tqdm(dev_data.loc[todo, :].itertuples(), total=todo.sum()):

        # Predict the entailment probability of every type for this mention
        entailment_probabilities = predict_probabilities(
            model,
            tokenizer,
            row.sentence,
            row.mention_span,
            all_types['type'],
            hypothesis_only=HYPOTHESIS_ONLY)[0, :, -1]  # -1 is the entailment class

        # Store the prediction in the row that belongs to this example
        predictions_df.loc[row.Index, :] = entailment_probabilities

        # Save the predictions to file every SAVE_EVERY predictions
        new_predictions_counter += 1
        if new_predictions_counter >= SAVE_EVERY:
            _save_checkpoint(predictions_df, SAVE_PREDICTIONS_TO)
            new_predictions_counter = 0

# Save the remaining predictions to file
_save_checkpoint(predictions_df, SAVE_PREDICTIONS_TO)

100%|██████████| 653487/653487 [6:11:51<00:00, 29.29it/s]    
