In [1]:
from google.colab import drive
drive.mount("/content/drive/", force_remount=False)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
pnli_path = "/content/drive/MyDrive/precondition-inference"
import sys
sys.path.append(pnli_path)

In [3]:
%cd /content/drive/MyDrive/precondition-inference

/content/drive/MyDrive/precondition-inference


In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [6]:
!pip install -q transformers
!pip install -q datasets

In [7]:
import csv
from dataclasses import dataclass
from pathlib import Path
from typing import (Union, Tuple, List, Sequence)

import numpy as np
import torch

In [8]:
## Helper functions

class PNLIDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self,):
        return len(self.encodings["input_ids"])

    def __getitem__(self, i):
        out = {key: val[i] for key, val in self.encodings.items()}
        if self.labels is not None:
            out["label"] = torch.tensor(self.labels[i])

        return out


def read_data(
    *files: Union[Path, str],
) -> List[Tuple[str, str, int]]:
    for file in files:
        with open(file, mode='r') as f:
            reader = csv.reader(f)
            data = list(tuple(line) for line in reader)

    return data

def accuracy_score(true: Sequence, pred: Sequence) -> float:
    score = np.sum(true == pred) / len(true)

    return score

def accuracy_score_logits(logits: Sequence, true: Sequence) -> float:
    pred = np.argmax(logits, axis=-1)
    score = accuracy_score(true, pred)

    return score

def seqs_to_batch(data_seq: Sequence) -> Tuple[List[str], List[int]]:
    out = [" ".join([data[0], data[1]]) for data in data_seq]
    labels = [int(data[2]) for data in data_seq]

    return out, labels

def test_to_batch(data_seq: Sequence) -> List[str]:
    out = [" ".join([data[0], data[1]]) for data in data_seq]

    return out

In [9]:
from datasets import load_metric

def accuracy_metric(eval_pred):
    metric = load_metric("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev sentence pairs, and unlabeled test sentence pairs, into lists.

In [10]:
train_path = Path("./data/pnli_train.csv")
dev_path = Path("./data/pnli_dev.csv")
test_path = Path("./data/pnli_test_unlabeled.csv")

In [11]:
train = read_data(train_path)
dev = read_data(dev_path)

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [12]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

In [13]:
traindata, train_labels = seqs_to_batch(train)
devdata, dev_labels = seqs_to_batch(dev)

In [14]:
model_name = "roberta-large-mnli"

In [15]:
# del model
# del tokenizer

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.train()

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [17]:
train_encodings = tokenizer(traindata, truncation=False, padding=True, return_tensors="pt")
dev_encodings = tokenizer(devdata, truncation=False, padding=True, return_tensors="pt")

In [18]:
train_dataset = PNLIDataset(train_encodings, train_labels)
dev_dataset = PNLIDataset(dev_encodings, dev_labels)

In [19]:
training_args = TrainingArguments(
    output_dir="./results/",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=3e-5,
    warmup_steps=500,
    logging_dir="./logs/",
    logging_steps=100,
    evaluation_strategy="epoch"
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=accuracy_metric,
)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 5983
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 282


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.283496,0.890995
2,0.589700,0.242519,0.909953
3,0.254500,0.27063,0.906161


***** Running Evaluation *****
  Num examples = 1055
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1055
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1055
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=282, training_loss=0.35324012134092075, metrics={'train_runtime': 227.3197, 'train_samples_per_second': 78.959, 'train_steps_per_second': 1.241, 'total_flos': 1176137757997416.0, 'train_loss': 0.35324012134092075, 'epoch': 3.0})

In [22]:
model.eval()
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1055
  Batch size = 64


{'epoch': 3.0,
 'eval_accuracy': 0.9061611374407583,
 'eval_loss': 0.2706303894519806,
 'eval_runtime': 4.1187,
 'eval_samples_per_second': 256.15,
 'eval_steps_per_second': 4.128}

In [23]:
out = trainer.predict(dev_dataset)[0]
out

***** Running Prediction *****
  Num examples = 1055
  Batch size = 64


array([[-0.11649586,  4.463805  , -3.6029274 ],
       [ 5.13441   , -1.152449  , -3.669015  ],
       [ 0.80755234,  3.811908  , -4.0582666 ],
       ...,
       [ 0.12247121,  4.3830953 , -3.9605799 ],
       [ 0.21812204,  4.2825413 , -3.8725262 ],
       [ 0.6185171 ,  4.0629454 , -4.1902103 ]], dtype=float32)

In [25]:
pred = np.argmax(out, axis=-1)
accuracy_score(pred, dev_labels)

0.9061611374407583

In [19]:
import copy

In [20]:
full_train = copy.copy(train)
full_train.extend(dev)
test = read_data(test_path)

del model
del tokenizer

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.train()

full_train_data, full_train_labels = seqs_to_batch(full_train)
test_data = test_to_batch(test)

full_encoding = tokenizer(
    full_train_data, truncation=False, padding=True, return_tensors="pt"
)
test_encoding = tokenizer(
    test_data, truncation=False, padding=True, return_tensors="pt"
)

full_dataset = PNLIDataset(full_encoding, full_train_labels)
test_dataset = PNLIDataset(test_encoding)

training_args = TrainingArguments(
    output_dir="./results/",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=3e-5,
    warmup_steps=500,
    logging_dir="./logs/",
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=accuracy_metric,
)

trainer.train()

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running training *****
  Num examples = 5983
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 282


Step,Training Loss
100,0.5897
200,0.2545




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=282, training_loss=0.35324012134092075, metrics={'train_runtime': 214.3636, 'train_samples_per_second': 83.732, 'train_steps_per_second': 1.316, 'total_flos': 1176137757997416.0, 'train_loss': 0.35324012134092075, 'epoch': 3.0})

In [21]:
out = trainer.predict(test_dataset)[0]
results = np.argmax(out, axis=-1)

***** Running Prediction *****
  Num examples = 4850
  Batch size = 64


In [26]:
print(len(results))
results[:4]

4850


[1, 1, 0, 0]

In [None]:
%tensorboard --logdir logs

### Output Prediction Result File

You will need to submit a prediction result file. It should have 4850 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [23]:
# suppose you had your model's predictions on the 4850 test cases read from test_enc_unlabeled.tsv, and 
# those results are in the list called 'results'
assert (len(results) == 4850)

In [24]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [25]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')