In [1]:
import os
import json
import random
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split
from transformers import AutoTokenizer, RobertaTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch import nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import logging
import numpy as np
import time
import datetime
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging setting (synchronous
# kernel launches, slower training) — confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
def shift_tokens_right(input_ids, pad_token_id):
    """Return a copy of ``input_ids`` with every row moved one step to the right.

    Column 0 of the result holds ``pad_token_id``; the remaining columns hold
    the input's columns ``0 .. n-2``, so the last token of each row is dropped.
    The input tensor itself is not modified.
    """
    result = input_ids.new_zeros(input_ids.shape)
    # Fill the leading column first, then copy the shifted payload behind it.
    result[:, 0] = pad_token_id
    result[:, 1:] = input_ids[:, :-1]
    return result

In [3]:
class MultiClassClassifier(nn.Module):
    """Sequence classifier: pretrained encoder -> mean pooling -> dropout -> MLP head.

    Args:
        bert_model_path: Model name or path handed to ``AutoModel.from_pretrained``.
        labels_count: Number of output classes (width of the final linear layer).
        hidden_dim: Input width of the MLP head; it is applied to the pooled
            encoder output, so it has to match the encoder's hidden size.
        mlp_dim: Width of the MLP's hidden layer.
        extras_dim: Unused; kept so existing positional/keyword calls keep working.
        dropout: Dropout probability applied to the pooled representation.
        freeze_bert: When True, gradients are disabled for every encoder parameter.
    """

    def __init__(self, bert_model_path, labels_count, hidden_dim=768, mlp_dim=500, extras_dim=100, dropout=0.1, freeze_bert=False):
        super().__init__()

        # Attribute names are part of the state_dict layout and are referenced by
        # other cells (e.g. ``model.bart.encoder``), so they are kept as-is.
        self.bart = AutoModel.from_pretrained(bert_model_path,output_hidden_states=True,output_attentions=True)
        self.dropout = nn.Dropout(dropout)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, labels_count)
        )
        if freeze_bert:
            print("Freezing layers")
            for param in self.bart.parameters():
                param.requires_grad = False

    def forward(self, tokens, masks, decoder_ids=None):
        """Compute unnormalised class scores for a batch.

        Args:
            tokens: Token-id tensor passed to the encoder.
            masks: Attention mask passed to the encoder and used to weight the
                pooling; assumed to be (batch, seq) with 1 for real tokens and
                0 for padding. ``None`` falls back to an unweighted mean.
            decoder_ids: Unused; accepted for call compatibility.

        Returns:
            The MLP head's output for each example (one score per label).
        """
        output = self.bart(tokens, attention_mask=masks)
        hidden_state = output.last_hidden_state

        if masks is None:
            pooled_output = hidden_state.mean(dim=1)
        else:
            # Average only over real tokens: padded positions would otherwise
            # dilute the representation of short inputs.
            weights = masks.unsqueeze(-1).to(hidden_state.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            pooled_output = (hidden_state * weights).sum(dim=1) / token_counts

        dropout_output = self.dropout(pooled_output)
        mlp_output = self.mlp(dropout_output)

        return mlp_output

In [4]:
class EarlyStopping:
    """Stop training once the validation loss stops improving.

    Call the instance once per validation pass with the current loss. Whenever
    the loss improves, the model's ``state_dict`` is written to ``path``; after
    ``patience`` consecutive calls without improvement ``early_stop`` is set.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: When True, report each checkpoint save through ``trace_func``.
        delta: Margin added to the best score when testing for improvement.
        path: File the best ``state_dict`` is saved to.
        trace_func: Callable used for progress messages.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state."""
        # Higher score == lower loss.
        score = -val_loss

        if score != score:
            # NaN never compares as "worse", so without this guard a diverged
            # loss would overwrite the best checkpoint and poison best_score.
            improved = False
        elif self.best_score is None:
            improved = True
        else:
            improved = not (score < self.best_score + self.delta)

        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [5]:
def get_features(data):
  """Build one model input string per record.

  Each record must provide string values under "claim" and "doc"; the result
  for a record is "[Claim]:" + claim + "[Evidences]:" + doc, in input order.
  """
  return [
    "".join(("[Claim]:", record["claim"], "[Evidences]:", record["doc"]))
    for record in data
  ]

def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    ``preds`` holds one row of scores per example (argmax is taken along
    axis 1); ``labels`` is an array of class ids and is flattened before the
    comparison.
    """
    predicted_ids = np.argmax(preds, axis=1).flatten()
    true_ids = labels.flatten()
    n_correct = np.sum(predicted_ids == true_ids)
    return n_correct / len(true_ids)

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to whole seconds before being formatted through
    ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [6]:
# Configure root logging at ERROR level.
logging.basicConfig(level=logging.ERROR)

# Select the compute device once; later cells move batches onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# NOTE(review): the message below says "bart", but the checkpoint loaded is a
# RoBERTa one — kept verbatim here so the cell's output is unchanged.
print('Loading bart tokenizer...')

tokenizer = AutoTokenizer.from_pretrained("AnReu/math_pretrained_roberta")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA L4
Loading bart tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/9.95k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [7]:
# Load the training and validation claim records.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("train_claims_quantemp.json", encoding="utf-8") as f:
    train_data = json.load(f)

with open("val_claims_quantemp.json", encoding="utf-8") as f:
    val_data = json.load(f)

In [8]:
# Raw label value of every record, in dataset order.
train_labels = [record["label"] for record in train_data]
val_labels = [record["label"] for record in val_data]

# One "[Claim]:...[Evidences]:..." input string per record.
train_features = get_features(train_data)
val_features = get_features(val_data)

# The label -> integer id mapping is learned from the training labels only and
# then reused for the validation labels.
LE = LabelEncoder()
train_labels_final = LE.fit_transform(train_labels)
val_labels_final = LE.transform(val_labels)

In [9]:
# Every input is truncated/padded to this many tokens.
MAX_SEQ_LEN = 256


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize ``texts`` into fixed-length tensors.

    Returns a ``(input_ids, attention_mask)`` pair; each tensor has one row per
    text and ``max_length`` columns. Uses ``padding='max_length'`` in place of
    the deprecated ``pad_to_max_length=True`` flag.
    """
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']


input_ids, attention_masks = encode_texts(train_features)
val_input_ids, val_attention_masks = encode_texts(val_features)

# as_tensor keeps this cell re-runnable: it accepts both the encoder's numpy
# output and an already-converted tensor without warnings.
train_labels_final = torch.as_tensor(train_labels_final, dtype=torch.long)
val_labels_final = torch.as_tensor(val_labels_final, dtype=torch.long)



In [10]:
# Number of distinct label values present in the training split.
num_classes = len(set(train_labels))

batch_size = 16

# Each dataset item is an (input ids, attention mask, label id) triple.
dataset = TensorDataset(input_ids, attention_masks, train_labels_final)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels_final)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(dataset)
val_sampler = SequentialSampler(val_dataset)

train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [11]:
# NOTE(review): the random seeds are set in the training cell further down, so
# the MLP head initialised here is not covered by them.
model = MultiClassClassifier(
    'AnReu/math_pretrained_roberta',
    labels_count=num_classes,  # derived from the training labels instead of a hard-coded 3
    hidden_dim=768,
    mlp_dim=16,
    extras_dim=140,
    dropout=0.1,
    freeze_bert=False,
)
# Follow the device chosen in the setup cell so the CPU fallback keeps working
# (model.cuda() raises when no GPU is present).
model.to(device)

# Freeze the first five encoder layers; only the remaining layers and the MLP
# head receive gradient updates.
for param in model.bart.encoder.layer[0:5].parameters():
    param.requires_grad = False

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

epochs = 20
# One scheduler step per optimizer step over the whole run.
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

loss_func = nn.CrossEntropyLoss()

config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at AnReu/math_pretrained_roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
seed_val = 42

# NOTE(review): the model was already built in an earlier cell, so these seeds
# do not cover the classifier head's initialisation.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

# Export directory for the tokenizer and the best model weights.
output_dir = 'model_numt5_large/'
os.makedirs(output_dir, exist_ok=True)
tokenizer.save_pretrained(output_dir)

# Measure the total training time for the whole run.
total_t0 = time.time()
early_stopping = EarlyStopping(patience=2, verbose=True)

# For each epoch...
for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_accuracy = 0
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        # Labels are used directly as class ids; the former expand /
        # shift_tokens_right / [:, 0] round-trip produced an unused tensor and
        # handed back the same labels.
        b_labels = batch[2].to(device)

        model.zero_grad()

        probas = model(b_input_ids, b_input_mask)
        loss = loss_func(probas, b_labels)
        total_train_loss += loss.item()

        loss.backward()

        optimizer.step()
        # Advance the linear decay schedule; without this call the learning
        # rate never leaves its initial value.
        scheduler.step()

        logits = probas.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logits, label_ids)

    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print(" Train Accuracy: {0:.2f}".format(avg_train_accuracy))

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(b_input_ids, b_input_mask)
            loss = loss_func(logits, b_labels)

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    early_stopping(avg_val_loss, model)

    # The counter is zero only when this epoch improved the validation loss, so
    # the exported weights always belong to the best epoch rather than the
    # most recent one.
    if early_stopping.counter == 0:
        print("Saving model to %s" % output_dir)
        torch.save(model.state_dict(), os.path.join(output_dir, 'model_weights'))

    # Record all statistics from this epoch (including the one that triggers
    # early stopping).
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    if early_stopping.early_stop:
        print("Early stopping")
        break

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...




  Batch    40  of    621.    Elapsed: 0:00:15.
  Batch    80  of    621.    Elapsed: 0:00:28.


KeyboardInterrupt: 

In [None]:
# Pack the model_numt5_large directory (the training cell's output_dir) into math_roberta.zip.
!zip -r ./math_roberta.zip model_numt5_large

In [None]:
# Colab-only: download the archive written by the zip cell. The previous path
# ("/content/model.zip") is not created by any cell in this notebook.
from google.colab import files
files.download("math_roberta.zip")