# Config

In [1]:
class config:
    """Notebook-wide settings: runtime switches, optimisation schedule and file locations."""

    # ---- runtime ----
    device = None                      # filled in by train_main() before training starts
    seed = 37
    fp16 = True
    writer = False                     # truthy -> scalars are logged through SummaryWriter

    # ---- data loading ----
    num_workers = 2
    prefetch_factor = 2
    load_examples_num_workers = 2
    max_seq_length = 256
    train_batch_size = 1
    eval_batch_size = 1

    # ---- optimisation ----
    train_epochs = 10
    max_step = 1000000
    gradient_accumulation_steps = 6
    learning_rate = 1e-5
    warm_up = 0.1                      # fraction of the total optimisation steps used for warm-up
    weight_decay = 0.01
    adam_epsilon = 1e-6
    adam_betas = (0.9, 0.98)
    max_grad_norm = 0.0                # falsy value: train_model() skips gradient clipping

    # ---- reporting / checkpoints ----
    logging_steps = 100
    save_steps = 773

    # ---- pretrained model ----
    pretrained_model_name_or_path = 'roberta-large'
    pretrained_model_name_or_path_cache = 'pretrained'

    # ---- local paths ----
    train_data_path = '/content/dataset/train.json'
    val_data_path = '/content/dataset/val.json'
    test_data_path = '/content/dataset/test.json'
    # NOTE(review): the two paths below are relative (no leading '/'), unlike the
    # dataset paths above -- confirm this is intended.
    output_path = 'content/output'
    tensor_cache_path = 'content/tensor/'

# Installs and Imports

In [2]:
# Fetch NLTK resources up front. `sent_tokenize` is imported further down in this
# notebook and presumably relies on `punkt`; the other three resources are not
# referenced by name in the visible code -- TODO confirm they are still needed.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.26.1


In [4]:
# Enable Git LFS, then clone the Hugging Face `roberta-large` repository into
# ./roberta-large (the same string `config.pretrained_model_name_or_path` holds).
!git lfs install
!git clone https://huggingface.co/roberta-large

Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
Cloning into 'roberta-large'...
remote: Enumerating objects: 69, done.[K
remote: Total 69 (delta 0), reused 0 (delta 0), pack-reused 69[K
Unpacking objects: 100% (69/69), 1.38 MiB | 1.52 MiB/s, done.
Filtering content: 100% (4/4), 5.49 GiB | 43.94 MiB/s, done.


In [5]:
# Mount Google Drive at /content/drive; the dataset archive unpacked in the next
# cell is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import shutil

# Where the dataset archive lives, where it is extracted to, and its format.
filename = "/content/drive/MyDrive/Thesis/reclor_dataset.zip"
extract_dir = "/content/"
archive_format = "zip"

# Extract the whole archive into the target directory.
shutil.unpack_archive(filename, extract_dir=extract_dir, format=archive_format)

print("Archive file unpacked successfully.")

Archive file unpacked successfully.


In [7]:
import os
import json
import torch
import logging
import random
import numpy as np

from torch import nn, Tensor
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler
from torch.cuda.amp import GradScaler
from transformers import AutoTokenizer, get_linear_schedule_with_warmup, AdamW, PreTrainedTokenizer
from transformers.modeling_outputs import MultipleChoiceModelOutput
from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel, RobertaConfig, RobertaLMHead

from collections import Counter
from functools import partial
from multiprocessing import Pool
from typing import Dict, List
from nltk import sent_tokenize
from tqdm import tqdm
from abc import ABC

# TensorBoard is only imported when `config.writer` is enabled, so the name
# `SummaryWriter` exists only in that case.
if config.writer:
    from torch.utils.tensorboard import SummaryWriter

# Root logger at INFO level for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

# Model Dependencies

In [8]:
def get_accuracy(logits, labels):
    """Accuracy of arg-max predictions over the entries whose label is not -1.

    Args:
        logits: score tensor whose leading dimensions match ``labels``.
        labels: integer label tensor; ``-1`` marks entries without a gold label.

    Returns:
        ``(accuracy, labelled_count)``; ``(0, 0)`` when no entry carries a label.
    """
    assert logits.shape[:-1] == labels.shape

    predictions = logits.max(dim=-1).indices
    labelled_count = labels.ne(-1).sum().item()
    hits = predictions.eq(labels).sum().item()

    if labelled_count == 0:
        return 0, 0
    return hits * 1.0 / labelled_count, labelled_count


class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        """Clear the last value, running sum, count and average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` and refresh the running average.

        Both arguments may be Python numbers or single-element tensors; tensors
        are converted with ``.item()``.
        """
        if isinstance(val, torch.Tensor):
            val = val.item()
        if isinstance(n, torch.Tensor):
            n = n.item()

        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. only n=0 updates so far).
        if self.count > 0:
            self.avg = self.sum / self.count
        else:
            self.avg = 0

    def save(self):
        """Return the meter state as a plain dict (the counterpart of ``load``)."""
        return {
            'val': self.val,
            'avg': self.avg,
            'sum': self.sum,
            'count': self.count
        }

    def load(self, value: dict):
        """Restore the state from a dict produced by ``save``.

        Missing keys default to 0. ``None`` resets the meter; previously the
        method reset and then fell through to the key lookups, which raised
        ``TypeError`` on ``None``.
        """
        if value is None:
            self.reset()
            return
        self.val = value.get('val', 0)
        self.avg = value.get('avg', 0)
        self.sum = value.get('sum', 0)
        self.count = value.get('count', 0)
        
class LogMetric(object):
    """
    Keeps one ``AverageMeter`` per metric name so all metrics can be logged together.
    """

    def __init__(self, *metric_names):
        self.metrics = {}
        for name in metric_names:
            self.metrics[name] = AverageMeter()

    def update(self, metric_name, val, n=1):
        """Forward ``val`` (weighted by ``n``) to the meter registered as ``metric_name``."""
        meter = self.metrics[metric_name]
        meter.update(val, n)

    def reset(self, metric_name=None):
        """Reset a single meter, or every meter when ``metric_name`` is None."""
        if metric_name is not None:
            self.metrics[metric_name].reset()
            return

        for meter in self.metrics.values():
            meter.reset()

    def get_log(self):
        """Return ``{metric name: current running average}``."""
        return {name: meter.avg for name, meter in self.metrics.items()}

class LogMixin:
    """Mixin that gives a model a ``LogMetric`` store for evaluation metrics."""

    # String annotation: the value is only ever None or a LogMetric created by
    # ``init_metric``; nothing is looked up at class-definition time.
    eval_metrics: "LogMetric" = None

    def init_metric(self, *metric_names):
        """Create the metric store with one meter per name in ``metric_names``."""
        self.eval_metrics = LogMetric(*metric_names)

    def get_eval_log(self, reset=False):
        """Return the current evaluation metrics.

        Args:
            reset: when true, all meters are reset after their averages are read.

        Returns:
            ``(text, results)`` where ``results`` maps metric name to running
            average and ``text`` is the tab-joined ``"name: value"`` rendering.
            If ``init_metric`` was never called, a notice is printed and
            ``("", {})`` is returned; previously the code printed the notice and
            then crashed with ``AttributeError`` on ``None``.
        """
        if self.eval_metrics is None:
            print("The `eval_metrics` attribute hasn't been initialized.")
            return "", {}

        results = self.eval_metrics.get_log()

        _eval_metric_log = '\t'.join(f"{k}: {v}" for k, v in results.items())

        if reset:
            self.eval_metrics.reset()

        return _eval_metric_log, results

# Model

In [9]:
class RobertaForMultipleChoice(RobertaPreTrainedModel, LogMixin, ABC):
    """RoBERTa multiple-choice scorer with a small Transformer encoder over the options.

    Each (context, question+option) sequence is encoded by RoBERTa; the resulting
    per-option vectors of one example are passed through a 3-layer
    ``TransformerEncoder`` so the options can attend to each other, and a linear
    layer turns every option vector into one logit.

    In evaluation mode (``self.training`` is False) and when labels are given,
    ``forward`` also accumulates "loss" and "acc" into ``self.eval_metrics``.
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config: RobertaConfig,
                 re_init_cls: bool = False,
                 fs_checkpoint: bool = False,
                 fs_checkpoint_offload_to_cpu: bool = False,
                 fs_checkpoint_maintain_forward_counter: bool = False,
                 freeze_encoder: bool = False,
                 no_pooler: bool = False):
        """Build the encoder, LM head, option-level transformer and classifier.

        Args:
            config: RoBERTa configuration.
            re_init_cls: when true, an additional ``classifier_i`` head is created
                and used for scoring instead of ``classifier``.
            fs_checkpoint, fs_checkpoint_offload_to_cpu,
            fs_checkpoint_maintain_forward_counter: accepted for signature
                compatibility; not used by this implementation.
            freeze_encoder: when true, all ``self.roberta`` parameters get
                ``requires_grad = False``.
            no_pooler: when true, the first token's hidden state is used instead
                of the pooler output.
        """
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.re_init_cls = re_init_cls
        if self.re_init_cls:
            self.classifier_i = nn.Linear(config.hidden_size, 1)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.no_pooler = no_pooler
        self.freeze_encoder = freeze_encoder
        if freeze_encoder:
            for param in self.roberta.parameters():
                param.requires_grad = False

        self.init_weights()

        self.init_metric("loss", "acc")
        # Created with the default batch_first=False, i.e. it consumes inputs
        # laid out as (sequence, batch, feature).
        self.transformer = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(d_model=config.hidden_size, nhead=8), num_layers=3)

    @staticmethod
    def fold_tensor(x: Tensor):
        """Collapse all leading dimensions of ``x`` into one; ``None`` passes through."""
        if x is None:
            return x
        return x.reshape(-1, x.size(-1))

    def forward(
            self,
            input_ids: Tensor,
            attention_mask: Tensor = None,
            token_type_ids: Tensor = None,
            labels: Tensor = None,
            sentence_index: Tensor = None,
            sentence_mask: Tensor = None,
            sent_token_mask: Tensor = None,
            mlm_labels: Tensor = None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
    ):
        """Score every option of every example.

        Args:
            input_ids: token ids; dimension 1 is read as the number of choices and
                the last dimension as the sequence length.
            attention_mask: optional mask with the same layout as ``input_ids``.
                Options whose mask is all zero get ``-10000`` added to their logit.
            token_type_ids: optional, same layout as ``input_ids``.
            labels: optional gold option index per example; ``-1`` is ignored by
                the loss.
            sentence_index, sentence_mask, sent_token_mask: accepted so that the
                collated batch can be passed with ``**batch``; not used here.
            mlm_labels: optional masked-LM targets; when given (together with
                ``labels``) the LM-head cross-entropy is added to the loss.
            output_attentions, output_hidden_states, return_dict: forwarded to
                the RoBERTa encoder.

        Returns:
            ``MultipleChoiceModelOutput`` (or a tuple when ``return_dict`` is
            false) whose logits have shape ``(-1, num_choices)``.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1]

        input_ids = self.fold_tensor(input_ids)
        attention_mask = self.fold_tensor(attention_mask)
        token_type_ids = self.fold_tensor(token_type_ids)

        outputs = self.roberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        if self.no_pooler:
            pooled_output = outputs[0][:, 0]
        else:
            pooled_output = outputs[1]

        # Let the options of ONE example attend to each other. Feeding the flat
        # [batch * num_choices, hidden] tensor directly would be interpreted as a
        # single unbatched sequence, so with batch size > 1 options of different
        # examples would attend to each other. Lay the tensor out as
        # (num_choices, batch, hidden) instead -- the (sequence, batch, feature)
        # layout the encoder expects -- and flatten it again afterwards.
        hidden_size = pooled_output.size(-1)
        choice_states = pooled_output.reshape(-1, num_choices, hidden_size).transpose(0, 1)
        choice_states = self.transformer(choice_states)
        pooled_output = choice_states.transpose(0, 1).reshape(-1, hidden_size)

        pooled_output = self.dropout(pooled_output)
        if self.re_init_cls:
            logits = self.classifier_i(pooled_output)
        else:
            logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        # Push options that consist only of padding far below the real ones.
        # `attention_mask` is optional, so only do this when it was supplied.
        if attention_mask is not None:
            choice_mask = (attention_mask.sum(dim=-1) == 0).reshape(-1, num_choices)
            reshaped_logits = reshaped_logits + choice_mask * -10000.0

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(reshaped_logits, labels)

            if mlm_labels is not None:
                mlm_scores = self.lm_head(outputs[0])
                mlm_loss = loss_fct(mlm_scores.reshape(-1, self.config.vocab_size), mlm_labels.reshape(-1))
                loss = loss + mlm_loss

            if not self.training:
                acc, true_label_num = get_accuracy(reshaped_logits, labels)
                self.eval_metrics.update("acc", val=acc, n=true_label_num)
                self.eval_metrics.update("loss", val=loss.item(), n=true_label_num)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Data Providers

In [10]:
def get_sep_tokens(_tokenizer):
    """Return the separator tokens placed between two segments of a pair.

    The count is ``max_len_single_sentence - max_len_sentences_pair`` as reported
    by the tokenizer; each entry is ``_tokenizer.sep_token``.
    """
    sep_count = _tokenizer.max_len_single_sentence - _tokenizer.max_len_sentences_pair
    return [_tokenizer.sep_token for _ in range(sep_count)]


def is_bpe(_tokenizer: PreTrainedTokenizer):
    """Tell whether the tokenizer's class name is one of the listed RoBERTa /
    Longformer / BART tokenizer classes (slow or fast variant)."""
    families = ("Roberta", "Longformer", "Bart")
    known_class_names = [f"{family}Tokenizer" for family in families]
    known_class_names += [f"{family}TokenizerFast" for family in families]
    return _tokenizer.__class__.__name__ in known_class_names


def load_dataset(config, tokenizer, split='train'):
    """Build the ``TensorDataset`` for one data split.

    Args:
        config: object exposing ``train_data_path`` / ``val_data_path`` /
            ``test_data_path``, ``max_seq_length`` and ``load_examples_num_workers``.
        tokenizer: tokenizer handed to ``convert_examples_into_features``.
        split: ``'train'``, ``'val'`` or ``'test'``.

    Returns:
        ``(dataset, features)``.

    Raises:
        Exception: carrying ``split`` when it is not one of the three names.
    """
    path_attributes = (
        ('train', 'train_data_path'),
        ('val', 'val_data_path'),
        ('test', 'test_data_path'),
    )
    for split_name, attribute in path_attributes:
        if split == split_name:
            file_path = getattr(config, attribute)
            break
    else:
        raise Exception(split)

    _, features, tensors = convert_examples_into_features(
        file_path=file_path,
        tokenizer=tokenizer,
        max_seq_length=config.max_seq_length,
        num_workers=config.load_examples_num_workers,
        suffix=split,
    )
    return TensorDataset(*tensors), features


def collator(batch):
    """Stack dataset samples into a model-ready batch and expand sentence spans.

    Each sample is a 5-tuple ``(input_ids, attention_mask, token_type_ids, labels,
    sentence_spans)`` or the same 4-tuple without ``token_type_ids``. Spans are
    ``(start, end)`` pairs padded with ``-1``.

    Returns:
        dict with ``input_ids``, ``attention_mask``, ``labels``,
        ``sentence_index`` (token positions of every sentence, zero padded),
        ``sentence_mask`` (which sentence slots are real), ``sent_token_mask``
        (which positions of ``sentence_index`` are real) and, for 5-tuples,
        ``token_type_ids``.

    Raises:
        RuntimeError: when a sample has neither 4 nor 5 fields.
    """
    columns = list(zip(*batch))
    field_count = len(batch[0])
    if field_count == 5:
        input_ids, attention_mask, token_type_ids, labels, sentence_spans = columns
    elif field_count == 4:
        input_ids, attention_mask, labels, sentence_spans = columns
        token_type_ids = None
    else:
        raise RuntimeError()

    input_ids = torch.stack(input_ids, dim=0)
    attention_mask = torch.stack(attention_mask, dim=0)
    labels = torch.stack(labels, dim=0)
    sentence_spans = torch.stack(sentence_spans, dim=0)

    n_samples, n_options, _, _ = sentence_spans.shape
    # Longest sentence (in tokens) anywhere in the batch.
    longest_sentence = (sentence_spans[..., 1] - sentence_spans[..., 0]).max().item()
    # [batch, option_num, max_sent_num]: True where a real span is stored.
    sent_mask = sentence_spans[..., 0].ne(-1)
    # Largest number of real sentences any option has; trim the padding beyond it.
    sent_limit = sent_mask.sum(dim=2).max().item()
    sentence_spans = sentence_spans[:, :, :sent_limit]
    sent_mask = sent_mask[:, :, :sent_limit]

    sentence_index = torch.zeros(n_samples, n_options, sent_limit, longest_sentence, dtype=torch.long)
    sent_token_mask = torch.zeros(n_samples, n_options, sent_limit, longest_sentence, dtype=torch.long)
    for b_id, option_rows in enumerate(sentence_spans.tolist()):
        for op_id, span_rows in enumerate(option_rows):
            for sent_id, (start, end) in enumerate(span_rows):
                if start == -1:
                    # First padded slot: nothing after it is read for this option.
                    break
                width = end - start
                sentence_index[b_id, op_id, sent_id, :width] = torch.arange(start, end, dtype=torch.long)
                sent_token_mask[b_id, op_id, sent_id, :width] = 1

    outputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "sentence_index": sentence_index,
        "sentence_mask": sent_mask,
        "sent_token_mask": sent_token_mask
    }
    if token_type_ids is not None:
        outputs["token_type_ids"] = torch.stack(token_type_ids, dim=0)

    return outputs


def read_examples(file_path: str):
    """Load multiple-choice examples from a JSON file.

    The file must hold a list of objects with the keys ``"context"``,
    ``"question"`` and ``"answers"``; ``"label"`` is optional.

    Args:
        file_path: path of the JSON file.

    Returns:
        A list of dicts with the keys ``context``, ``question``, ``options``
        (taken from ``"answers"``) and ``label`` (``-1`` when the sample has none).
    """
    # Use a context manager so the file handle is closed deterministically, and
    # decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    examples = [
        {
            "context": sample["context"],
            "question": sample["question"],
            "options": sample["answers"],
            "label": sample["label"] if "label" in sample else -1,
        }
        for sample in data
    ]

    print(f"{len(examples)} examples are loaded from {file_path}.")
    return examples


def _convert_example_to_features(example, tokenizer, max_seq_length):
    """Turn one example into per-option model inputs plus sentence token spans.

    Args:
        example: dict with the keys "context", "question", "options" and "label"
            (the shape produced by ``read_examples``).
        tokenizer: tokenizer used for tokenising, truncating and final encoding.
        max_seq_length: length every encoded option is padded to.

    Returns:
        ``{"features": [...], "label": example["label"]}`` where each entry of
        ``features`` holds "input_ids", "attention_mask", "token_type_ids"
        (``None`` if the tokenizer does not return them) and "sentence_spans",
        a list of ``(start, end)`` token offsets, one per surviving sentence.
    """
    context = example["context"]
    question = example["question"]
    # Sentence-split the context; empty strings are dropped.
    context_sentences = [sent for sent in sent_tokenize(context) if sent]

    # Every context token is tagged with the index of the sentence it came from.
    context_tokens = []
    for _sent_id, _sent in enumerate(context_sentences):
        _sent_tokens = tokenizer.tokenize(_sent)
        context_tokens.extend([(_sent_id, _tok) for _tok in _sent_tokens])

    # The question counts as one more "sentence" right after the context sentences.
    _q_sent_id_offset = len(context_sentences)
    question_tokens = [(_q_sent_id_offset, _tok) for _tok in tokenizer.tokenize(question)]

    features = []
    for option in example["options"]:
        sep_tokens = get_sep_tokens(tokenizer)
        # The option is the "sentence" following the question.
        _op_sent_id_offset = _q_sent_id_offset + 1
        opt_tokens = [(_op_sent_id_offset, _tok) for _tok in tokenizer.tokenize(option)]

        # Tokens that do not fit into max_seq_length. The bracketed term is
        # presumably the number of special tokens added to a sequence pair --
        # TODO confirm against the tokenizer in use.
        lens_to_remove = len(context_tokens) + len(question_tokens) + len(opt_tokens) + len(sep_tokens) + (
                tokenizer.model_max_length - tokenizer.max_len_sentences_pair) - max_seq_length

        # Second sequence = question + separator tokens + option. The separator
        # tokens are plain strings, the others (sentence_id, token) tuples.
        tru_c_tokens, tru_q_o_tokens, _ = tokenizer.truncate_sequences(context_tokens,
                                                                       question_tokens + sep_tokens + opt_tokens,
                                                                       num_tokens_to_remove=lens_to_remove,
                                                                       truncation_strategy=TruncationStrategy.LONGEST_FIRST)

        c_tokens, q_op_tokens = [], []
        # sentence id -> number of its tokens that survived truncation
        sent_id_map = Counter()

        for _sent_id, _tok in tru_c_tokens:
            sent_id_map[_sent_id] += 1
            c_tokens.append(_tok)

        for _tok in tru_q_o_tokens:
            if isinstance(_tok, tuple):
                _sent_id, _tok = _tok
                q_op_tokens.append(_tok)
                sent_id_map[_sent_id] += 1
            elif isinstance(_tok, str):
                # Separator token: kept in the text but not counted for any sentence.
                q_op_tokens.append(_tok)
            else:
                raise RuntimeError(_tok)

        # Offsets start at 1 -- presumably to skip the leading special token;
        # TODO confirm.
        sent_span_offset = 1
        sent_spans = []
        for i in range(len(context_sentences) + 2):
            # Skip over the separator-token block before the question and again
            # before the option.
            if i == _q_sent_id_offset or i == _op_sent_id_offset:
                sent_span_offset += (tokenizer.max_len_single_sentence - tokenizer.max_len_sentences_pair)
            if i in sent_id_map:
                _cur_len = sent_id_map.pop(i)
                sent_spans.append((sent_span_offset, sent_span_offset + _cur_len))
                sent_span_offset += _cur_len
        # Every counted sentence id must have been turned into a span.
        assert not sent_id_map

        # Re-encode from strings so the tokenizer adds special tokens and padding.
        tokenizer_outputs = tokenizer(tokenizer.convert_tokens_to_string(c_tokens),
                                      text_pair=tokenizer.convert_tokens_to_string(q_op_tokens),
                                      padding=PaddingStrategy.MAX_LENGTH,
                                      max_length=max_seq_length)
        assert len(tokenizer_outputs["input_ids"]) == max_seq_length, (
        len(c_tokens), len(q_op_tokens), len(tokenizer_outputs["input_ids"]))
        features.append({
            "input_ids": tokenizer_outputs["input_ids"],
            "attention_mask": tokenizer_outputs["attention_mask"],
            "token_type_ids": tokenizer_outputs["token_type_ids"] if "token_type_ids" in tokenizer_outputs else None,
            "sentence_spans": sent_spans,
        })

    return {
        "features": features,
        "label": example["label"]
    }


def _data_to_tensors(features):
    data_num = len(features)
    option_num = len(features[0]["features"])

    input_ids = torch.tensor([[op["input_ids"] for op in f["features"]] for f in features])
    attention_mask = torch.tensor([[op["attention_mask"] for op in f["features"]] for f in features], dtype=torch.long)
    if features[0]["features"][0]["token_type_ids"] is not None:
        token_type_ids = torch.tensor([[op["token_type_ids"] for op in f["features"]] for f in features],
                                      dtype=torch.long)
    else:
        token_type_ids = None
    labels = torch.tensor([f["label"] for f in features], dtype=torch.long)

    # List[List[List[Tuple[int, int]]]]
    sentence_spans_ls = [[op["sentence_spans"] for op in f["features"]] for f in features]
    max_sent_num = 0
    for f in sentence_spans_ls:
        f_max_sent_num = max(map(len, f))
        max_sent_num = max(f_max_sent_num, max_sent_num)

    sentence_spans = torch.zeros(data_num, option_num, max_sent_num, 2, dtype=torch.long).fill_(-1)
    for f_id, f in enumerate(sentence_spans_ls):
        for op_id, op in enumerate(f):
            f_op_sent_num = len(op)
            sentence_spans[f_id, op_id, :f_op_sent_num] = torch.tensor(op, dtype=torch.long)

    if token_type_ids is not None:
        return input_ids, attention_mask, token_type_ids, labels, sentence_spans
    else:
        return input_ids, attention_mask, labels, sentence_spans


def convert_examples_into_features(file_path, tokenizer, max_seq_length, num_workers = 16, suffix=''):
    """Read a dataset file, featurise it in parallel and cache the result.

    The cache file lives in ``config.tensor_cache_path`` (module-level ``config``)
    and is named ``<tokenizer>_<max_seq_length>_<suffix>``, where ``<tokenizer>``
    is the lower-cased tokenizer class name without ``Tokenizer`` /
    ``TokenizerFast``. The name does not depend on ``file_path``, so a cache made
    from a different file under the same suffix is reused as-is.

    Args:
        file_path: JSON file passed to ``read_examples``.
        tokenizer: tokenizer used for featurisation.
        max_seq_length: padded sequence length.
        num_workers: size of the multiprocessing pool.
        suffix: distinguishes cache files (e.g. the split name).

    Returns:
        ``(examples, features, tensors)``.
    """
    tokenizer_name = tokenizer.__class__.__name__
    tokenizer_name = tokenizer_name.replace('TokenizerFast', '')
    tokenizer_name = tokenizer_name.replace('Tokenizer', '').lower()

    file_suffix = f"{tokenizer_name}_{max_seq_length}_{suffix}"
    # os.path.join gives the same path as plain concatenation when the directory
    # ends with a separator, and still a correct one when it does not.
    cache_dir = config.tensor_cache_path
    cached_file_path = os.path.join(cache_dir, file_suffix)

    if os.path.exists(cached_file_path):
        print(f"Loading cached file from {cached_file_path}")
        # NOTE(review): torch.load unpickles the file; only load cache files that
        # this notebook wrote itself.
        examples, features, tensors = torch.load(cached_file_path)
        return examples, features, tensors

    examples = read_examples(file_path)

    _annotate = partial(_convert_example_to_features, tokenizer=tokenizer, max_seq_length=max_seq_length)
    with Pool(num_workers) as p:
        features = list(tqdm(
            p.imap(_annotate, examples, chunksize=32),
            total=len(examples),
            desc='converting examples to features:'
        ))

    print("Transform features into tensors...")
    tensors = _data_to_tensors(features)

    print(f"Saving processed features into {cached_file_path}.")
    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(cache_dir, exist_ok=True)
    torch.save((examples, features, tensors), cached_file_path)

    return examples, features, tensors

In [11]:
def forward_and_backward(model, inputs, config, scaler):
    """Run one forward/backward pass and return the (accumulation-scaled) loss value.

    Args:
        model: callable invoked as ``model(**inputs)``; its result must expose ``"loss"``.
        inputs: keyword arguments for the model.
        config: provides ``fp16`` and ``gradient_accumulation_steps``.
        scaler: gradient scaler, or a falsy value to back-propagate the raw loss.

    Returns:
        The loss as a Python float, after division by the accumulation steps.
    """
    def _compute_loss():
        return model(**inputs)["loss"]

    # Mixed precision is used only when it is enabled AND a scaler is available.
    if config.fp16 and scaler:
        with torch.cuda.amp.autocast():
            loss = _compute_loss()
    else:
        loss = _compute_loss()

    accumulation_steps = config.gradient_accumulation_steps
    if accumulation_steps > 1:
        loss = loss / accumulation_steps

    backward_target = scaler.scale(loss) if scaler else loss
    backward_target.backward()

    return loss.item()

def batch_to_device(batch, device):
    """Return a new dict with every value of ``batch`` moved to ``device`` via ``.to``."""
    return {key: value.to(device) for key, value in batch.items()}

def _evaluate(model, data_loader, device, use_amp):
    """Return ``(mean batch loss, accuracy)`` of ``model`` over ``data_loader``.

    Accuracy is the number of correct arg-max predictions divided by the number
    of examples whose label is not -1.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    correct_predictions = 0
    num_labeled = 0

    with torch.no_grad():
        for val_batch in data_loader:
            val_batch = batch_to_device(val_batch, device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(**val_batch)
            targets = val_batch["labels"]
            predictions = outputs.logits.argmax(dim=-1)

            total_loss += outputs.loss.item()
            num_batches += 1
            correct_predictions += (predictions == targets).sum().item()
            num_labeled += (targets != -1).sum().item()

    mean_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct_predictions / num_labeled if num_labeled else 0.0
    return mean_loss, accuracy


def train_model(config, train_dataset, val_dataset, model, tokenizer, start_global_step):
    """Fine-tune ``model`` on ``train_dataset`` and validate after every epoch.

    Args:
        config: settings object (batch sizes, optimiser hyper-parameters, paths,
            ``device``, logging/saving intervals ...).
        train_dataset: dataset sampled randomly for training.
        val_dataset: dataset evaluated sequentially at the end of each epoch.
        model: model to train; must support ``save_pretrained``.
        tokenizer: saved next to every checkpoint.
        start_global_step: optimisation steps to skip (scheduler is still
            advanced) before training resumes.

    Returns:
        ``(global_step, average training loss per optimisation step)``; the
        average is 0.0 when no optimisation step was taken.
    """
    output_path_split = config.output_path.split('/')
    log_dir = '/'.join([output_path_split[0], 'runs'] + output_path_split[1:])
    writer = SummaryWriter(log_dir=log_dir) if config.writer else None

    loader_kwargs = {
        'collate_fn': collator,
        'num_workers': config.num_workers,
        'pin_memory': True,
    }
    # DataLoader rejects an explicit prefetch_factor without worker processes.
    if config.num_workers > 0:
        loader_kwargs['prefetch_factor'] = config.prefetch_factor

    train_loader = DataLoader(dataset=train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=config.train_batch_size,
                              **loader_kwargs)

    # Validation needs no shuffling and has its own batch-size setting.
    val_loader = DataLoader(dataset=val_dataset,
                            sampler=SequentialSampler(val_dataset),
                            batch_size=config.eval_batch_size,
                            **loader_kwargs)

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
    grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if
                       (not any(nd in n for nd in no_decay)) and p.requires_grad],
            'weight_decay': config.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters() if (any(nd in n for nd in no_decay)) and p.requires_grad],
            'weight_decay': 0.0
        }
    ]

    total_steps = len(train_loader) // config.gradient_accumulation_steps * config.train_epochs

    optimizer = AdamW(grouped_parameters,
                      lr=config.learning_rate,
                      eps=config.adam_epsilon,
                      betas=config.adam_betas)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(total_steps * config.warm_up),
                                                num_training_steps=total_steps)

    # Mixed precision is only used on CUDA; evaluation follows the same switch.
    use_amp = bool(config.fp16 and config.device.type == 'cuda')
    scaler = GradScaler() if use_amp else None

    print(optimizer)
    print("-- Start Training --")
    print("  Num examples = ", len(train_dataset))
    print("  Num Epochs = ", config.train_epochs)
    print("  Batch size = ", config.train_batch_size)
    print("  Gradient Accumulation steps = ", config.gradient_accumulation_steps)
    print("  Total optimization steps = ", total_steps)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)

    for epoch in range(config.train_epochs):
        for step, batch in enumerate(train_loader):

            # Resuming: consume batches and advance the LR schedule without training.
            if global_step < start_global_step:
                if (step + 1) % config.gradient_accumulation_steps == 0:
                    scheduler.step()
                    global_step += 1
                continue

            model.train()
            batch = batch_to_device(batch, config.device)

            loss = forward_and_backward(model, batch, config, scaler)
            tr_loss += loss

            if (step + 1) % config.gradient_accumulation_steps == 0:

                if scaler:
                    # Gradients must be unscaled before they can be clipped.
                    scaler.unscale_(optimizer)

                if config.max_grad_norm:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

                if scaler:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad(set_to_none=True)
                global_step += 1

                if config.logging_steps > 0 and global_step % config.logging_steps == 0:
                    if config.writer:
                        writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
                        writer.add_scalar('loss', (tr_loss - logging_loss) / config.logging_steps, global_step)
                    else:
                        print('gb_step={}, loss={}'.format(global_step, (tr_loss - logging_loss) / config.logging_steps))
                    logging_loss = tr_loss

                if config.save_steps > 0 and global_step % config.save_steps == 0:
                    output_dir = os.path.join(config.output_path, 'checkpoint-{}'.format(global_step))
                    os.makedirs(output_dir, exist_ok=True)

                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print("Saving model checkpoint to ", output_dir)

            if global_step >= config.max_step:
                break

        if global_step >= config.max_step:
            break

        # End-of-epoch validation; accuracy is normalised by the number of
        # labelled validation examples actually seen.
        val_loss, val_acc = _evaluate(model, val_loader, config.device, use_amp)
        print("VAL ACCURACY", val_acc)
        print("val_loss", val_loss)

    if writer is not None:
        writer.close()

    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss

In [12]:
def train_main(start_global_step=0):
    """Entry point: pick the device, seed the RNGs, load tokenizer/model/data and train.

    Args:
        start_global_step: forwarded to ``train_model`` (optimisation steps to skip).
    """
    config.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for seed_everything in (random.seed, np.random.seed, torch.manual_seed):
        seed_everything(config.seed)

    model_name = config.pretrained_model_name_or_path
    cache_dir = config.pretrained_model_name_or_path_cache
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = RobertaForMultipleChoice.from_pretrained(model_name, cache_dir=cache_dir)
    model.to(config.device)

    # The feature lists returned next to the datasets are not needed for training.
    train_dataset, _ = load_dataset(config, tokenizer=tokenizer, split='train')
    val_dataset, _ = load_dataset(config, tokenizer=tokenizer, split='val')

    step, loss = train_model(config, train_dataset, val_dataset, model, tokenizer, start_global_step)
    print('Train finished, step: {}, loss: {}'.format(step, loss))

In [13]:
# Display the current wall-clock time; this cell sits right before the training run.
from datetime import datetime
datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2023-03-15 07:37:32'

In [14]:
# Start training; `start_global_step` keeps its default of 0, so no steps are skipped.
train_main()

False


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['transformer.layers.0.linear1.weight', 'transformer.layers.2.norm1.weight', 'transformer.layers.0.norm2.bias', 'transformer.layers.1.linear2.bias', 'transformer.layers.1.self_attn.in_proj_weight', 'transformer.layers.0.self_attn.out_proj.weight', 'transformer.layers.2.self_attn.out_proj.bias', 'transformer.layers.2.norm1.bias', 'transformer.layers.2.linear2.weight', 'transformer.layers.0.self_attn.in_proj_bias', 'lm_head.decoder.bias', 'transformer.layers.1.linear1.bias', 'transformer.layers.0.norm1.bias', 'transformer.layers.1.norm1.bias', 'transformer.layers.0.linear1.bias', 'transformer.layers.0.norm2.weight', 'transformer.layers.2.self_attn.in_proj_bias', 'transformer.layers.0.linear2.weight', 'classifier.bias', 'transformer.layers.1.norm2.weight', 'transformer.layers.0.linear2.bias', 'transformer.layers.2.norm2.weight', 'transformer.layers.1.self_att

4638 examples are loaded from /content/dataset/train.json.


converting examples to features::   0%|          | 1/4638 [00:00<17:48,  4.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
converting exa

Transform features into tensors...
Saving processed features into content/tensor/roberta_256_train.
500 examples are loaded from /content/dataset/val.json.


converting examples to features:: 100%|██████████| 500/500 [00:01<00:00, 351.03it/s]


Transform features into tensors...
Saving processed features into content/tensor/roberta_256_val.
AdamW (
Parameter Group 0
    betas: (0.9, 0.98)
    correct_bias: True
    eps: 1e-06
    initial_lr: 1e-05
    lr: 0.0
    weight_decay: 0.01

Parameter Group 1
    betas: (0.9, 0.98)
    correct_bias: True
    eps: 1e-06
    initial_lr: 1e-05
    lr: 0.0
    weight_decay: 0.0
)
-- Start Training --
  Num examples =  4638
  Num Epochs =  10
  Batch size =  1
  Gradient Accumulation steps =  6
  Total optimization steps =  7730




gb_step=100, loss=1.4220271337777377
gb_step=200, loss=1.4306135430186986
gb_step=300, loss=1.4172297871112824
gb_step=400, loss=1.4380393886566163
gb_step=500, loss=1.4374519789218902
gb_step=600, loss=1.4584229858219624
gb_step=700, loss=1.4099105264991523
Saving model checkpoint to  content/output/checkpoint-773
VAL ACCURACY 0.242
val_loss 1.3861931869983672
gb_step=800, loss=1.4178339213877917
gb_step=900, loss=1.4224079429358243
gb_step=1000, loss=1.4314471551775931
gb_step=1100, loss=1.394788038879633
gb_step=1200, loss=1.4149359631538392
gb_step=1300, loss=1.423526719212532
gb_step=1400, loss=1.4165117755532264
gb_step=1500, loss=1.4232733055949212
Saving model checkpoint to  content/output/checkpoint-1546
VAL ACCURACY 0.258
val_loss 1.386285457611084
gb_step=1600, loss=1.4103985811769961
gb_step=1700, loss=1.4160938365757465
gb_step=1800, loss=1.4125550366938113
gb_step=1900, loss=1.3961227066069841
gb_step=2000, loss=1.400783576965332
gb_step=2100, loss=1.4046442966163157
gb_s

In [None]:
from datetime import datetime
# Show the current time (naive datetime.now(), no timezone attached) formatted as
# 'YYYY-MM-DD HH:MM:SS'; as the last expression of the cell it becomes the cell output.
datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Number of layers = 3

# False
# Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['transformer.layers.0.linear1.weight', 'transformer.layers.2.norm1.weight', 'transformer.layers.0.norm2.bias', 'transformer.layers.1.linear2.bias', 'transformer.layers.1.self_attn.in_proj_weight', 'transformer.layers.0.self_attn.out_proj.weight', 'transformer.layers.2.self_attn.out_proj.bias', 'transformer.layers.2.norm1.bias', 'transformer.layers.2.linear2.weight', 'transformer.layers.0.self_attn.in_proj_bias', 'lm_head.decoder.bias', 'transformer.layers.1.linear1.bias', 'transformer.layers.0.norm1.bias', 'transformer.layers.1.norm1.bias', 'transformer.layers.0.linear1.bias', 'transformer.layers.0.norm2.weight', 'transformer.layers.2.self_attn.in_proj_bias', 'transformer.layers.0.linear2.weight', 'classifier.bias', 'transformer.layers.1.norm2.weight', 'transformer.layers.0.linear2.bias', 'transformer.layers.2.norm2.weight', 'transformer.layers.1.self_attn.in_proj_bias', 'transformer.layers.0.self_attn.in_proj_weight', 'transformer.layers.1.linear1.weight', 'transformer.layers.2.linear2.bias', 'transformer.layers.1.norm1.weight', 'transformer.layers.1.self_attn.out_proj.bias', 'transformer.layers.0.norm1.weight', 'transformer.layers.2.norm2.bias', 'transformer.layers.1.linear2.weight', 'classifier.weight', 'transformer.layers.2.linear1.bias', 'transformer.layers.0.self_attn.out_proj.bias', 'transformer.layers.1.self_attn.out_proj.weight', 'transformer.layers.1.norm2.bias', 'transformer.layers.2.self_attn.in_proj_weight', 'transformer.layers.2.linear1.weight', 'transformer.layers.2.self_attn.out_proj.weight']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# 4638 examples are loaded from /content/dataset/train.json.
# converting examples to features::   0%|          | 1/4638 [00:00<17:48,  4.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# converting examples to features:: 100%|██████████| 4638/4638 [00:13<00:00, 354.02it/s]
# Transform features into tensors...
# Saving processed features into content/tensor/roberta_256_train.
# 500 examples are loaded from /content/dataset/val.json.
# converting examples to features:: 100%|██████████| 500/500 [00:01<00:00, 351.03it/s]
# Transform features into tensors...
# Saving processed features into content/tensor/roberta_256_val.
# AdamW (
# Parameter Group 0
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.01

# Parameter Group 1
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.0
# )
# -- Start Training --
#   Num examples =  4638
#   Num Epochs =  10
#   Batch size =  1
#   Gradient Accumulation steps =  6
#   Total optimization steps =  7730
# /usr/local/lib/python3.9/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
#   warnings.warn(
# gb_step=100, loss=1.4220271337777377
# gb_step=200, loss=1.4306135430186986
# gb_step=300, loss=1.4172297871112824
# gb_step=400, loss=1.4380393886566163
# gb_step=500, loss=1.4374519789218902
# gb_step=600, loss=1.4584229858219624
# gb_step=700, loss=1.4099105264991523
# Saving model checkpoint to  content/output/checkpoint-773
# VAL ACCURACY 0.242
# val_loss 1.3861931869983672
# gb_step=800, loss=1.4178339213877917
# gb_step=900, loss=1.4224079429358243
# gb_step=1000, loss=1.4314471551775931
# gb_step=1100, loss=1.394788038879633
# gb_step=1200, loss=1.4149359631538392
# gb_step=1300, loss=1.423526719212532
# gb_step=1400, loss=1.4165117755532264
# gb_step=1500, loss=1.4232733055949212
# Saving model checkpoint to  content/output/checkpoint-1546
# VAL ACCURACY 0.258
# val_loss 1.386285457611084
# gb_step=1600, loss=1.4103985811769961
# gb_step=1700, loss=1.4160938365757465
# gb_step=1800, loss=1.4125550366938113
# gb_step=1900, loss=1.3961227066069841
# gb_step=2000, loss=1.400783576965332
# gb_step=2100, loss=1.4046442966163157
# gb_step=2200, loss=1.4027740098536015
# gb_step=2300, loss=1.3988996754586696
# Saving model checkpoint to  content/output/checkpoint-2319
# VAL ACCURACY 0.262
# val_loss 1.3862896053791045
# gb_step=2400, loss=1.398741910457611
# gb_step=2500, loss=1.3895214253664017
# gb_step=2600, loss=1.4017921026051043
# gb_step=2700, loss=1.3988973742723465
# gb_step=2800, loss=1.4022473058104514
# gb_step=2900, loss=1.3768328346312046
# gb_step=3000, loss=1.3965113258361816
# Saving model checkpoint to  content/output/checkpoint-3092
# VAL ACCURACY 0.238
# val_loss 1.3862833788394928
# gb_step=3100, loss=1.395969226360321
# gb_step=3200, loss=1.4045059789717198
# gb_step=3300, loss=1.3994432857632637
# gb_step=3400, loss=1.3956596764922142
# gb_step=3500, loss=1.3916675022244454
# gb_step=3600, loss=1.4060612313449383
# gb_step=3700, loss=1.4017270296812057
# gb_step=3800, loss=1.3935701341927051
# Saving model checkpoint to  content/output/checkpoint-3865
# VAL ACCURACY 0.228
# val_loss 1.3863004684448241
# gb_step=3900, loss=1.3899628183245658
# gb_step=4000, loss=1.401406503468752
# gb_step=4100, loss=1.3930057866871357
# gb_step=4200, loss=1.4040050715208054
# gb_step=4300, loss=1.391129407286644
# gb_step=4400, loss=1.396978533565998
# gb_step=4500, loss=1.3858209225535392
# gb_step=4600, loss=1.396473642885685
# Saving model checkpoint to  content/output/checkpoint-4638
# VAL ACCURACY 0.234
# val_loss 1.3862943649291992
# gb_step=4700, loss=1.4033712983131408
# gb_step=4800, loss=1.400619760453701
# gb_step=4900, loss=1.388340612500906
# gb_step=5000, loss=1.3979533645510673
# gb_step=5100, loss=1.406453200727701
# gb_step=5200, loss=1.4003662779927253
# gb_step=5300, loss=1.3887342363595963
# gb_step=5400, loss=1.3912501215934754
# Saving model checkpoint to  content/output/checkpoint-5411
# VAL ACCURACY 0.234
# val_loss 1.3862943649291992
# gb_step=5500, loss=1.3966173605620862
# gb_step=5600, loss=1.394187645316124
# gb_step=5700, loss=1.4151653808355331
# gb_step=5800, loss=1.38087522611022
# gb_step=5900, loss=1.4017225831747055
# gb_step=6000, loss=1.396518856883049
# gb_step=6100, loss=1.398693494349718
# Saving model checkpoint to  content/output/checkpoint-6184
# VAL ACCURACY 0.236
# val_loss 1.3862938766479491
# gb_step=6200, loss=1.3942405927181243
# gb_step=6300, loss=1.3920213575661182
# gb_step=6400, loss=1.393166634440422
# gb_step=6500, loss=1.3867203126847745
# gb_step=6600, loss=1.4010600340366364
# gb_step=6700, loss=1.393689583837986
# gb_step=6800, loss=1.3909071052074433
# gb_step=6900, loss=1.3929935277998446
# Saving model checkpoint to  content/output/checkpoint-6957
# VAL ACCURACY 0.242
# val_loss 1.3863038864135742
# gb_step=7000, loss=1.402662847787142
# gb_step=7100, loss=1.3923931503295899
# gb_step=7200, loss=1.3959195822477342
# gb_step=7300, loss=1.3992935395240784
# gb_step=7400, loss=1.3902991817891597
# gb_step=7500, loss=1.3955000111460685
# gb_step=7600, loss=1.397827975600958
# gb_step=7700, loss=1.3908411556482314
# Saving model checkpoint to  content/output/checkpoint-7730
# VAL ACCURACY 0.218
# val_loss 1.3862941207885742
# Train finished, step: 7730, loss: 1.402329443950317


# False
# Loading cached file from content/tensor/roberta_256_val
# ***** Running evaluation val on checkpoint-7730 *****
#   Num examples = 500
#   Batch size = 1
# Evaluating: 100%|██████████| 500/500 [00:32<00:00, 15.46it/s]****** Evaluation Results ******
# loss: 1.3862941207885742	acc: 0.218

# {'loss': 1.3862941207885742, 'acc': 0.218}

In [None]:
# Number of layers = 2


# False
# Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['transformer.layers.1.linear1.bias', 'transformer.layers.0.linear2.weight', 'transformer.layers.1.linear1.weight', 'transformer.layers.0.norm2.bias', 'lm_head.decoder.bias', 'transformer.layers.1.norm1.bias', 'transformer.layers.0.linear2.bias', 'transformer.layers.0.linear1.weight', 'transformer.layers.1.norm2.bias', 'transformer.layers.0.self_attn.in_proj_weight', 'classifier.bias', 'transformer.layers.0.norm1.bias', 'transformer.layers.0.norm1.weight', 'classifier.weight', 'transformer.layers.1.linear2.bias', 'transformer.layers.0.self_attn.out_proj.weight', 'transformer.layers.1.self_attn.out_proj.weight', 'transformer.layers.0.linear1.bias', 'transformer.layers.1.self_attn.out_proj.bias', 'transformer.layers.0.self_attn.out_proj.bias', 'transformer.layers.0.norm2.weight', 'transformer.layers.1.linear2.weight', 'transformer.layers.1.self_attn.in_proj_bias', 'transformer.layers.1.norm1.weight', 'transformer.layers.1.norm2.weight', 'transformer.layers.1.self_attn.in_proj_weight', 'transformer.layers.0.self_attn.in_proj_bias']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Loading cached file from content/tensor/roberta_256_train
# Loading cached file from content/tensor/roberta_256_val
# AdamW (
# Parameter Group 0
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.01

# Parameter Group 1
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.0
# )
# -- Start Training --
#   Num examples =  4638
#   Num Epochs =  10
#   Batch size =  1
#   Gradient Accumulation steps =  6
#   Total optimization steps =  7730
# gb_step=100, loss=1.4129802197963
# gb_step=200, loss=1.4318473437428474
# gb_step=300, loss=1.4346796453744173
# gb_step=400, loss=1.4149925626814366
# gb_step=500, loss=1.4166777034848927
# gb_step=600, loss=1.4039259718358517
# gb_step=700, loss=1.3945451258867978
# Saving model checkpoint to  content/output/checkpoint-773
# VAL ACCURACY 0.396
# val_loss 1.2999427822828293
# gb_step=800, loss=1.3836550530046225
# gb_step=900, loss=1.3038933566212654
# gb_step=1000, loss=1.2445717301592232
# gb_step=1100, loss=1.2744085831195116
# gb_step=1200, loss=1.2342047434300185
# gb_step=1300, loss=1.2295980157889426
# gb_step=1400, loss=1.205662941262126
# gb_step=1500, loss=1.2650476757762954
# Saving model checkpoint to  content/output/checkpoint-1546
# VAL ACCURACY 0.564
# val_loss 1.0593058998957277
# gb_step=1600, loss=1.0671895540878176
# gb_step=1700, loss=0.943176753083244
# gb_step=1800, loss=0.9356462930397538
# gb_step=1900, loss=0.9608586029317485
# gb_step=2000, loss=0.8958794597818632
# gb_step=2100, loss=0.9984195854282006
# gb_step=2200, loss=0.9437712373738759
# gb_step=2300, loss=0.8922771785239456
# Saving model checkpoint to  content/output/checkpoint-2319
# VAL ACCURACY 0.59
# val_loss 1.03170093537122
# gb_step=2400, loss=0.7087654754442428
# gb_step=2500, loss=0.545726574219193
# gb_step=2600, loss=0.5915492122483783
# gb_step=2700, loss=0.5213909767693713
# gb_step=2800, loss=0.5165307380423474
# gb_step=2900, loss=0.5811337921723816
# gb_step=3000, loss=0.5809983383218378
# Saving model checkpoint to  content/output/checkpoint-3092
# VAL ACCURACY 0.592
# val_loss 1.2280480670480365
# gb_step=3100, loss=0.5278263180467729
# gb_step=3200, loss=0.2565106239793204
# gb_step=3300, loss=0.2709816390892456
# gb_step=3400, loss=0.3015033304344752
# gb_step=3500, loss=0.22226955133589854
# gb_step=3600, loss=0.262236216347701
# gb_step=3700, loss=0.2928941005261322
# gb_step=3800, loss=0.26637418321188305
# Saving model checkpoint to  content/output/checkpoint-3865
# VAL ACCURACY 0.582
# val_loss 1.866295924989208
# gb_step=3900, loss=0.18742364385837845
# gb_step=4000, loss=0.13578957858730972
# gb_step=4100, loss=0.13871735071020794
# gb_step=4200, loss=0.12797836258695042
# gb_step=4300, loss=0.1138564765275487
# gb_step=4400, loss=0.1273428285781756
# gb_step=4500, loss=0.17132347843195475
# gb_step=4600, loss=0.15969888399702994
# Saving model checkpoint to  content/output/checkpoint-4638
# VAL ACCURACY 0.616
# val_loss 1.338731553718024
# gb_step=4700, loss=0.12248323188238829
# gb_step=4800, loss=0.08430103608919126
# gb_step=4900, loss=0.07760209614559244
# gb_step=5000, loss=0.05590328808547383
# gb_step=5100, loss=0.10160650879295645
# gb_step=5200, loss=0.053373459469689806
# gb_step=5300, loss=0.08495888963748711
# gb_step=5400, loss=0.07054377818277771
# Saving model checkpoint to  content/output/checkpoint-5411
# VAL ACCURACY 0.616
# val_loss 2.32277986348785
# gb_step=5500, loss=0.07866832532407443
# gb_step=5600, loss=0.05494236689660283
# gb_step=5700, loss=0.03874914527618785
# gb_step=5800, loss=0.04210433438650398
# gb_step=5900, loss=0.04528434388208552
# gb_step=6000, loss=0.056118501610103524
# gb_step=6100, loss=0.052067095322831845
# Saving model checkpoint to  content/output/checkpoint-6184
# VAL ACCURACY 0.632
# val_loss 2.4164718086491206
# gb_step=6200, loss=0.05323917387489928
# gb_step=6300, loss=0.031226562478695995
# gb_step=6400, loss=0.030830496306166424
# gb_step=6500, loss=0.03793160221406197
# gb_step=6600, loss=0.020642832119960985
# gb_step=6700, loss=0.021525202562575032
# gb_step=6800, loss=0.03382317454453187
# gb_step=6900, loss=0.017552684453817163
# Saving model checkpoint to  content/output/checkpoint-6957
# VAL ACCURACY 0.636
# val_loss 3.0275975006523845
# gb_step=7000, loss=0.056085080532543546
# gb_step=7100, loss=0.02680729048631747
# gb_step=7200, loss=0.021804822288713693
# gb_step=7300, loss=0.014349743432599099
# gb_step=7400, loss=0.03149242952064924
# gb_step=7500, loss=0.018375317249183353
# gb_step=7600, loss=0.035141037192611295
# gb_step=7700, loss=0.015298996409496795
# Saving model checkpoint to  content/output/checkpoint-7730
# VAL ACCURACY 0.63
# val_loss 2.7849270778263118
# Train finished, step: 7730, loss: 0.4759977845384162


# False
# Loading cached file from content/tensor/roberta_256_val
# ***** Running evaluation val on checkpoint-7730 *****
#   Num examples = 500
#   Batch size = 1
# Evaluating: 100%|██████████| 500/500 [00:32<00:00, 15.19it/s]****** Evaluation Results ******
# loss: 2.7849270778263118	acc: 0.63

# {'loss': 2.7849270778263118, 'acc': 0.63}

In [None]:
# Number of layers = 1


# False

# Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'transformer.layers.0.linear1.bias', 'transformer.layers.0.self_attn.out_proj.bias', 'transformer.layers.0.linear2.weight', 'transformer.layers.0.norm1.bias', 'transformer.layers.0.norm2.weight', 'transformer.layers.0.norm2.bias', 'transformer.layers.0.norm1.weight', 'classifier.weight', 'lm_head.decoder.bias', 'transformer.layers.0.self_attn.out_proj.weight', 'transformer.layers.0.linear2.bias', 'transformer.layers.0.linear1.weight', 'transformer.layers.0.self_attn.in_proj_bias', 'transformer.layers.0.self_attn.in_proj_weight']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Loading cached file from content/tensor/roberta_256_train
# Loading cached file from content/tensor/roberta_256_val
# AdamW (
# Parameter Group 0
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.01

# Parameter Group 1
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.0
# )
# -- Start Training --
#   Num examples =  4638
#   Num Epochs =  10
#   Batch size =  1
#   Gradient Accumulation steps =  6
#   Total optimization steps =  7730
# gb_step=100, loss=1.4000511837005616
# gb_step=200, loss=1.4087094267457723
# gb_step=300, loss=1.4082425244152545
# gb_step=400, loss=1.4014829351007938
# gb_step=500, loss=1.41468473598361
# gb_step=600, loss=1.398058703094721
# gb_step=700, loss=1.315481859445572
# Saving model checkpoint to  content/output/checkpoint-773
# VAL ACCURACY 0.406
# val_loss 1.2675443164110183
# gb_step=800, loss=1.313873561359942
# gb_step=900, loss=1.266162817068398
# gb_step=1000, loss=1.185057722022757
# gb_step=1100, loss=1.2351347482670099
# gb_step=1200, loss=1.231312368484214
# gb_step=1300, loss=1.171678290842101
# gb_step=1400, loss=1.1477483737003058
# gb_step=1500, loss=1.2202631201664917
# Saving model checkpoint to  content/output/checkpoint-1546
# VAL ACCURACY 0.582
# val_loss 1.0366155510097743
# gb_step=1600, loss=0.9982862411573297
# gb_step=1700, loss=0.8516298317493056
# gb_step=1800, loss=0.8605678417626769
# gb_step=1900, loss=0.8172711919856375
# gb_step=2000, loss=0.8023719936613634
# gb_step=2100, loss=0.8865502655471209
# gb_step=2200, loss=0.7525935551508155
# gb_step=2300, loss=0.7881794190541769
# Saving model checkpoint to  content/output/checkpoint-2319
# VAL ACCURACY 0.648
# val_loss 0.8833583958689123
# gb_step=2400, loss=0.4999720555879776
# gb_step=2500, loss=0.32874815384366685
# gb_step=2600, loss=0.41166657728021616
# gb_step=2700, loss=0.38883455557921937
# gb_step=2800, loss=0.34428003396165424
# gb_step=2900, loss=0.4139391315065041
# gb_step=3000, loss=0.38009933607783297
# Saving model checkpoint to  content/output/checkpoint-3092
# VAL ACCURACY 0.64
# val_loss 1.2841281909116171
# gb_step=3100, loss=0.3331936953895274
# gb_step=3200, loss=0.1786620110612739
# gb_step=3300, loss=0.16228435609571534
# gb_step=3400, loss=0.15572815764060124
# gb_step=3500, loss=0.1218435783016912
# gb_step=3600, loss=0.20803053533983984
# gb_step=3700, loss=0.1734171282330226
# gb_step=3800, loss=0.18009904585092498
# Saving model checkpoint to  content/output/checkpoint-3865
# VAL ACCURACY 0.63
# val_loss 1.6421874055338026
# gb_step=3900, loss=0.18047832556906088
# gb_step=4000, loss=0.09405392236916668
# gb_step=4100, loss=0.0873175339776526
# gb_step=4200, loss=0.07237956216311431
# gb_step=4300, loss=0.08755751216625413
# gb_step=4400, loss=0.0866660441885324
# gb_step=4500, loss=0.11421677386971624
# gb_step=4600, loss=0.10245213569271983
# Saving model checkpoint to  content/output/checkpoint-4638
# VAL ACCURACY 0.644
# val_loss 1.6177930664099485
# gb_step=4700, loss=0.06669524167445616
# gb_step=4800, loss=0.030722298842788404
# gb_step=4900, loss=0.051444575391797115
# gb_step=5000, loss=0.05056144287096686
# gb_step=5100, loss=0.06056139202659324
# gb_step=5200, loss=0.0657921288390571
# gb_step=5300, loss=0.04831415588429536
# gb_step=5400, loss=0.0369429677979133
# Saving model checkpoint to  content/output/checkpoint-5411
# VAL ACCURACY 0.616
# val_loss 2.376810825590086
# gb_step=5500, loss=0.0424491049665221
# gb_step=5600, loss=0.04309585521686131
# gb_step=5700, loss=0.05105609894078043
# gb_step=5800, loss=0.06303215634062781
# gb_step=5900, loss=0.039817520557344326
# gb_step=6000, loss=0.020088529302902315
# gb_step=6100, loss=0.02134371507335345
# Saving model checkpoint to  content/output/checkpoint-6184
# VAL ACCURACY 0.64
# val_loss 2.578201453592295
# gb_step=6200, loss=0.02988293205205082
# gb_step=6300, loss=0.01704156976978993
# gb_step=6400, loss=0.018354150128479886
# gb_step=6500, loss=0.014729186734361974
# gb_step=6600, loss=0.01196393845935745
# gb_step=6700, loss=0.023793425712415227
# gb_step=6800, loss=0.01887372425935155
# gb_step=6900, loss=0.013763183898022361
# Saving model checkpoint to  content/output/checkpoint-6957
# VAL ACCURACY 0.636
# val_loss 2.881126556598182
# gb_step=7000, loss=0.037648314665361796
# gb_step=7100, loss=0.02445314686744041
# gb_step=7200, loss=0.014031902801971228
# gb_step=7300, loss=0.0048920546207864395
# gb_step=7400, loss=0.011276241397818012
# gb_step=7500, loss=0.01608119052385064
# gb_step=7600, loss=0.01841945526276504
# gb_step=7700, loss=0.01956907506920288
# Saving model checkpoint to  content/output/checkpoint-7730
# VAL ACCURACY 0.646
# val_loss 2.712272958549084
# Train finished, step: 7730, loss: 0.41881199269608804

# False
# Loading cached file from content/tensor/roberta_256_val
# ***** Running evaluation val on checkpoint-7730 *****
#   Num examples = 500
#   Batch size = 1
# Evaluating: 100%|██████████| 500/500 [00:32<00:00, 15.25it/s]****** Evaluation Results ******
# loss: 2.712272958549084	acc: 0.646

# {'loss': 2.712272958549084, 'acc': 0.646}

In [33]:
# Number of layers = 4


# False
# Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['transformer.layers.2.self_attn.out_proj.weight', 'transformer.layers.2.linear2.weight', 'transformer.layers.1.linear1.bias', 'transformer.layers.2.self_attn.in_proj_weight', 'transformer.layers.0.linear2.weight', 'transformer.layers.1.linear1.weight', 'transformer.layers.0.norm2.bias', 'transformer.layers.3.linear2.bias', 'lm_head.decoder.bias', 'transformer.layers.2.norm1.weight', 'transformer.layers.1.norm1.bias', 'transformer.layers.0.linear2.bias', 'transformer.layers.3.norm1.bias', 'transformer.layers.3.linear2.weight', 'transformer.layers.0.linear1.weight', 'transformer.layers.1.norm2.bias', 'transformer.layers.0.self_attn.in_proj_weight', 'classifier.bias', 'transformer.layers.3.linear1.bias', 'transformer.layers.2.linear2.bias', 'transformer.layers.0.norm1.bias', 'transformer.layers.0.norm1.weight', 'transformer.layers.3.self_attn.in_proj_bias', 'classifier.weight', 'transformer.layers.1.linear2.bias', 'transformer.layers.3.self_attn.out_proj.weight', 'transformer.layers.0.self_attn.out_proj.weight', 'transformer.layers.2.self_attn.out_proj.bias', 'transformer.layers.1.self_attn.out_proj.weight', 'transformer.layers.2.norm1.bias', 'transformer.layers.0.linear1.bias', 'transformer.layers.1.self_attn.out_proj.bias', 'transformer.layers.0.self_attn.out_proj.bias', 'transformer.layers.2.self_attn.in_proj_bias', 'transformer.layers.0.norm2.weight', 'transformer.layers.1.linear2.weight', 'transformer.layers.2.linear1.bias', 'transformer.layers.2.norm2.bias', 'transformer.layers.1.norm1.weight', 'transformer.layers.1.self_attn.in_proj_bias', 'transformer.layers.1.norm2.weight', 'transformer.layers.3.norm1.weight', 'transformer.layers.3.linear1.weight', 'transformer.layers.1.self_attn.in_proj_weight', 'transformer.layers.3.norm2.bias', 'transformer.layers.3.norm2.weight', 'transformer.layers.2.linear1.weight', 
# 'transformer.layers.2.norm2.weight', 'transformer.layers.3.self_attn.out_proj.bias', 'transformer.layers.0.self_attn.in_proj_bias', 'transformer.layers.3.self_attn.in_proj_weight']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# 4638 examples are loaded from /content/dataset/train.json.
# converting examples to features::   0%|          | 1/4638 [00:00<13:51,  5.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
# converting examples to features:: 100%|██████████| 4638/4638 [00:13<00:00, 345.54it/s]
# Transform features into tensors...
# Saving processed features into content/tensor/roberta_256_train.
# 500 examples are loaded from /content/dataset/val.json.
# converting examples to features:: 100%|██████████| 500/500 [00:01<00:00, 333.86it/s]
# Transform features into tensors...
# Saving processed features into content/tensor/roberta_256_val.
# AdamW (
# Parameter Group 0
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.01

# Parameter Group 1
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.0
# )
# -- Start Training --
#   Num examples =  4638
#   Num Epochs =  10
#   Batch size =  1
#   Gradient Accumulation steps =  6
#   Total optimization steps =  7730
# /usr/local/lib/python3.9/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
#   warnings.warn(
# gb_step=100, loss=1.4331135093420744
# gb_step=200, loss=1.4220289275050164
# gb_step=300, loss=1.4180782268196344
# gb_step=400, loss=1.4372704143822193
# gb_step=500, loss=1.4337283670157195
# gb_step=600, loss=1.4238057371973991
# gb_step=700, loss=1.4276349085569382
# Saving model checkpoint to  content/output/checkpoint-773
# VAL ACCURACY 0.33
# val_loss 1.3800387234687805
# gb_step=800, loss=1.41030326128006
# gb_step=900, loss=1.3722172626852989
# gb_step=1000, loss=1.297245130110532
# gb_step=1100, loss=1.3237402828410267
# gb_step=1200, loss=1.3311316873133183
# gb_step=1300, loss=1.321453334391117
# gb_step=1400, loss=1.2861185969784856
# gb_step=1500, loss=1.3019532654806971
# Saving model checkpoint to  content/output/checkpoint-1546
# VAL ACCURACY 0.514
# val_loss 1.149497543424368
# gb_step=1600, loss=1.2026377533283084
# gb_step=1700, loss=1.1140885019791313
# gb_step=1800, loss=1.082077877513948
# gb_step=1900, loss=1.21089129882399
# gb_step=2000, loss=1.1114371174527333
# gb_step=2100, loss=1.1922546204924585
# gb_step=2200, loss=1.190710430983454
# gb_step=2300, loss=1.1025122589804233
# Saving model checkpoint to  content/output/checkpoint-2319
# VAL ACCURACY 0.538
# val_loss 1.0964929539114237
# gb_step=2400, loss=0.9153706377600611
# gb_step=2500, loss=0.7984684072827803
# gb_step=2600, loss=0.8284691839496372
# gb_step=2700, loss=0.8577156063033908
# gb_step=2800, loss=0.8218382442271104
# gb_step=2900, loss=0.8819021373719442
# gb_step=3000, loss=0.7966289839649835
# Saving model checkpoint to  content/output/checkpoint-3092
# VAL ACCURACY 0.574
# val_loss 1.0939610498630792
# gb_step=3100, loss=0.795287901492411
# gb_step=3200, loss=0.46524256250856977
# gb_step=3300, loss=0.4888449331302263
# gb_step=3400, loss=0.506321217840532
# gb_step=3500, loss=0.42824045587727594
# gb_step=3600, loss=0.4402698888440682
# gb_step=3700, loss=0.47443109675280537
# gb_step=3800, loss=0.4529381483706675
# Saving model checkpoint to  content/output/checkpoint-3865
# VAL ACCURACY 0.572
# val_loss 1.451272729158213
# gb_step=3900, loss=0.3475282168170406
# gb_step=4000, loss=0.21660687734098247
# gb_step=4100, loss=0.17217511211477357
# gb_step=4200, loss=0.2317424284532899
# gb_step=4300, loss=0.27608105323573456
# gb_step=4400, loss=0.21178361173630036
# gb_step=4500, loss=0.2728603674059468
# gb_step=4600, loss=0.24748863258456366
# Saving model checkpoint to  content/output/checkpoint-4638
# VAL ACCURACY 0.582
# val_loss 1.6783074035721062
# gb_step=4700, loss=0.16987136223799099
# gb_step=4800, loss=0.1241031950989418
# gb_step=4900, loss=0.14365500520680144
# gb_step=5000, loss=0.12489390793275561
# gb_step=5100, loss=0.15370621415145252
# gb_step=5200, loss=0.11459609533093498
# gb_step=5300, loss=0.14956984983155963
# gb_step=5400, loss=0.11521948983284347
# Saving model checkpoint to  content/output/checkpoint-5411
# VAL ACCURACY 0.56
# val_loss 2.086846945371086
# gb_step=5500, loss=0.05113689343066653
# gb_step=5600, loss=0.08325756188281047
# gb_step=5700, loss=0.0877089665017229
# gb_step=5800, loss=0.07208247899204252
# gb_step=5900, loss=0.07037320874019315
# gb_step=6000, loss=0.0577920712807736
# gb_step=6100, loss=0.056063273537702116
# Saving model checkpoint to  content/output/checkpoint-6184
# VAL ACCURACY 0.606
# val_loss 2.4201053474366985
# gb_step=6200, loss=0.09348699980917445
# gb_step=6300, loss=0.053945297236678014
# gb_step=6400, loss=0.03315237810169492
# gb_step=6500, loss=0.0755817533548543
# gb_step=6600, loss=0.05686325824241067
# gb_step=6700, loss=0.03626635163778701
# gb_step=6800, loss=0.05059097956665937
# gb_step=6900, loss=0.025623285742503867
# Saving model checkpoint to  content/output/checkpoint-6957
# VAL ACCURACY 0.588
# val_loss 2.822189184845969
# gb_step=7000, loss=0.042717120013703605
# gb_step=7100, loss=0.031883481578734066
# gb_step=7200, loss=0.0326515353749528
# gb_step=7300, loss=0.023862262778666262
# gb_step=7400, loss=0.035513384172109
# gb_step=7500, loss=0.031271072078352516
# gb_step=7600, loss=0.020224376364840282
# gb_step=7700, loss=0.027177321694607598
# Saving model checkpoint to  content/output/checkpoint-7730
# VAL ACCURACY 0.594
# val_loss 3.0306156803070863
# Train finished, step: 7730, loss: 0.5694952984935737


# False
# Loading cached file from content/tensor/roberta_256_val
# ***** Running evaluation val on checkpoint-7730 *****
#   Num examples = 500
#   Batch size = 1
# Evaluating: 100%|██████████| 500/500 [00:32<00:00, 15.28it/s]****** Evaluation Results ******
# loss: 3.0306156803070854	acc: 0.594

# {'loss': 3.0306156803070854, 'acc': 0.594}

In [16]:
def evaluate(config, split="val", checkpoint_name=''):
    """Evaluate a saved checkpoint on one dataset split and persist its predictions.

    The tokenizer and model are loaded from the ``checkpoint_name`` directory
    inside ``config.output_path``. Every batch of ``split`` is run through the
    model without gradients, and for each row of logits the arg-max index and
    its softmax probability are collected.

    Args:
        config: Settings object. ``output_path`` and ``eval_batch_size`` are read
            here, ``device`` is overwritten with the device selected below, and
            the object is passed on to ``load_dataset``.
        split: Name of the dataset split handed to ``load_dataset``.
        checkpoint_name: Sub-directory of ``config.output_path`` holding the
            checkpoint, e.g. ``'checkpoint-7730'``.

    Returns:
        The second value returned by ``model.get_eval_log(reset=True)``.

    Side effects:
        Prints progress and the metric log, and writes
        ``<checkpoint_name>_eval_predictions.npy`` and
        ``<checkpoint_name>_eval_probs.json`` into ``config.output_path``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config.device = device

    checkpoint_dir = os.path.join(config.output_path, checkpoint_name)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    model = RobertaForMultipleChoice.from_pretrained(checkpoint_dir)
    model.to(config.device)

    # load_dataset also returns the features; only the dataset is needed here.
    dataset, _ = load_dataset(config, tokenizer, split=split)
    eval_dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),  # keep example order so predictions line up with the split
        batch_size=config.eval_batch_size,
        collate_fn=collator,
    )

    torch.cuda.empty_cache()
    print("***** Running evaluation {} on {} *****".format(split, checkpoint_name))
    print("  Num examples =", len(dataset))
    print("  Batch size =", config.eval_batch_size)

    model.eval()
    pred_list = []
    prob_list = []

    # CUDA autocast is only meaningful on a GPU; asking for it on CPU just emits a warning.
    use_amp = device.type == 'cuda'

    for batch in tqdm(eval_dataloader, desc="Evaluating", dynamic_ncols=True):
        batch = batch_to_device(batch, config.device)
        with torch.cuda.amp.autocast(enabled=use_amp), torch.no_grad():
            outputs = model(**batch)
            probs = outputs["logits"].softmax(dim=-1).detach().float().cpu()
            prob, pred = probs.max(dim=-1)
            pred_list.extend(pred.tolist())
            prob_list.extend(prob.tolist())

    metric_log, results = model.get_eval_log(reset=True)
    print("****** Evaluation Results ******")
    print(metric_log)

    # The checkpoint name must actually be substituted into the file names; otherwise
    # every checkpoint overwrites the same literal "{checkpoint_name}_..." files.
    prediction_file = os.path.join(
        config.output_path, "{}_eval_predictions.npy".format(checkpoint_name))
    np.save(prediction_file, pred_list)

    prob_file = os.path.join(
        config.output_path, "{}_eval_probs.json".format(checkpoint_name))
    with open(prob_file, "w") as prob_handle:  # close the handle deterministically
        json.dump(prob_list, prob_handle)
    return results

In [17]:
# Load the 'checkpoint-7730' directory from config.output_path and score it on the
# default "val" split; the returned results are shown as the cell output.
evaluate(config, checkpoint_name='checkpoint-7730')

False
Loading cached file from content/tensor/roberta_256_val
***** Running evaluation val on checkpoint-7730 *****
  Num examples = 500
  Batch size = 1


Evaluating: 100%|██████████| 500/500 [00:32<00:00, 15.46it/s]

****** Evaluation Results ******
loss: 1.3862941207885742	acc: 0.218





{'loss': 1.3862941207885742, 'acc': 0.218}

In [36]:
import os, shutil
def make_archive(source, destination):
    """Archive the ``source`` directory and move the archive to ``destination``.

    The archive format is taken from the last extension of ``destination``
    (``results.zip`` -> ``zip``) and must be a format name known to
    ``shutil.make_archive``. The archive is first built in the current working
    directory and then moved, so the destination may live on another filesystem.
    Inside the archive, entries are rooted at the final component of ``source``
    (archiving ``/a/b`` yields entries under ``b/``).

    Args:
        source: Directory to archive; relative paths and trailing separators
            are accepted.
        destination: Full path of the archive file to create, including the
            extension that selects the format.

    Raises:
        ValueError: If ``destination`` has no file stem or no extension, or if
            ``shutil.make_archive`` does not know the requested format.
    """
    # abspath resolves relative paths and drops trailing separators, so the parent
    # directory and the directory name below always describe the same folder.
    source = os.path.abspath(source)
    archive_from, archive_to = os.path.split(source)

    # splitext only cuts the LAST extension, so dotted names such as
    # 'run.v2.zip' keep their full stem instead of being split at the first dot.
    stem, extension = os.path.splitext(os.path.basename(destination))
    archive_format = extension[1:]
    if not stem or not archive_format:
        raise ValueError(
            "destination must look like '<name>.<format>', got %r" % (destination,))

    # Create the target directory up front so the final move cannot fail on it.
    destination_dir = os.path.dirname(destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)

    # Use the path returned by shutil rather than rebuilding '<name>.<format>' by hand.
    archive_path = shutil.make_archive(
        os.path.abspath(stem), archive_format, archive_from, archive_to)
    shutil.move(archive_path, destination)

# Archive the /content/content directory and move the resulting zip to the Drive path below
# (assumes Google Drive is already mounted at /content/drive; the mount is not shown here - TODO confirm).
make_archive('/content/content', '/content/drive/MyDrive/Thesis/Reclore/13march2023_res_optimizer_change.zip')

KeyboardInterrupt: ignored