#Config

In [1]:
class config:
    
    device = None
    seed = 37
    num_workers = 2
    prefetch_factor = 2
    fp16 = True
    warm_up = 0.1
    weight_decay = 0.01
    train_batch_size = 1
    eval_batch_size = 1
    train_epochs = 10
    gradient_accumulation_steps = 6
    adam_epsilon= 1e-6
    adam_betas = (0.9, 0.98)
    learning_rate= 1e-5
    max_grad_norm=0.0
    writer=False
    save_steps=773
    logging_steps=100
    max_step=1000000

    max_seq_length = 256
    load_examples_num_workers = 2

    # pretrained path
    pretrained_model_name_or_path = 'roberta-large'
    pretrained_model_name_or_path_cache = 'pretrained'

    # local paths
    train_data_path = '/content/dataset/train.json'
    val_data_path = '/content/dataset/val.json'
    test_data_path = '/content/dataset/test.json'
    output_path = 'content/output'
    tensor_cache_path = 'content/tensor/'

#Installs and Imports

In [2]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [4]:
!git lfs install
!git clone https://huggingface.co/roberta-large

Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
Cloning into 'roberta-large'...
remote: Enumerating objects: 69, done.[K
remote: Total 69 (delta 0), reused 0 (delta 0), pack-reused 69[K
Unpacking objects: 100% (69/69), 1.38 MiB | 1.83 MiB/s, done.
Filtering content: 100% (4/4), 5.49 GiB | 67.00 MiB/s, done.


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import shutil
 
# Full path of
# the archive file
filename = "/content/drive/MyDrive/Thesis/reclor_dataset.zip"
 
# Target directory
extract_dir = "/content/"
 
# Format of archive file
archive_format = "zip"
 
# Unpack the archive file
shutil.unpack_archive(filename, extract_dir, archive_format)

print("Archive file unpacked successfully.")

Archive file unpacked successfully.


In [7]:
import os
import json
import torch
import logging
import random
import numpy as np

from torch import nn, Tensor
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler
from torch.cuda.amp import GradScaler
from transformers import AutoTokenizer, get_linear_schedule_with_warmup, AdamW, PreTrainedTokenizer
from transformers.modeling_outputs import MultipleChoiceModelOutput
from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel, RobertaConfig, RobertaLMHead

from collections import Counter
from functools import partial
from multiprocessing import Pool
from typing import Dict, List
from nltk import sent_tokenize
from tqdm import tqdm
from abc import ABC

if config.writer:
    from torch.utils.tensorboard import SummaryWriter

logging.basicConfig(level=logging.INFO)

# Model Dependencies

In [8]:
def get_accuracy(logits, labels):
    assert logits.size()[:-1] == labels.size()

    _, pred = logits.max(dim=-1)
    true_label_num = (labels != -1).sum().item()
    correct = (pred == labels).sum().item()
    if true_label_num == 0:
        return 0, 0
    acc = correct * 1.0 / true_label_num
    return acc, true_label_num


class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        if isinstance(val, torch.Tensor):
            val = val.item()
        if isinstance(n, torch.Tensor):
            n = n.item()

        self.val = val
        self.sum += val * n
        self.count += n
        if self.count > 0:
            self.avg = self.sum / self.count
        else:
            self.avg = 0

    def save(self):
        return {
            'val': self.val,
            'avg': self.avg,
            'sum': self.sum,
            'count': self.count
        }

    def load(self, value: dict):
        if value is None:
            self.reset()
        self.val = value['val'] if 'val' in value else 0
        self.avg = value['avg'] if 'avg' in value else 0
        self.sum = value['sum'] if 'sum' in value else 0
        self.count = value['count'] if 'count' in value else 0
        
class LogMetric(object):
    """
    Record all metrics for logging.
    """

    def __init__(self, *metric_names):

        self.metrics = {
            key: AverageMeter() for key in metric_names
        }

    def update(self, metric_name, val, n=1):

        self.metrics[metric_name].update(val, n)

    def reset(self, metric_name=None):
        if metric_name is None:
            for key in self.metrics.keys():
                self.metrics[key].reset()
            return

        self.metrics[metric_name].reset()

    def get_log(self):

        log = {
            key: self.metrics[key].avg for key in self.metrics
        }
        return log

class LogMixin:
    eval_metrics: LogMetric = None

    def init_metric(self, *metric_names):
        self.eval_metrics = LogMetric(*metric_names)

    def get_eval_log(self, reset=False):
        if self.eval_metrics is None:
            print("The `eval_metrics` attribute hasn't been initialized.")

        results = self.eval_metrics.get_log()

        _eval_metric_log = '\t'.join([f"{k}: {v}" for k, v in results.items()])

        if reset:
            self.eval_metrics.reset()

        return _eval_metric_log, results

#Model

In [9]:
class RobertaForMultipleChoice(RobertaPreTrainedModel, LogMixin, ABC):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config: RobertaConfig,
                 re_init_cls: bool = False,
                 fs_checkpoint: bool = False,
                 fs_checkpoint_offload_to_cpu: bool = False,
                 fs_checkpoint_maintain_forward_counter: bool = False,
                 freeze_encoder: bool = False,
                 no_pooler: bool = False):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.re_init_cls = re_init_cls
        if self.re_init_cls:
            self.classifier_i = nn.Linear(config.hidden_size, 1)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.no_pooler = no_pooler
        self.freeze_encoder = freeze_encoder
        print(self.freeze_encoder)
        if freeze_encoder:
            for param in self.roberta.parameters():
                param.requires_grad = False

        self.init_weights()

        self.init_metric("loss", "acc")

    @staticmethod
    def fold_tensor(x: Tensor):
        if x is None:
            return x
        return x.reshape(-1, x.size(-1))

    def forward(
            self,
            input_ids: Tensor,
            attention_mask: Tensor = None,
            token_type_ids: Tensor = None,
            labels: Tensor = None,
            sentence_index: Tensor = None,
            sentence_mask: Tensor = None,
            sent_token_mask: Tensor = None,
            mlm_labels: Tensor = None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
    ):

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1]

        input_ids = self.fold_tensor(input_ids)
        attention_mask = self.fold_tensor(attention_mask)
        token_type_ids = self.fold_tensor(token_type_ids)

        outputs = self.roberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        if self.no_pooler:
            pooled_output = outputs[0][:, 0]
        else:
            pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        if self.re_init_cls:
            logits = self.classifier_i(pooled_output)
        else:
            logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        choice_mask = (attention_mask.sum(dim=-1) == 0).reshape(-1, num_choices)
        reshaped_logits = reshaped_logits + choice_mask * -10000.0

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(reshaped_logits, labels)

            if mlm_labels is not None:
                mlm_scores = self.lm_head(outputs[0])
                mlm_loss = loss_fct(mlm_scores.reshape(-1, self.config.vocab_size), mlm_labels.reshape(-1))
                loss += mlm_loss

            if not self.training:
                acc, true_label_num = get_accuracy(reshaped_logits, labels)
                self.eval_metrics.update("acc", val=acc, n=true_label_num)
                self.eval_metrics.update("loss", val=loss.item(), n=true_label_num)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

#Data Providers

In [10]:
def get_sep_tokens(_tokenizer):
    return [_tokenizer.sep_token] * (_tokenizer.max_len_single_sentence - _tokenizer.max_len_sentences_pair)


def is_bpe(_tokenizer: PreTrainedTokenizer):
    return _tokenizer.__class__.__name__ in [
        "RobertaTokenizer",
        "LongformerTokenizer",
        "BartTokenizer",
        "RobertaTokenizerFast",
        "LongformerTokenizerFast",
        "BartTokenizerFast",
    ]


def load_dataset(config, tokenizer, split='train'):
    if split == 'train':
        file_path = config.train_data_path
    elif split == 'val':
        file_path = config.val_data_path
    elif split == 'test':
        file_path = config.test_data_path
    else:
        raise Exception(split)

    examples, features, tensors = convert_examples_into_features(file_path=file_path,
                                                                 tokenizer=tokenizer,
                                                                 max_seq_length=config.max_seq_length,
                                                                 num_workers=config.load_examples_num_workers,
                                                                suffix=split)
    dataset = TensorDataset(*tensors)
    return dataset, features


def collator(batch):
    if len(batch[0]) == 5:
        input_ids, attention_mask, token_type_ids, labels, sentence_spans = list(zip(*batch))
    elif len(batch[0]) == 4:
        input_ids, attention_mask, labels, sentence_spans = list(zip(*batch))
        token_type_ids = None
    else:
        raise RuntimeError()

    input_ids = torch.stack(input_ids, dim=0)
    attention_mask = torch.stack(attention_mask, dim=0)
    labels = torch.stack(labels, dim=0)
    sentence_spans = torch.stack(sentence_spans, dim=0)

    batch, option_num, _, _ = sentence_spans.size()
    # [batch, option_num, max_sent_num]
    max_sent_len = (sentence_spans[:, :, :, 1] - sentence_spans[:, :, :, 0]).max().item()
    # [batch, option_num, max_sent_num]
    sent_mask = (sentence_spans[:, :, :, 0] != -1)
    # [batch, option_num]
    sent_num = sent_mask.sum(dim=2)
    b_max_sent_num = sent_num.max().item()
    sentence_spans = sentence_spans[:, :, :b_max_sent_num]
    sent_mask = sent_mask[:, :, :b_max_sent_num]

    sentence_index = torch.zeros(batch, option_num, b_max_sent_num, max_sent_len, dtype=torch.long)
    sent_token_mask = torch.zeros(batch, option_num, b_max_sent_num, max_sent_len, dtype=torch.long)
    for b_id, b_spans in enumerate(sentence_spans):
        for op_id, op_spans in enumerate(b_spans):
            for sent_id, span in enumerate(op_spans):
                s, e = span[0].item(), span[1].item()
                if s == -1:
                    break
                _len = e - s
                sentence_index[b_id, op_id, sent_id, :_len] = torch.arange(s, e, dtype=torch.long)
                sent_token_mask[b_id, op_id, sent_id, :_len] = 1

    outputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "sentence_index": sentence_index,
        "sentence_mask": sent_mask,
        "sent_token_mask": sent_token_mask
    }
    if token_type_ids is not None:
        outputs["token_type_ids"] = torch.stack(token_type_ids, dim=0)

    return outputs


def read_examples(file_path: str):
    data = json.load(open(file_path, 'r'))

    examples = []
    for sample in data:
        _context = sample["context"]
        _question = sample["question"]
        if "label" not in sample:
            _label = -1
        else:
            _label = sample["label"]
        examples.append({
            "context": _context,
            "question": _question,
            "options": sample["answers"],
            "label": _label
        })

    print(f"{len(examples)} examples are loaded from {file_path}.")
    return examples


def _convert_example_to_features(example, tokenizer, max_seq_length):
    context = example["context"]
    question = example["question"]
    context_sentences = [sent for sent in sent_tokenize(context) if sent]

    context_tokens = []
    for _sent_id, _sent in enumerate(context_sentences):
        _sent_tokens = tokenizer.tokenize(_sent)
        context_tokens.extend([(_sent_id, _tok) for _tok in _sent_tokens])

    _q_sent_id_offset = len(context_sentences)
    question_tokens = [(_q_sent_id_offset, _tok) for _tok in tokenizer.tokenize(question)]

    features = []
    for option in example["options"]:
        sep_tokens = get_sep_tokens(tokenizer)
        _op_sent_id_offset = _q_sent_id_offset + 1
        opt_tokens = [(_op_sent_id_offset, _tok) for _tok in tokenizer.tokenize(option)]

        lens_to_remove = len(context_tokens) + len(question_tokens) + len(opt_tokens) + len(sep_tokens) + (
                tokenizer.model_max_length - tokenizer.max_len_sentences_pair) - max_seq_length

        tru_c_tokens, tru_q_o_tokens, _ = tokenizer.truncate_sequences(context_tokens,
                                                                       question_tokens + sep_tokens + opt_tokens,
                                                                       num_tokens_to_remove=lens_to_remove,
                                                                       truncation_strategy=TruncationStrategy.LONGEST_FIRST)

        c_tokens, q_op_tokens = [], []
        sent_id_map = Counter()

        for _sent_id, _tok in tru_c_tokens:
            sent_id_map[_sent_id] += 1
            c_tokens.append(_tok)

        for _tok in tru_q_o_tokens:
            if isinstance(_tok, tuple):
                _sent_id, _tok = _tok
                q_op_tokens.append(_tok)
                sent_id_map[_sent_id] += 1
            elif isinstance(_tok, str):
                q_op_tokens.append(_tok)
            else:
                raise RuntimeError(_tok)

        sent_span_offset = 1
        sent_spans = []
        for i in range(len(context_sentences) + 2):
            if i == _q_sent_id_offset or i == _op_sent_id_offset:
                sent_span_offset += (tokenizer.max_len_single_sentence - tokenizer.max_len_sentences_pair)
            if i in sent_id_map:
                _cur_len = sent_id_map.pop(i)
                sent_spans.append((sent_span_offset, sent_span_offset + _cur_len))
                sent_span_offset += _cur_len
        assert not sent_id_map

        tokenizer_outputs = tokenizer(tokenizer.convert_tokens_to_string(c_tokens),
                                      text_pair=tokenizer.convert_tokens_to_string(q_op_tokens),
                                      padding=PaddingStrategy.MAX_LENGTH,
                                      max_length=max_seq_length)
        assert len(tokenizer_outputs["input_ids"]) == max_seq_length, (
        len(c_tokens), len(q_op_tokens), len(tokenizer_outputs["input_ids"]))
        features.append({
            "input_ids": tokenizer_outputs["input_ids"],
            "attention_mask": tokenizer_outputs["attention_mask"],
            "token_type_ids": tokenizer_outputs["token_type_ids"] if "token_type_ids" in tokenizer_outputs else None,
            "sentence_spans": sent_spans,
        })

    return {
        "features": features,
        "label": example["label"]
    }


def _data_to_tensors(features):
    data_num = len(features)
    option_num = len(features[0]["features"])

    input_ids = torch.tensor([[op["input_ids"] for op in f["features"]] for f in features])
    attention_mask = torch.tensor([[op["attention_mask"] for op in f["features"]] for f in features], dtype=torch.long)
    if features[0]["features"][0]["token_type_ids"] is not None:
        token_type_ids = torch.tensor([[op["token_type_ids"] for op in f["features"]] for f in features],
                                      dtype=torch.long)
    else:
        token_type_ids = None
    labels = torch.tensor([f["label"] for f in features], dtype=torch.long)

    # List[List[List[Tuple[int, int]]]]
    sentence_spans_ls = [[op["sentence_spans"] for op in f["features"]] for f in features]
    max_sent_num = 0
    for f in sentence_spans_ls:
        f_max_sent_num = max(map(len, f))
        max_sent_num = max(f_max_sent_num, max_sent_num)

    sentence_spans = torch.zeros(data_num, option_num, max_sent_num, 2, dtype=torch.long).fill_(-1)
    for f_id, f in enumerate(sentence_spans_ls):
        for op_id, op in enumerate(f):
            f_op_sent_num = len(op)
            sentence_spans[f_id, op_id, :f_op_sent_num] = torch.tensor(op, dtype=torch.long)

    if token_type_ids is not None:
        return input_ids, attention_mask, token_type_ids, labels, sentence_spans
    else:
        return input_ids, attention_mask, labels, sentence_spans


def convert_examples_into_features(file_path, tokenizer, max_seq_length, num_workers = 16, suffix=''):
    tokenizer_name = tokenizer.__class__.__name__
    tokenizer_name = tokenizer_name.replace('TokenizerFast', '')
    tokenizer_name = tokenizer_name.replace('Tokenizer', '').lower()

    file_suffix = f"{tokenizer_name}_{max_seq_length}_{suffix}"
    cached_file_path = config.tensor_cache_path + file_suffix

    if os.path.exists(cached_file_path):
        print(f"Loading cached file from {cached_file_path}")
        examples, features, tensors = torch.load(cached_file_path)
        return examples, features, tensors

    examples = read_examples(file_path)

    with Pool(num_workers) as p:
        _annotate = partial(_convert_example_to_features, tokenizer=tokenizer, max_seq_length=max_seq_length)
        features = list(tqdm(
            p.imap(_annotate, examples, chunksize=32),
            total=len(examples),
            desc='converting examples to features:'
        ))

    print("Transform features into tensors...")
    tensors = _data_to_tensors(features)

    print(f"Saving processed features into {cached_file_path}.")
    if not os.path.exists(config.tensor_cache_path):
        os.makedirs(config.tensor_cache_path)
    torch.save((examples, features, tensors), cached_file_path)

    return examples, features, tensors

In [31]:
def forward_and_backward(model, inputs, config, scaler):
    if config.fp16 and scaler:
        with torch.cuda.amp.autocast():
            outputs = model(**inputs)
            loss = outputs["loss"]
    else:
        outputs = model(**inputs)
        loss = outputs["loss"]

    if config.gradient_accumulation_steps > 1:
        loss = loss / config.gradient_accumulation_steps

    if scaler:
        scaler.scale(loss).backward()
    else:
        loss.backward()

    return loss.item()

def batch_to_device(batch, device):
    batch_on_device = {}
    for k, v in batch.items():
        batch_on_device[k] = v.to(device)
    return batch_on_device

def train_model(config, train_dataset,val_dataset, model, tokenizer, start_global_step):
    output_path_split = config.output_path.split('/')
    log_dir = '/'.join([output_path_split[0], 'runs'] + output_path_split[1:])
    if config.writer:
        writer = SummaryWriter(log_dir=log_dir)
    else:
        writer = None

    train_loader = DataLoader(dataset=train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=config.train_batch_size,
                              collate_fn=collator,
                              num_workers=config.num_workers,
                              pin_memory=True,
                              prefetch_factor=config.prefetch_factor)
    
    val_loader = DataLoader(dataset=val_dataset,
                              sampler=RandomSampler(val_dataset),
                              batch_size=config.train_batch_size,
                              collate_fn=collator,
                              num_workers=config.num_workers,
                              pin_memory=True,
                              prefetch_factor=config.prefetch_factor)

    no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
    grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if
                       (not any(nd in n for nd in no_decay)) and p.requires_grad],
            'weight_decay': config.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters() if (any(nd in n for nd in no_decay)) and p.requires_grad],
            'weight_decay': 0.0
        }
    ]

    total_steps = len(train_loader) // config.gradient_accumulation_steps * config.train_epochs

    optimizer = AdamW(grouped_parameters,
                      lr=config.learning_rate,
                      eps=config.adam_epsilon,
                      betas=config.adam_betas)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(total_steps * config.warm_up),
                                                num_training_steps=total_steps)

    if config.fp16 and config.device.type == 'cuda':
        scaler = GradScaler()
    else:
        scaler = None

    print(optimizer)
    print("-- Start Training --")
    print("  Num examples = ", len(train_dataset))
    print("  Num Epochs = ", config.train_epochs)
    print("  Batch size = ", config.train_batch_size)
    print("  Gradient Accumulation steps = ", config.gradient_accumulation_steps)
    print("  Total optimization steps = ", total_steps)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)

    for epoch in range(config.train_epochs):
        for step, batch in enumerate(train_loader):

            if global_step < start_global_step:
                if (step + 1) % config.gradient_accumulation_steps == 0:
                    scheduler.step()
                    global_step += 1
                continue
                
            model.train()
            batch = batch_to_device(batch, config.device)
            
            loss = forward_and_backward(model, batch, config, scaler)
            tr_loss += loss

            if (step + 1) % config.gradient_accumulation_steps == 0:

                if scaler:
                    scaler.unscale_(optimizer)

                if config.max_grad_norm:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

                if scaler:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad(set_to_none=True)
                global_step += 1

                

                if config.logging_steps > 0 and global_step % config.logging_steps == 0:
                    if config.writer:
                        writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                        writer.add_scalar('loss', (tr_loss - logging_loss) / config.logging_steps, global_step)
                    else:
                        print('gb_step={}, loss={}'.format(global_step, (tr_loss - logging_loss) / config.logging_steps))
                    logging_loss = tr_loss

                if config.save_steps > 0 and global_step % config.save_steps == 0:
                    output_dir = os.path.join(config.output_path, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                        
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print("Saving model checkpoint to ", output_dir)
                    
            if global_step >= config.max_step:
                break
        
        if global_step >= config.max_step:
            break
        model.eval()
        val_loss = 0.0
        val_acc = 0.0
        num_val_steps = 0
        correct_predictions = 0.0
        pred_list = []
        prob_list = []  
        
        for val_batch in (val_loader):
            batch = batch_to_device(val_batch, config.device)
            with torch.cuda.amp.autocast():
              with torch.no_grad():
                  outputs = model(**batch)
                  val_loss += outputs.loss.item()
                  num_val_steps = num_val_steps +1
                  predictions = outputs.logits.argmax(dim=-1)
                  targets = batch["labels"]
                  correct_predictions += (predictions == targets).sum().item()
        val_loss /= num_val_steps
        
        val_acc = correct_predictions / 500
        print("VAL ACCURACY",val_acc)
        print("val_loss",val_loss)
    return global_step, tr_loss / global_step

In [32]:
def train_main(start_global_step=0):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config.device = device
    
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)

    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_name_or_path, cache_dir=config.pretrained_model_name_or_path_cache)
    model = RobertaForMultipleChoice.from_pretrained(config.pretrained_model_name_or_path, cache_dir=config.pretrained_model_name_or_path_cache)

    model.to(config.device)

    train_dataset, features = load_dataset(config, tokenizer=tokenizer, split='train')
    val_dataset, features_val = load_dataset(config, tokenizer=tokenizer, split='val')
    
    step, loss = train_model(config, train_dataset, val_dataset, model, tokenizer, start_global_step)
    print('Train finished, step: {}, loss: {}'.format(step, loss))

In [33]:
from datetime import datetime
datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2023-03-13 00:59:53'

In [34]:
train_main()

False


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias', 'lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading cached file from content/tensor/roberta_256_train
Loading cached file from content/tensor/roberta_256_val
AdamW (
Parameter Group 0
    betas: (0.9, 0.98)
    correct_bias: True
    eps: 1e-06
    initial_lr: 1e-05
    lr: 0.0
    weight_decay: 0.01

Parameter Group 1
    betas: (0.9, 0.98)
    correct_bias: True
    eps: 1e-06
    initial_lr: 1e-05
    lr: 0.0
    weight_decay: 0.0
)
-- Start Training --
  Num examples =  4638
  Num Epochs =  10
  Batch size =  1
  Gradient Accumulation steps =  6
  Total optimization steps =  7730
gb_step=100, loss=1.3906832191348075
gb_step=200, loss=1.3830486480891704
gb_step=300, loss=1.3946657808125018
gb_step=400, loss=1.3937036837637424
gb_step=500, loss=1.38407785192132
gb_step=600, loss=1.3237384892255069
gb_step=700, loss=1.2902826748788356
Saving model checkpoint to  content/output/checkpoint-773
VAL ACCURACY 0.436
val_loss 1.24694089114666
gb_step=800, loss=1.2378022559173405
gb_step=900, loss=1.2037188726197927
gb_step=1000, loss=

In [35]:
from datetime import datetime
datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2023-03-13 04:32:34'

In [36]:
# False
# Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias', 'lm_head.decoder.bias']
# You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Loading cached file from content/tensor/roberta_256_train
# Loading cached file from content/tensor/roberta_256_val
# AdamW (
# Parameter Group 0
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.01

# Parameter Group 1
#     betas: (0.9, 0.98)
#     correct_bias: True
#     eps: 1e-06
#     initial_lr: 1e-05
#     lr: 0.0
#     weight_decay: 0.0
# )
# -- Start Training --
#   Num examples =  4638
#   Num Epochs =  10
#   Batch size =  1
#   Gradient Accumulation steps =  6
#   Total optimization steps =  7730
# gb_step=100, loss=1.3906832191348075
# gb_step=200, loss=1.3830486480891704
# gb_step=300, loss=1.3946657808125018
# gb_step=400, loss=1.3937036837637424
# gb_step=500, loss=1.38407785192132
# gb_step=600, loss=1.3237384892255069
# gb_step=700, loss=1.2902826748788356
# Saving model checkpoint to  content/output/checkpoint-773
# VAL ACCURACY 0.436
# val_loss 1.24694089114666
# gb_step=800, loss=1.2378022559173405
# gb_step=900, loss=1.2037188726197927
# gb_step=1000, loss=1.0709503892436623
# gb_step=1100, loss=1.1575460922718048
# gb_step=1200, loss=1.1075375059619545
# gb_step=1300, loss=1.0915902483975515
# gb_step=1400, loss=1.0937564730830491
# gb_step=1500, loss=1.1201259839721025
# Saving model checkpoint to  content/output/checkpoint-1546
# VAL ACCURACY 0.562
# val_loss 1.021248332247138
# gb_step=1600, loss=0.8839860117039643
# gb_step=1700, loss=0.6966540498952963
# gb_step=1800, loss=0.7248349349077035
# gb_step=1900, loss=0.7467363401993498
# gb_step=2000, loss=0.6954545036122727
# gb_step=2100, loss=0.8221353476736113
# gb_step=2200, loss=0.688394755335903
# gb_step=2300, loss=0.6831466681133316
# Saving model checkpoint to  content/output/checkpoint-2319
# VAL ACCURACY 0.606
# val_loss 0.9276560444813222
# gb_step=2400, loss=0.4219084879282173
# gb_step=2500, loss=0.3300979009099092
# gb_step=2600, loss=0.2725311021212656
# gb_step=2700, loss=0.36256908578400726
# gb_step=2800, loss=0.3331815189065719
# gb_step=2900, loss=0.3602212211330743
# gb_step=3000, loss=0.31294935808199625
# Saving model checkpoint to  content/output/checkpoint-3092
# VAL ACCURACY 0.612
# val_loss 1.2056505223396505
# gb_step=3100, loss=0.2996267224210169
# gb_step=3200, loss=0.13064403417701215
# gb_step=3300, loss=0.14489557868756947
# gb_step=3400, loss=0.12651861046371324
# gb_step=3500, loss=0.1562492121116611
# gb_step=3600, loss=0.11562360857192289
# gb_step=3700, loss=0.14775275781069014
# gb_step=3800, loss=0.19922116898707828
# Saving model checkpoint to  content/output/checkpoint-3865
# VAL ACCURACY 0.606
# val_loss 1.6527001306686906
# gb_step=3900, loss=0.11783152809639887
# gb_step=4000, loss=0.05724837024125918
# gb_step=4100, loss=0.08328599952035347
# gb_step=4200, loss=0.0944014540564649
# gb_step=4300, loss=0.09052542202972745
# gb_step=4400, loss=0.06593119219420714
# gb_step=4500, loss=0.07895771362668257
# gb_step=4600, loss=0.09590211741502572
# Saving model checkpoint to  content/output/checkpoint-4638
# VAL ACCURACY 0.608
# val_loss 1.7362016987618958
# gb_step=4700, loss=0.057499595947415404
# gb_step=4800, loss=0.037112483877635896
# gb_step=4900, loss=0.039515550059386444
# gb_step=5000, loss=0.06370177432631863
# gb_step=5100, loss=0.04252046505283943
# gb_step=5200, loss=0.03971146982071787
# gb_step=5300, loss=0.05540029381877048
# gb_step=5400, loss=0.04560911384209703
# Saving model checkpoint to  content/output/checkpoint-5411
# VAL ACCURACY 0.632
# val_loss 2.0236645980290984
# gb_step=5500, loss=0.03800823604621201
# gb_step=5600, loss=0.02870803069612066
# gb_step=5700, loss=0.044941766361484954
# gb_step=5800, loss=0.03474681099190548
# gb_step=5900, loss=0.029943934856551096
# gb_step=6000, loss=0.02984334486577609
# gb_step=6100, loss=0.023299850600451463
# Saving model checkpoint to  content/output/checkpoint-6184
# VAL ACCURACY 0.62
# val_loss 2.1258305217671833
# gb_step=6200, loss=0.04583871823260324
# gb_step=6300, loss=0.027574690634696707
# gb_step=6400, loss=0.026231208402600716
# gb_step=6500, loss=0.01809466736759987
# gb_step=6600, loss=0.019695667388891707
# gb_step=6700, loss=0.018579683440211738
# gb_step=6800, loss=0.02311229538666339
# gb_step=6900, loss=0.02025991700936629
# Saving model checkpoint to  content/output/checkpoint-6957
# VAL ACCURACY 0.638
# val_loss 2.1296772076439283
# gb_step=7000, loss=0.013128395172352612
# gb_step=7100, loss=0.009796608216411186
# gb_step=7200, loss=0.00953967658502279
# gb_step=7300, loss=0.01439472481149096
# gb_step=7400, loss=0.008181649746939001
# gb_step=7500, loss=0.007883859690005011
# gb_step=7600, loss=0.00978013759836358
# gb_step=7700, loss=0.012991459028830832
# Saving model checkpoint to  content/output/checkpoint-7730
# VAL ACCURACY 0.63
# val_loss 2.2396614357486646
# Train finished, step: 7730, loss: 0.38654366853648475

In [41]:
def evaluate(config, split="val", checkpoint_name=''):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config.device = device
    tokenizer = AutoTokenizer.from_pretrained(config.output_path + '/' + checkpoint_name)
    model = RobertaForMultipleChoice.from_pretrained(config.output_path + '/' + checkpoint_name)

    model.to(config.device)

    dataset, features = load_dataset(config, tokenizer, split=split)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=config.eval_batch_size,
                                  collate_fn=collator)

    torch.cuda.empty_cache()
    print("***** Running evaluation {} on {} *****".format(split, checkpoint_name))
    print("  Num examples =", len(dataset))
    print("  Batch size =", config.eval_batch_size)

    model.eval()
    pred_list = []
    prob_list = []

    for batch in tqdm(eval_dataloader, desc="Evaluating", dynamic_ncols=True):
        batch = batch_to_device(batch, config.device)
        with torch.cuda.amp.autocast():
            with torch.no_grad():
                outputs = model(**batch)
                probs = outputs["logits"].softmax(dim=-1).detach().float().cpu()
                prob, pred = probs.max(dim=-1)
                pred_list.extend(pred.tolist())
                prob_list.extend(prob.tolist())
  
    metric_log, results = model.get_eval_log(reset=True)
    print("****** Evaluation Results ******")
    print(metric_log)
    
    prediction_file = os.path.join(config.output_path, "{checkpoint_name}_eval_predictions.npy")
    np.save(prediction_file, pred_list)
    json.dump(prob_list, open(os.path.join(config.output_path, "{checkpoint_name}_eval_probs.json"), "w"))
    return results

In [42]:
evaluate(config, checkpoint_name='checkpoint-773')

False
Loading cached file from content/tensor/roberta_256_val
***** Running evaluation val on checkpoint-773 *****
  Num examples = 500
  Batch size = 1


Evaluating: 100%|██████████| 500/500 [00:32<00:00, 15.60it/s]

****** Evaluation Results ******
loss: 1.24694089114666	acc: 0.436





{'loss': 1.24694089114666, 'acc': 0.436}

In [43]:
import os, shutil
def make_archive(source, destination):
        base = os.path.basename(destination)
        name = base.split('.')[0]
        format = base.split('.')[1]
        archive_from = os.path.dirname(source)
        archive_to = os.path.basename(source.strip(os.sep))
        shutil.make_archive(name, format, archive_from, archive_to)
        shutil.move('%s.%s'%(name,format), destination)

make_archive('/content/content', '/content/drive/MyDrive/Thesis/Reclore/13march2023_res_optimizer_change.zip')

FileNotFoundError: ignored