In [None]:
!pip install datasets
# !pip install accelerate -U
from datasets import load_dataset, load_from_disk
import torch
import os
import numpy as np
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import get_scheduler
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn import functional as F
import json
from torch.nn import CrossEntropyLoss
# Google Drive must be mounted before this cell runs: either use the Colab
# "Mount Drive" button or add a separate cell that mounts it.
# Change drive_path to the Google Drive folder where this notebook is located;
# the relative paths used in later cells are resolved against this directory.
drive_path = '/content/drive/MyDrive/Projects/CryptoniteAnalysis/Baselines/Seq2Seq'
os.chdir(drive_path)

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194

# Define Training Cycle
Because the model's output must have exactly the right number of words and letters (the exact shape given by the clue's "enumeration"), we need to define a custom training cycle.

## Record Functions

In [None]:
# Result-type selectors understood by write_results().
RECORD_HYPERPARAMETERS = 'record_hyperparameters'
RECORD_TRAIN_LOSS_AND_ACCURACY = 'record_train_loss_and_accuracy'
RECORD_TEST_LOSS_AND_ACCURACY = 'record_test_loss_and_accuracy'
RECORD_MODEL = 'record_model'


def _append_json_record(json_file, record):
    """Append ``record`` to the JSON list stored in ``json_file``.

    The file is created with a one-element list when it does not exist yet.
    The whole list is re-read and re-written on every call.
    """
    if os.path.exists(json_file):
        with open(json_file, 'r') as f:
            data = json.load(f)
    else:
        data = []

    data.append(record)
    with open(json_file, 'w') as f:
        json.dump(data, f, indent=4)


def write_results(output_dir, result_type, results, **kwargs):
    """Persist ``results`` under ``output_dir`` according to ``result_type``.

    Args:
        output_dir: existing directory that receives the output.
        result_type: one of the ``RECORD_*`` constants:
            * ``RECORD_HYPERPARAMETERS``: ``results`` is dumped to
              ``hyper_parameters.json`` (overwriting any previous file).
            * ``RECORD_TRAIN_LOSS_AND_ACCURACY``: ``results`` is appended to the
              list in ``train_metrics.json``.
            * ``RECORD_TEST_LOSS_AND_ACCURACY``: ``results`` is appended to the
              list in ``validate_and_test_metrics.json``.
            * ``RECORD_MODEL``: ``results.save_pretrained(output_dir)`` is called.
        results: JSON-serialisable object, or an object exposing
            ``save_pretrained`` for ``RECORD_MODEL``.
        **kwargs: accepted for call-site compatibility; unused.

    Returns:
        None. An unrecognised ``result_type`` is silently ignored.
    """
    # write the hyper parameters under output_dir
    if result_type == RECORD_HYPERPARAMETERS:
        hyper_parameters_file = os.path.join(output_dir, 'hyper_parameters.json')
        with open(hyper_parameters_file, 'w') as f:
            # Dump the argument that was passed in, not a same-named global.
            json.dump(results, f)
        return

    # append the train metrics under output_dir
    if result_type == RECORD_TRAIN_LOSS_AND_ACCURACY:
        _append_json_record(os.path.join(output_dir, 'train_metrics.json'), results)
        return

    # append the validation / test metrics under output_dir
    if result_type == RECORD_TEST_LOSS_AND_ACCURACY:
        _append_json_record(os.path.join(output_dir, 'validate_and_test_metrics.json'), results)
        return

    # store the model under output_dir
    if result_type == RECORD_MODEL:
        results.save_pretrained(output_dir)
        return


## Train and eval Functions

2024.07.24: There is a problem with the loss and accuracy: the loss currently includes the padded tokens, but we don't want padded tokens to contribute to it. Possible fixes: dynamic padding, or a customized loss function.

In [None]:
# Sample budget (subsample_size) for the periodic validation run inside train_epoch.
EVAL_SUBSAMPLE_SIZE = 100
# Sample budgets (subsample_size) for the validation / test evaluations run at
# the start of every epoch and once more after the last epoch.
VAL_SET_SIZE = 1000
TEST_SET_SIZE = 1000

# The periodic validation runs whenever step % EVAL_PER_STEP == 0.
EVAL_PER_STEP = 100

def calculate_accuracy(logits, labels, tokenizer):
    """Token-level accuracy of the argmax predictions.

    A position is excluded only when BOTH the prediction and the label equal
    ``tokenizer.pad_token_id``. Every other position is counted, so predicting
    pad where a real token is expected (or the reverse) counts as an error.

    Args:
        logits: tensor whose last dimension indexes the vocabulary.
        labels: integer tensor with one entry per prediction position.
        tokenizer: object exposing ``pad_token_id``.

    Returns:
        ``correct / counted`` as a Python float, or ``0.0`` when no position is
        counted (everything is pad on both sides) instead of dividing by zero.
    """
    # flatten the predictions and labels
    predictions = torch.argmax(logits, dim=-1).reshape(-1)
    labels = labels.reshape(-1)

    pad_id = tokenizer.pad_token_id
    # Vectorised form of "skip when prediction and label are both pad".
    counted = ~((predictions == pad_id) & (labels == pad_id))
    total_labels = int(counted.sum().item())
    if total_labels == 0:
        return 0.0

    correct_labels = int(((predictions == labels) & counted).sum().item())
    return correct_labels / total_labels

def customize_loss_and_accuracy(outputs, target, tokenizer):
    '''
    Compute the cross-entropy loss and the token accuracy for one batch.

    The logits in ``outputs.logits`` are flattened to (N, vocab_size) and the
    target to (N,), so both must describe the same number of positions.
    Presumably the logits are (batch, seq_len, vocab_size) and the labels were
    padded to the same seq_len (40) as the inputs -- TODO confirm against the
    preprocessing.

    No ignore_index is passed to cross_entropy, so every position in ``target``
    contributes to the mean loss.

    Returns a tuple (loss tensor, accuracy float).
    '''
    logits = outputs.logits
    vocab_size = logits.size(-1)

    # cross_entropy expects input of (batch * seq_len, vocab_size) and target of (batch * seq_len)
    flat_logits = logits.view(-1, vocab_size)
    flat_target = target.view(-1)
    loss = F.cross_entropy(input=flat_logits, target=flat_target)

    return loss, calculate_accuracy(logits=logits, labels=target, tokenizer=tokenizer)

def train_batch(model, tokenizer, epoch, step, batch, device, optimizer, scheduler, epoch_dir):
    """Run one optimisation step on ``batch`` and log its loss and accuracy.

    Args:
        model: seq2seq model, called as ``model(input_ids, attention_mask=...)``.
        tokenizer: forwarded to the loss/accuracy helper.
        epoch, step: indices stored in the log record.
        batch: mapping of tensors with at least ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: device every tensor of ``batch`` is moved to.
        optimizer, scheduler: each is stepped exactly once per call.
        epoch_dir: directory whose train-metrics file receives the record.
    """
    # set model to train mode
    model.train()

    # put everything on the right device
    batch = {k: v.to(device) for k, v in batch.items()}

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass WITHOUT labels: the loss is computed outside the model by
    # customize_loss_and_accuracy. NOTE(review): without labels or
    # decoder_input_ids, Hugging Face BART derives the decoder inputs from
    # input_ids -- confirm this is the intended objective.
    outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'])
    loss, accuracy = customize_loss_and_accuracy(outputs, target=batch['labels'], tokenizer=tokenizer)

    # back propagation
    loss.backward()
    optimizer.step()

    # scheduler adjusts the learning rate once per optimizer step
    scheduler.step()

    # The loss is already a mean over all positions of the batch, so it is
    # logged as-is; dividing by the batch size again would under-report it.
    record = {"evaluate_set": 'train', "epoch": epoch, "batch": step,
              "avg_loss": loss.item(), "accuracy": accuracy, 'subsample_size': "None"}
    print(record)
    # WRITE: save the result for this step
    write_results(epoch_dir, result_type=RECORD_TRAIN_LOSS_AND_ACCURACY, results=record)

    return


def evaluate_model(model, tokenizer, epoch, step, dataloaders, device, subsample_size, evaluate_set, epoch_dir):
    """Evaluate (validate or test) the model on ``dataloaders[evaluate_set]``.

    Whole batches are consumed until at least ``subsample_size`` samples have
    been seen or the loader is exhausted. The record written to the
    validate/test metrics file holds the sample-weighted mean of the per-batch
    loss and accuracy; both are ``None`` when no sample was evaluated.

    Args:
        model, tokenizer, device: as in ``train_batch``.
        epoch, step: values stored in the record (``step`` may be any
            JSON-serialisable marker, e.g. ``"STOP"``).
        dataloaders: mapping from split name to an iterable of batches.
        subsample_size: minimum number of samples to evaluate before stopping.
        evaluate_set: key of the split to evaluate.
        epoch_dir: directory whose metrics file receives the record.
    """
    # set model to eval mode
    model.eval()
    # number of samples evaluated so far
    total_validated_samples = 0
    # sample-weighted sums of the per-batch loss and accuracy
    total_loss = 0
    total_accurate = 0
    # turn off grad computation
    with torch.no_grad():
        # evaluate batch by batch
        for batch in dataloaders[evaluate_set]:
            # stop as soon as the subsample budget has been reached
            if total_validated_samples >= subsample_size:
                break

            # put everything on the right device
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size = batch['labels'].shape[0]

            # Use the same forward call as train_batch. Passing the whole batch
            # (labels included) would make the model build its decoder inputs
            # from the gold labels, which is a different setting from training.
            outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'])

            # accumulate loss and accuracy, weighted by the batch size because
            # both values are per-batch means and the last batch may be smaller
            loss, accuracy = customize_loss_and_accuracy(outputs, target=batch['labels'], tokenizer=tokenizer)
            total_loss += loss.item() * batch_size
            total_accurate += accuracy * batch_size
            total_validated_samples += batch_size

    # calculate the loss and accuracy (guarding against an empty split)
    if total_validated_samples > 0:
        average_loss = total_loss / total_validated_samples
        accuracy = total_accurate / total_validated_samples
    else:
        average_loss = None
        accuracy = None

    # record the loss and accuracy
    record = {"evaluate_set": evaluate_set, "epoch": epoch, "batch": step,
              "avg_loss": average_loss, 'accuracy': accuracy, 'subsample_size': subsample_size}
    print(record)

    # write to file
    write_results(epoch_dir, result_type=RECORD_TEST_LOSS_AND_ACCURACY, results=record)
    return


def train_epoch(model, tokenizer, epoch, dataloaders, device, optimizer, scheduler, epoch_dir):
    """Train for one pass over ``dataloaders['train']`` and save the model.

    Order of operations:
      1. create ``epoch_dir`` if it does not exist;
      2. evaluate on 'validation' (VAL_SET_SIZE) and 'test' (TEST_SET_SIZE), logged with step 0;
      3. train batch by batch, validating on EVAL_SUBSAMPLE_SIZE samples
         whenever ``step % EVAL_PER_STEP == 0`` (this includes step 0);
      4. store the model in ``epoch_dir``.
    """
    # prepare output dir
    if not os.path.exists(epoch_dir):
        os.makedirs(epoch_dir)

    def run_evaluation(split, at_step, budget):
        # every evaluation of this epoch shares the same model / loaders / output dir
        evaluate_model(model=model, tokenizer=tokenizer, epoch=epoch, step=at_step,
                       dataloaders=dataloaders, device=device, subsample_size=budget,
                       evaluate_set=split, epoch_dir=epoch_dir)

    # baseline evaluation before any batch of this epoch is trained
    for split, budget in (('validation', VAL_SET_SIZE), ('test', TEST_SET_SIZE)):
        run_evaluation(split, 0, budget)

    for step, batch in enumerate(dataloaders['train']):
        # train the batch
        train_batch(model, tokenizer, epoch, step, batch, device, optimizer, scheduler, epoch_dir)

        # validate only on every EVAL_PER_STEP-th step
        if step % EVAL_PER_STEP != 0:
            continue
        run_evaluation('validation', step, EVAL_SUBSAMPLE_SIZE)

    # save the model
    write_results(epoch_dir, result_type=RECORD_MODEL, results=model)


def train_model(model, tokenizer, output_dir, dataloaders, optimizer, scheduler, device, hyper_parameters):
    """Train ``model`` for ``hyper_parameters['num_train_epochs']`` epochs.

    The hyper-parameters are written to ``output_dir``, each epoch is trained
    into its own ``epoch=<i>`` sub-directory, and a final validation and test
    evaluation is logged once training has finished.

    Args:
        model: model to train; moved to ``device`` first.
        tokenizer: forwarded to the training / evaluation helpers.
        output_dir: root directory for all results (created if missing).
        dataloaders: mapping with 'train', 'validation' and 'test' loaders.
        optimizer, scheduler: forwarded to ``train_epoch``.
        device: device to train on.
        hyper_parameters: dict that must contain 'num_train_epochs'.
    """
    # move the model to the target device
    model.to(device)

    # number of epochs
    num_train_epochs = hyper_parameters['num_train_epochs']

    # Create the directory where the results of this training run are saved
    os.makedirs(output_dir, exist_ok=True)

    # WRITE hyperparameters to the output directory
    write_results(output_dir, result_type=RECORD_HYPERPARAMETERS, results=hyper_parameters)

    # The final evaluation is logged into the last epoch's directory. Fall back
    # to output_dir so that a run with zero epochs does not hit an unbound name.
    epoch_dir = output_dir

    # train the model on the hyper parameters
    for epoch in range(num_train_epochs):
        epoch_dir = os.path.join(output_dir, f"epoch={epoch}")
        train_epoch(model, tokenizer, epoch, dataloaders, device, optimizer, scheduler, epoch_dir)

    # final evaluation
    evaluate_model(model=model, tokenizer=tokenizer, epoch=num_train_epochs, step="STOP", dataloaders=dataloaders, device=device, subsample_size=VAL_SET_SIZE, evaluate_set='validation', epoch_dir=epoch_dir)
    evaluate_model(model=model, tokenizer=tokenizer, epoch=num_train_epochs, step="STOP", dataloaders=dataloaders, device=device, subsample_size=TEST_SET_SIZE, evaluate_set='test', epoch_dir=epoch_dir)

    return

# BART-base
TODO: the label is the tokenized output.

In [None]:
# Load the preprocessed (already tokenized) dataset from disk.
# The path is relative, so it is resolved against the current working directory.
tokenized_dataset_fp = 'ProcessedDatasets/bart-base/'
tokenized_datasets = load_from_disk(tokenized_dataset_fp)
# Request torch output so the DataLoaders yield tensors (the training loop calls .to(device) on every column).
tokenized_datasets.set_format("torch")
# tokenized_datasets = tokenized_datasets.filter(lambda x: x['enumeration'] == '(9)')
# Drop 'enumeration' so it is not part of the batches.
# NOTE(review): presumably a string column that cannot be moved to a device -- confirm.
tokenized_datasets = tokenized_datasets.remove_columns(['enumeration'])

# For quick smoke tests: uncomment to keep only the first n samples of every split.
# n = 16
# tokenized_datasets['test'] = tokenized_datasets['test'].select(range(n))
# tokenized_datasets['validation'] = tokenized_datasets['validation'].select(range(n))
# tokenized_datasets['train'] = tokenized_datasets['train'].select(range(n))

In [None]:
# define model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# define hyperparameters
per_device_train_batch_size = 16
learning_rate = 5e-05
num_train_epochs = 5

# define the best hyper parameter
hyper_parameters = {
    'learning_rate': learning_rate,
    'per_device_train_batch_size': per_device_train_batch_size,
    'num_train_epochs': num_train_epochs
}

# defining the output directory
output_dir = f'TrainingData/bart-base/epoch={num_train_epochs}_batch={per_device_train_batch_size}_lr={learning_rate}/'

# initialize dataloaders
dataloaders = {}
dataloaders['train'] = DataLoader(tokenized_datasets['train'], batch_size=per_device_train_batch_size, shuffle=True)
dataloaders['test'] = DataLoader(tokenized_datasets['test'], batch_size=per_device_train_batch_size)
dataloaders['validation'] = DataLoader(tokenized_datasets['validation'], batch_size=per_device_train_batch_size, shuffle=True)  # shuffle because we want to subsample

# initialize optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# initialize scheduler
# The scheduler is stepped once per training batch (see train_batch), so both
# the total horizon and the warmup must be counted in batches, not in samples.
# Deriving the warmup from the sample count made it several times longer than
# the intended 10% of training.
num_training_steps = len(dataloaders['train']) * num_train_epochs
lr_scheduler_type = 'linear'
lr_scheduler_kwargs = {'optimizer': optimizer,
                       'num_warmup_steps': int(0.1 * num_training_steps),
                       'num_training_steps': num_training_steps}
scheduler = get_scheduler(lr_scheduler_type, **lr_scheduler_kwargs)

# define training parameters
training_parameters = {
    'model': model,
    'tokenizer': tokenizer,
    'output_dir': output_dir,
    'dataloaders': dataloaders,
    'optimizer': optimizer,
    'scheduler': scheduler,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    'hyper_parameters': hyper_parameters
}

# train the model
train_model(**training_parameters)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24476, 'avg_loss': 0.024961398914456367, 'accuracy': 0.4, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24477, 'avg_loss': 0.02287379838526249, 'accuracy': 0.4027777777777778, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24478, 'avg_loss': 0.02401919476687908, 'accuracy': 0.4520547945205479, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24479, 'avg_loss': 0.023517075926065445, 'accuracy': 0.4305555555555556, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24480, 'avg_loss': 0.022578269243240356, 'accuracy': 0.44285714285714284, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24481, 'avg_loss': 0.022079134359955788, 'accuracy': 0.4583333333333333, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 24482, 'avg_loss': 0.0237238

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24476, 'avg_loss': 0.02378944680094719, 'accuracy': 0.4444444444444444, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24477, 'avg_loss': 0.01906847581267357, 'accuracy': 0.5, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24478, 'avg_loss': 0.02238433249294758, 'accuracy': 0.4383561643835616, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24479, 'avg_loss': 0.01584937795996666, 'accuracy': 0.5507246376811594, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24480, 'avg_loss': 0.01869957521557808, 'accuracy': 0.43283582089552236, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24481, 'avg_loss': 0.016935264691710472, 'accuracy': 0.49230769230769234, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 1, 'batch': 24482, 'avg_loss': 0.019010020

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24476, 'avg_loss': 0.011892314068973064, 'accuracy': 0.5970149253731343, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24477, 'avg_loss': 0.016523076221346855, 'accuracy': 0.5147058823529411, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24478, 'avg_loss': 0.01561446487903595, 'accuracy': 0.5074626865671642, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24479, 'avg_loss': 0.017188258469104767, 'accuracy': 0.5217391304347826, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24480, 'avg_loss': 0.018144527450203896, 'accuracy': 0.48, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24481, 'avg_loss': 0.0185911376029253, 'accuracy': 0.5072463768115942, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 2, 'batch': 24482, 'avg_loss': 0.02318862

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24476, 'avg_loss': 0.011199532076716423, 'accuracy': 0.6086956521739131, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24477, 'avg_loss': 0.013826698064804077, 'accuracy': 0.5915492957746479, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24478, 'avg_loss': 0.013066035695374012, 'accuracy': 0.5588235294117647, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24479, 'avg_loss': 0.012703316286206245, 'accuracy': 0.5857142857142857, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24480, 'avg_loss': 0.013630655594170094, 'accuracy': 0.5671641791044776, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24481, 'avg_loss': 0.014333566650748253, 'accuracy': 0.5416666666666666, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 3, 'batch': 24482, 'avg_

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24476, 'avg_loss': 0.013253453187644482, 'accuracy': 0.6486486486486487, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24477, 'avg_loss': 0.010188904590904713, 'accuracy': 0.6197183098591549, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24478, 'avg_loss': 0.013523970730602741, 'accuracy': 0.6153846153846154, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24479, 'avg_loss': 0.0072675892151892185, 'accuracy': 0.8088235294117647, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24480, 'avg_loss': 0.014964334666728973, 'accuracy': 0.547945205479452, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24481, 'avg_loss': 0.009558496996760368, 'accuracy': 0.6521739130434783, 'subsample_size': 'None'}
{'evaluate_set': 'train', 'epoch': 4, 'batch': 24482, 'avg_

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'evaluate_set': 'validation', 'epoch': 5, 'batch': 'STOP', 'avg_loss': 0.02697526104748249, 'accuracy': 0.42968380359254865, 'subsample_size': 1000}


In [None]:
# Release the Colab runtime once the training cell above has finished.
# NOTE(review): this ends the session, so in-memory state (model, datasets) is
# presumably lost and earlier cells must be re-run before continuing -- confirm.
from google.colab import runtime
runtime.unassign()

## BART-large-cnn

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# define model
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# define hyperparameters
per_device_train_batch_size = 16
learning_rate = 5e-05
num_train_epochs = 1

# define the best hyper parameter
hyper_parameters = {
    'learning_rate': learning_rate,
    'per_device_train_batch_size': per_device_train_batch_size,
    'num_train_epochs': num_train_epochs
}

# defining the output directory
output_dir = f'TrainingData/bart-large-cnn/epoch={num_train_epochs}_batch={per_device_train_batch_size}_lr={learning_rate}/'

# initialize dataloaders
dataloaders = {}
dataloaders['train'] = DataLoader(tokenized_datasets['train'], batch_size=per_device_train_batch_size, shuffle=True)
dataloaders['test'] = DataLoader(tokenized_datasets['test'], batch_size=per_device_train_batch_size)
dataloaders['validation'] = DataLoader(tokenized_datasets['validation'], batch_size=per_device_train_batch_size, shuffle=True)  # shuffle because we want to subsample

# initialize optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# initialize scheduler
# The scheduler is stepped once per training batch (see train_batch), so both
# the total horizon and the warmup must be counted in batches, not in samples.
# With a single epoch, a warmup derived from the sample count was longer than
# the whole run, so the learning rate never reached its configured peak.
num_training_steps = len(dataloaders['train']) * num_train_epochs
lr_scheduler_type = 'linear'
lr_scheduler_kwargs = {'optimizer': optimizer,
                       'num_warmup_steps': int(0.1 * num_training_steps),
                       'num_training_steps': num_training_steps}
scheduler = get_scheduler(lr_scheduler_type, **lr_scheduler_kwargs)

# define training parameters
training_parameters = {
    'model': model,
    'tokenizer': tokenizer,
    'output_dir': output_dir,
    'dataloaders': dataloaders,
    'optimizer': optimizer,
    'scheduler': scheduler,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    'hyper_parameters': hyper_parameters
}

# train the model
train_model(**training_parameters)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

{'evaluate_set': 'validation', 'epoch': 0, 'batch': 0, 'avg_loss': 0.7415982484817505, 'accuracy': 0.0046875, 'subsample_size': 1000}
{'evaluate_set': 'test', 'epoch': 0, 'batch': 0, 'avg_loss': 0.7468017339706421, 'accuracy': 0.0046875, 'subsample_size': 1000}
{'evaluate_set': 'train', 'epoch': 0, 'batch': 0, 'avg_loss': 0.7829321622848511, 'accuracy': 0.0125, 'subsample_size': 'None'}


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'evaluate_set': 'validation', 'epoch': 0, 'batch': 0, 'avg_loss': 0.7415981888771057, 'accuracy': 0.0046875, 'subsample_size': 100}
{'evaluate_set': 'validation', 'epoch': 1, 'batch': 'STOP', 'avg_loss': 0.7415981292724609, 'accuracy': 0.0046875, 'subsample_size': 1000}
{'evaluate_set': 'test', 'epoch': 1, 'batch': 'STOP', 'avg_loss': 0.7468017339706421, 'accuracy': 0.0046875, 'subsample_size': 1000}


In [None]:
# Release the Colab runtime once the training cell above has finished.
# NOTE(review): this ends the session, so in-memory state (model, datasets) is
# presumably lost and earlier cells must be re-run before continuing -- confirm.
from google.colab import runtime
runtime.unassign()