## Mejorar tokenización y performance

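¿Qué pasa si, en vez de rellenar (padding) todas las secuencias hasta una longitud fija de 128 tokens, las rellenamos solo hasta la longitud de la secuencia más larga de cada batch?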

In [3]:

import os

# Set the CUDA device-visibility variable so this process is limited to the GPU
# with index 1.
# NOTE(review): presumably this must run before any library initializes CUDA to
# take effect — confirm this cell is executed before the training cells.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [1]:
"""
Run sentiment experiments
"""
import os
import pathlib
import tempfile
import pandas as pd
from finetune_vs_scratch.model import load_model_and_tokenizer
from finetune_vs_scratch.preprocessing import preprocess
from transformers import Trainer, TrainingArguments
from datasets import Dataset, Value, ClassLabel, Features
from pysentimiento.tass import id2label, label2id
from pysentimiento.metrics import compute_metrics as compute_sentiment_metrics


# Repository root, expressed relative to the directory the notebook runs from.
project_dir = pathlib.Path("..")
# os.path.join returns plain strings, so both directories below are str paths.
data_dir = os.path.join(project_dir, "data")
sentiment_dir = os.path.join(project_dir, "data", "sentiment")


def load_datasets(data_path=None, limit=None):
    """
    Build the train / dev / test sentiment datasets from one CSV file.

    The CSV is read with pandas; its "polarity" column is converted to integer
    ids through `label2id` (stored as "label"), and its "text" column is passed
    through `preprocess`. Rows are then routed by the value of their "split"
    column ("train", "dev" or "test") and each group becomes a `Dataset`
    restricted to the "text", "lang" and "label" features.

    Arguments:
        data_path: path of the CSV file. When falsy, "tass.csv" inside
            `sentiment_dir` is used.
        limit: when truthy, every split is cut down to at most its first
            `limit` rows (handy for smoke tests).

    Returns:
        A `(train_dataset, dev_dataset, test_dataset)` tuple.
    """
    schema = Features({
        'text': Value('string'),
        'lang': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    csv_path = data_path or os.path.join(sentiment_dir, "tass.csv")
    frame = pd.read_csv(csv_path)
    frame["label"] = frame["polarity"].apply(lambda polarity: label2id[polarity])
    frame["text"] = frame["text"].apply(preprocess)

    # One Dataset per split name, kept in (train, dev, test) order.
    splits = [
        Dataset.from_pandas(frame[frame["split"] == split_name], features=schema)
        for split_name in ("train", "dev", "test")
    ]

    if limit:
        # Smoke test: keep only the first `limit` rows of each split.
        print("\n\n", f"Limiting to {limit} instances")
        splits = [
            split.select(range(min(limit, len(split))))
            for split in splits
        ]

    train_dataset, dev_dataset, test_dataset = splits
    return train_dataset, dev_dataset, test_dataset



In [8]:
def run(model_name, device, data_path=None, limit=None, epochs=5, batch_size=32, eval_batch_size=32, padding="max_length", **kwargs):
    """
    Fine-tune a sequence classifier on the sentiment data and score it on the test split.

    Padding is applied at tokenization time (see `padding`), so the padded
    length of every example is fixed before training starts.

    Arguments:
        model_name: model identifier handed to `load_model_and_tokenizer`.
        device: device argument handed to `load_model_and_tokenizer`.
        data_path: optional CSV path forwarded to `load_datasets`.
        limit: optional per-split row cap forwarded to `load_datasets`.
        epochs: number of training epochs.
        batch_size: per-device training batch size; also the chunk size used
            when tokenizing the training split.
        eval_batch_size: per-device evaluation batch size; also the chunk size
            used when tokenizing the dev and test splits.
        padding: `padding` argument given to the tokenizer.
        **kwargs: extra keyword arguments forwarded to `TrainingArguments`.

    Returns:
        The metrics returned by `trainer.evaluate` on the test split.
    """
    print("Running sentiment experiments")

    model, tokenizer = load_model_and_tokenizer(model_name, num_labels=len(label2id), device=device)
    train_dataset, dev_dataset, test_dataset = load_datasets(data_path=data_path, limit=limit)

    def tokenize(batch):
        return tokenizer(batch['text'], padding=padding, truncation=True)

    # Aim for 32 examples per optimizer step. Clamp to 1 because the integer
    # division yields 0 whenever batch_size > 32, and 0 accumulation steps is
    # not a usable setting.
    accumulation_steps = max(1, 32 // batch_size)

    train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
    dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
    test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)

    def format_dataset(dataset):
        # Copy "label" into "labels"; batched so the copy is done per chunk
        # instead of once per example.
        dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
        columns = ['input_ids', 'attention_mask', 'labels']
        if 'token_type_ids' in dataset.features:
            columns.append('token_type_ids')
        dataset.set_format(type='torch', columns=columns)
        return dataset

    train_dataset = format_dataset(train_dataset)
    dev_dataset = format_dataset(dev_dataset)
    test_dataset = format_dataset(test_dataset)

    # Checkpoints go to a throw-away directory. The context manager removes it
    # even when training or evaluation raises, and needs no shell command.
    with tempfile.TemporaryDirectory() as output_path:
        training_args = TrainingArguments(
            output_dir=output_path,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=eval_batch_size,
            gradient_accumulation_steps=accumulation_steps,
            warmup_ratio=0.1,
            evaluation_strategy="epoch",
            # NOTE(review): the recorded run still evaluates every epoch, so this
            # flag appears to be overridden by evaluation_strategy — confirm.
            do_eval=False,
            weight_decay=0.01,
            logging_dir='./logs',
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1",
            **kwargs,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=lambda x: compute_sentiment_metrics(x, id2label=id2label),
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
        )

        trainer.train()

        test_results = trainer.evaluate(test_dataset)

    return test_results


In [9]:
%%time
# Baseline: a single epoch with tokenizer-side padding set to "max_length".
# model_name stays at cell scope because the later run_with_collator cell reuses it.
model_name = 'dccuchile/bert-base-spanish-wwm-uncased'
run(model_name, "cuda", epochs=1, padding="max_length")

Running sentiment experiments


Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

  0%|          | 0/151 [00:00<?, ?ba/s]

  0%|          | 0/77 [00:00<?, ?ba/s]

  0%|          | 0/227 [00:00<?, ?ba/s]

  0%|          | 0/4802 [00:00<?, ?ex/s]

  0%|          | 0/2443 [00:00<?, ?ex/s]

  0%|          | 0/7264 [00:00<?, ?ex/s]

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lang, text.
***** Running training *****
  Num examples = 4802
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 151


Epoch,Training Loss,Validation Loss,Neg F1,Neg Precision,Neg Recall,Neu F1,Neu Precision,Neu Recall,Pos F1,Pos Precision,Pos Recall,Micro F1,Macro F1,Macro Precision,Macro Recall,Acc
1,No log,0.78937,0.701587,0.70607,0.697161,0.518375,0.530343,0.506936,0.698962,0.676944,0.722461,0.642652,0.639642,0.637786,0.642186,0.642652


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lang, text.
***** Running Evaluation *****
  Num examples = 2443
  Batch size = 32
Saving model checkpoint to /tmp/tmpffo2c7rw/checkpoint-151
Configuration saved in /tmp/tmpffo2c7rw/checkpoint-151/config.json
Model weights saved in /tmp/tmpffo2c7rw/checkpoint-151/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /tmp/tmpffo2c7rw/checkpoint-151 (score: 0.639641523361206).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lang, text.
***** Running Evaluation *****
  Num examples = 7264
  Batch size = 32


CPU times: user 1min 25s, sys: 30.2 s, total: 1min 55s
Wall time: 1min 50s


{'eval_loss': 0.7472413182258606,
 'eval_neg_f1': 0.7282453637660485,
 'eval_neg_precision': 0.7650805545147995,
 'eval_neg_recall': 0.6947941476692753,
 'eval_neu_f1': 0.5232530407822561,
 'eval_neu_precision': 0.49259092950157163,
 'eval_neu_recall': 0.5579857578840285,
 'eval_pos_f1': 0.7391580283477893,
 'eval_pos_precision': 0.7377533783783784,
 'eval_pos_recall': 0.7405680373039424,
 'eval_micro_f1': 0.6726321585903083,
 'eval_macro_f1': 0.6635521650314331,
 'eval_macro_precision': 0.6651416420936584,
 'eval_macro_recall': 0.6644493341445923,
 'eval_acc': 0.6726321585903083,
 'eval_runtime': 27.6482,
 'eval_samples_per_second': 262.729,
 'eval_steps_per_second': 8.21,
 'epoch': 1.0}

In [14]:
from transformers import DataCollatorWithPadding

def run_with_collator(model_name, device, data_path=None, limit=None, epochs=5, batch_size=32, eval_batch_size=32, padding=False, **kwargs):
    """
    Fine-tune and evaluate a sentiment classifier using per-batch padding.

    Tokenization uses `padding` (off by default), and the `Trainer` is given a
    `DataCollatorWithPadding(tokenizer, padding="longest")`, so padding is
    applied by the collator when each batch is assembled.

    Arguments:
        model_name: model identifier handed to `load_model_and_tokenizer`.
        device: device argument handed to `load_model_and_tokenizer`.
        data_path: optional CSV path forwarded to `load_datasets`.
        limit: optional per-split row cap forwarded to `load_datasets`.
        epochs: number of training epochs.
        batch_size: per-device training batch size; also the chunk size used
            when tokenizing the training split.
        eval_batch_size: per-device evaluation batch size; also the chunk size
            used when tokenizing the dev and test splits.
        padding: `padding` argument given to the tokenizer.
        **kwargs: extra keyword arguments forwarded to `TrainingArguments`.

    Returns:
        The metrics returned by `trainer.evaluate` on the test split.
    """
    print("Running sentiment experiments")

    model, tokenizer = load_model_and_tokenizer(model_name, num_labels=len(label2id), device=device)
    train_dataset, dev_dataset, test_dataset = load_datasets(data_path=data_path, limit=limit)

    def tokenize(batch):
        return tokenizer(batch['text'], padding=padding, truncation=True)

    # Aim for 32 examples per optimizer step. Clamp to 1 because the integer
    # division yields 0 whenever batch_size > 32, and 0 accumulation steps is
    # not a usable setting.
    accumulation_steps = max(1, 32 // batch_size)

    train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
    dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
    test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)

    # Checkpoints go to a throw-away directory. The context manager removes it
    # even when training or evaluation raises, and needs no shell command.
    with tempfile.TemporaryDirectory() as output_path:
        training_args = TrainingArguments(
            output_dir=output_path,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=eval_batch_size,
            gradient_accumulation_steps=accumulation_steps,
            warmup_ratio=0.1,
            evaluation_strategy="epoch",
            # NOTE(review): the recorded run still evaluates every epoch, so this
            # flag appears to be overridden by evaluation_strategy — confirm.
            do_eval=False,
            weight_decay=0.01,
            logging_dir='./logs',
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1",
            **kwargs,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            # Pads each batch up to its own longest sequence.
            data_collator=DataCollatorWithPadding(tokenizer, padding="longest"),
            compute_metrics=lambda x: compute_sentiment_metrics(x, id2label=id2label),
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
        )

        trainer.train()

        test_results = trainer.evaluate(test_dataset)

    return test_results


In [15]:
# Same model and epoch count as the "max_length" baseline run, but padding is
# left to the collator (longest sequence of each batch).
# NOTE(review): unlike the baseline cell, this one is not wrapped in %%time, so
# only the reported eval_runtime can be compared between the two runs.
run_with_collator(model_name, "cuda", epochs=1)

Running sentiment experiments


loading configuration file https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased/resolve/main/config.json from cache at /home/jmperez/.cache/huggingface/transformers/2416dab24674c27b5521594d6aa0929fc843a024c96711b1b5015cdff867291f.afa3630b664b4bd3e82d41660bdb96ec13236bbceadb0ae7c45c7c19f58652c7
Model config BertConfig {
  "_name_or_path": "dccuchile/bert-base-spanish-wwm-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_

  0%|          | 0/151 [00:00<?, ?ba/s]

  0%|          | 0/77 [00:00<?, ?ba/s]

  0%|          | 0/227 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lang, text.
***** Running training *****
  Num examples = 4802
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 151


Epoch,Training Loss,Validation Loss,Neg F1,Neg Precision,Neg Recall,Neu F1,Neu Precision,Neu Recall,Pos F1,Pos Precision,Pos Recall,Micro F1,Macro F1,Macro Precision,Macro Recall,Acc
1,No log,0.794383,0.708629,0.684985,0.733964,0.503674,0.535511,0.47541,0.686399,0.676389,0.69671,0.639378,0.632901,0.632295,0.635361,0.639378


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lang, text.
***** Running Evaluation *****
  Num examples = 2443
  Batch size = 32
Saving model checkpoint to /tmp/tmp80o8hioa/checkpoint-151
Configuration saved in /tmp/tmp80o8hioa/checkpoint-151/config.json
Model weights saved in /tmp/tmp80o8hioa/checkpoint-151/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /tmp/tmp80o8hioa/checkpoint-151 (score: 0.6329007744789124).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lang, text.
***** Running Evaluation *****
  Num examples = 7264
  Batch size = 32


{'eval_loss': 0.7470980286598206,
 'eval_neg_f1': 0.7365911799761621,
 'eval_neg_precision': 0.7372188139059305,
 'eval_neg_recall': 0.7359646138142225,
 'eval_neu_f1': 0.5104090129806516,
 'eval_neu_precision': 0.49220595181861126,
 'eval_neu_recall': 0.5300101729399797,
 'eval_pos_f1': 0.7274715660542431,
 'eval_pos_precision': 0.7514685946678716,
 'eval_pos_recall': 0.7049597286986011,
 'eval_micro_f1': 0.6701541850220264,
 'eval_macro_f1': 0.6581572890281677,
 'eval_macro_precision': 0.6602978110313416,
 'eval_macro_recall': 0.656978189945221,
 'eval_acc': 0.6701541850220264,
 'eval_runtime': 8.6203,
 'eval_samples_per_second': 842.659,
 'eval_steps_per_second': 26.333,
 'epoch': 1.0}