## Train BERT model

In this notebook we fine-tune a BERT model (`bert-base-uncased`) for sentiment analysis of English text, using the SemEval dataset loaders from `pysentimiento`.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
from datasets import Dataset, Value, ClassLabel, Features
from pysentimiento.preprocessing import preprocess_tweet
from pysentimiento.semeval import load_datasets


# Load the SemEval splits; load_datasets() yields them in train/dev/test order.
semeval_splits = load_datasets()
train_dataset, dev_dataset, test_dataset = semeval_splits


39716 9929 20632


In [2]:
# autoreload is already enabled by the first cell; loading the extension a
# second time only produces an "already loaded" warning, so it is not repeated.
import os
from pysentimiento.tass import load_model
# Use the SemEval sentiment label mapping -- the same one the metrics in the
# training cell are computed with -- instead of the emotion-task labels, so the
# classification head and the saved config describe the task actually trained.
from pysentimiento.semeval.datasets import id2label

# Build the inverse mapping locally so both dictionaries are guaranteed to agree.
# NOTE(review): assumes id2label is a dict of {class index: label name} -- confirm.
label2id = {label: idx for idx, label in id2label.items()}

# Restrict CUDA to the device with index 1. This is specific to the machine the
# notebook was written on; change or remove it when running elsewhere.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

base_model = "bert-base-uncased"

# load_model returns the classification model together with its tokenizer.
model, tokenizer = load_model(
    base_model,
    id2label=id2label,
    label2id=label2id,
)




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading model bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
# Work on a small prefix of every split so the whole notebook runs quickly.
n_train_examples, n_eval_examples = 1000, 500

train_dataset = train_dataset.select(range(n_train_examples))
dev_dataset = dev_dataset.select(range(n_eval_examples))
test_dataset = test_dataset.select(range(n_eval_examples))

In [4]:
def tokenize(batch):
    """Encode the ``text`` column of ``batch`` with the notebook-level ``tokenizer``.

    Sequences are truncated and padded with ``padding='max_length'`` so that
    every encoded example has the same length.

    Args:
        batch: mapping with a ``'text'`` entry, as passed by ``Dataset.map``
            when called with ``batched=True``.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

batch_size = 32
eval_batch_size = 16

# The same sizes are reused later as the per-device batch sizes for training
# and evaluation; here they only control how many rows `map` hands to
# `tokenize` per call.
train_map_kwargs = dict(batched=True, batch_size=batch_size)
eval_map_kwargs = dict(batched=True, batch_size=eval_batch_size)

train_dataset = train_dataset.map(tokenize, **train_map_kwargs)
dev_dataset = dev_dataset.map(tokenize, **eval_map_kwargs)
test_dataset = test_dataset.map(tokenize, **eval_map_kwargs)


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




In [5]:
def format_dataset(dataset):
    """Prepare a tokenized split for training.

    Copies every example's ``label`` value into a ``labels`` field and sets
    the resulting dataset's format to ``'torch'``, restricted to the encoded
    inputs and the labels.

    Args:
        dataset: a tokenized split exposing ``map`` and ``set_format``.

    Returns:
        The dataset produced by ``map``, with its format set.
    """
    def _copy_label(example):
        # Expose the label under the additional key 'labels'.
        return {'labels': example['label']}

    torch_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    with_labels = dataset.map(_copy_label)
    with_labels.set_format(type='torch', columns=torch_columns)
    return with_labels

# Apply the same formatting to each split, in train/dev/test order.
train_dataset, dev_dataset, test_dataset = [
    format_dataset(split)
    for split in (train_dataset, dev_dataset, test_dataset)
]

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [6]:
from transformers import TrainingArguments, Trainer
from pysentimiento.metrics import compute_metrics
from pysentimiento.semeval.datasets import id2label

epochs = 3

# Number of optimizer steps per epoch. The last, partially filled batch still
# counts as a step, so round the division up instead of flooring it
# (1000 examples / batch size 32 -> 32 steps per epoch, 96 steps in total).
# Assumes a single device and no gradient accumulation, as configured here.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = epochs * steps_per_epoch
# Warm the learning rate up over the first 10% of the training steps.
warmup_steps = total_steps // 10

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    # Evaluate on the dev split at the end of every epoch; do_eval is set to
    # match, since evaluation does run with this strategy.
    evaluation_strategy="epoch",
    do_eval=True,
    weight_decay=0.01,
    logging_dir='./logs',
    # NOTE(review): reloading the best model depends on checkpoints having been
    # saved during training -- confirm the save schedule of the installed
    # transformers version.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Bind the label mapping that compute_metrics is called with.
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, id2label=id2label),
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Neg F1,Neg Precision,Neg Recall,Neu F1,Neu Precision,Neu Recall,Pos F1,Pos Precision,Pos Recall,Macro F1,Macro Precision,Macro Recall,Acc
1,No log,0.975051,0.0,0.0,0.0,0.626263,0.598456,0.65678,0.627635,0.556017,0.72043,0.417966,0.384824,0.45907,0.578
2,No log,0.832615,0.072289,0.6,0.038462,0.65362,0.607273,0.707627,0.679803,0.627273,0.741935,0.468571,0.611515,0.496008,0.616
3,No log,0.844396,0.503937,0.653061,0.410256,0.665245,0.669528,0.661017,0.688119,0.637615,0.747312,0.6191,0.653401,0.606195,0.654


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=96, training_loss=0.8679145177205404, metrics={'train_runtime': 61.455, 'train_samples_per_second': 1.562, 'total_flos': 0, 'epoch': 3.0})

In [7]:
# Score the trained model on the test split and display the metrics.
test_metrics = trainer.evaluate(test_dataset)
test_metrics

{'eval_loss': 0.6224976778030396,
 'eval_neg_f1': 0.5299145299145299,
 'eval_neg_precision': 0.5636363636363636,
 'eval_neg_recall': 0.5,
 'eval_neu_f1': 0.6826666666666665,
 'eval_neu_precision': 0.757396449704142,
 'eval_neu_recall': 0.6213592233009708,
 'eval_pos_f1': 0.8425196850393701,
 'eval_pos_precision': 0.7753623188405797,
 'eval_pos_recall': 0.9224137931034483,
 'eval_macro_f1': 0.6850336194038391,
 'eval_macro_precision': 0.6987983584403992,
 'eval_macro_recall': 0.6812576651573181,
 'eval_acc': 0.746,
 'eval_runtime': 2.5979,
 'eval_samples_per_second': 192.467,
 'epoch': 3.0}

In [8]:
path = "../models/test"

# Write the model first, then the tokenizer, into the same directory; the
# tokenizer call comes last so its return value is what the cell displays.
model.save_pretrained(save_directory=path)
tokenizer.save_pretrained(save_directory=path)

('../models/test/tokenizer_config.json',
 '../models/test/special_tokens_map.json',
 '../models/test/vocab.txt',
 '../models/test/added_tokens.json',
 '../models/test/tokenizer.json')