In [None]:
!pip install -q transformers datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': 'drive/MyDrive/nlp_data/raw/train_data_for_bert.csv', 'test': 'drive/MyDrive/nlp_data/raw/test_data_for_bert.csv'})

Using custom data configuration default-330f6aaa3b98eea2


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-330f6aaa3b98eea2/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-330f6aaa3b98eea2/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['preprocessed_plot', 'action', 'sci-fi', 'comedy', 'horror', 'drama', 'animation', 'mystery', 'crime', 'fantasy', 'thriller', 'romance', 'adventure', 'biography'],
        num_rows: 24930
    })
    test: Dataset({
        features: ['preprocessed_plot', 'action', 'sci-fi', 'comedy', 'horror', 'drama', 'animation', 'mystery', 'crime', 'fantasy', 'thriller', 'romance', 'adventure', 'biography'],
        num_rows: 6233
    })
})

In [None]:
dataset['train'][0]

{'action': False,
 'adventure': False,
 'animation': False,
 'biography': False,
 'comedy': True,
 'crime': False,
 'drama': False,
 'fantasy': False,
 'horror': False,
 'mystery': False,
 'preprocessed_plot': "Tinto Brass receives letters containing real stories of women's erotic adventures which are subsequently transformed into short sex vignettes.",
 'romance': False,
 'sci-fi': False,
 'thriller': False}

In [None]:
labels = [label for label in dataset['train'].features.keys() if label not in ['preprocessed_plot']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['action',
 'sci-fi',
 'comedy',
 'horror',
 'drama',
 'animation',
 'mystery',
 'crime',
 'fantasy',
 'thriller',
 'romance',
 'adventure',
 'biography']

In [None]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["preprocessed_plot"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)



  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
encoded_dataset['train'][0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
tokenizer.decode(encoded_dataset['train'][0]['input_ids'])

"[CLS] tinto brass receives letters containing real stories of women's erotic adventures which are subsequently transformed into short sex vignettes. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [None]:
encoded_dataset['train'][0]['labels']

[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [None]:
[id2label[i] for i, label in enumerate(encoded_dataset['train'][0]['labels']) if label == 1.0]

['comedy']

In [None]:
encoded_dataset.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "action",
    "1": "sci-fi",
    "2": "comedy",
    "3": "horror",
    "4": "drama",
    "5": "animation",
    "6": "mystery",
    "7": "crime",
    "8": "fantasy",
    "9": "thriller",
    "10": "romance",
    "11": "adventure",
    "12": "biography"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "action": 0,
    "adventure": 11,
    "animation": 5,
    "biography": 

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [None]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput([('loss',
                           tensor(0.7645, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)),
                          ('logits',
                           tensor([[-0.2576,  0.2045, -0.2076,  0.7873, -0.5974,  0.4485,  0.3781,  0.2876,
                                     0.1439,  0.2107, -0.3950, -0.1006,  0.0937]],
                                  grad_fn=<AddmmBackward0>))])

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["train"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 24930
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15585


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.2942,0.25059,0.674595,0.780176,0.253109
2,0.249,0.201003,0.761928,0.837103,0.358002
3,0.2018,0.162779,0.823978,0.881059,0.474569
4,0.1705,0.130574,0.869138,0.908086,0.580064
5,0.1459,0.119198,0.886015,0.921223,0.626314


***** Running Evaluation *****
  Num examples = 24930
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-3117
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-3117/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-3117/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-3117/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-3117/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 24930
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-6234
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-6234/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-6234/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-6234/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/

TrainOutput(global_step=15585, training_loss=0.21939799339678717, metrics={'train_runtime': 4226.9599, 'train_samples_per_second': 29.489, 'train_steps_per_second': 3.687, 'total_flos': 8200008052876800.0, 'train_loss': 0.21939799339678717, 'epoch': 5.0})

In [None]:
pt_save_directory = "drive/MyDrive/nlp_data/pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

tokenizer config file saved in drive/MyDrive/nlp_data/pt_save_pretrained/tokenizer_config.json
Special tokens file saved in drive/MyDrive/nlp_data/pt_save_pretrained/special_tokens_map.json
Configuration saved in drive/MyDrive/nlp_data/pt_save_pretrained/config.json
Model weights saved in drive/MyDrive/nlp_data/pt_save_pretrained/pytorch_model.bin
