In [1]:
!pip install transformers==4.24 cache_decorator pytorch_lightning==1.6.3 torchmetrics==0.7.0

Collecting transformers==4.24
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cache_decorator
  Downloading cache_decorator-2.1.15.tar.gz (35 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch_lightning==1.6.3
  Downloading pytorch_lightning-1.6.3-py3-none-any.whl (584 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.0/584.0 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics==0.7.0
  Downloading torchmetrics-0.7.0-py3-none-any.whl (396 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.6/396.6 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0 (from transformers==4.24)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m

In [2]:
# Mount Google Drive inside the Colab VM. Every path used by the following cells
# (helper modules, datasets, checkpoints, logs) lives under /content/drive/MyDrive/history.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sys

# Drive folder added to the import path; presumably it contains the
# paragraph_models_utils / paragraph_models modules imported by the next cell.
HISTORY_DIR = '/content/drive/MyDrive/history'

# Only append when missing, so re-running this cell does not pile up
# duplicate entries in sys.path.
if HISTORY_DIR not in sys.path:
    sys.path.append(HISTORY_DIR)

In [4]:
from paragraph_models_utils import (
    load_data, preprocess_data
)
from paragraph_models import (
    FreezeCallback, MultiTaskLearningModel, WikiDataModule,
    BiLSTM_clf, Bert_clf
)
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Fix the global random seed to 42 for reproducibility (the call logs and returns the seed).
# NOTE(review): per the Lightning docs, workers=True should also seed dataloader workers — confirm for v1.6.3.
seed_everything(42, workers=True)

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


42

In [5]:
import os
import torch
import json
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast

# Ask spaCy to use the GPU when one is available.
spacy.prefer_gpu()
# spaCy's small English pipeline for written (web) text.
# The medium and large pipelines are bigger and slower, but perform
# only slightly better.
# The same pipeline was also used for Data Analytics.
nlp = spacy.load("en_core_web_sm")

# Read the raw wiki dataset (JSON) from Drive. The path contains no
# placeholders, so it is a plain string literal rather than an f-string.
with open(
    "/content/drive/MyDrive/history/datasets/wiki/wiki_dataset.json",
    encoding="utf-8"
) as f_in:
    dataset = json.load(f_in)

In [6]:
# `texts` are the tokens of each paragraph (original note: extracted via BERT).
# NOTE(review): load_data receives the spaCy pipeline `nlp`, so the tokens may
# actually come from spaCy — confirm against paragraph_models_utils.
# `tags` specify whether each token is historical or not.
# `labels` specify whether each paragraph is historical or not.
texts, tags, labels = load_data(dataset, nlp)

# Hold out 20% as the test set, then 20% of the remainder as the validation set.
# Both splits are stratified on the paragraph label and use a fixed random_state.
train_texts, test_texts, train_tags, test_tags, train_labels, test_labels = train_test_split(
    texts, tags, labels, test_size=0.2, random_state=42, stratify=labels
)
train_texts, valid_texts, train_tags, valid_tags, train_labels, valid_labels = train_test_split(
    train_texts, train_tags, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

# BERT uncased was trained on text that had first been lowercased.
# BERT cased was trained on the original text.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Root folder of the preprocessed dataset; one sub-folder per split.
WIKI_DIR = "/content/drive/MyDrive/history/datasets/wiki"
splits = {
    "train": (train_texts, train_tags, train_labels),
    "valid": (valid_texts, valid_tags, valid_labels),
    "test": (test_texts, test_tags, test_labels),
}

# Create every output folder up front; exist_ok=True makes this safe to re-run.
for split_name in splits:
    os.makedirs(os.path.join(WIKI_DIR, split_name), exist_ok=True)

# Encode each split and store its tensors/mappings as <split>/<name>.pkl.
# The loop deliberately reuses the names `encodings`, `tokens_labels`, `labels`,
# `tag2idx` and `idx2tag`, so after this cell they hold the test-split values
# (and `labels` no longer holds the output of load_data).
# NOTE(review): tag2idx/idx2tag are recomputed for every split; if preprocess_data
# derives them from the tags it is given, the mappings could differ between
# splits — confirm they are fixed.
for split_name, (split_texts, split_tags, split_labels) in splits.items():
    encodings, tokens_labels, labels, tag2idx, idx2tag = preprocess_data(
        split_texts.tolist(), split_tags.tolist(),
        split_labels.tolist(), tokenizer, padding="max_length"
    )
    artifacts = {
        "input_ids": encodings.input_ids,
        "attention_mask": encodings.attention_mask,
        "tokens_labels": tokens_labels,
        "labels": labels,
        "tag2idx": tag2idx,
        "idx2tag": idx2tag,
    }
    for stem, payload in artifacts.items():
        torch.save(payload, os.path.join(WIKI_DIR, split_name, f"{stem}.pkl"))

Creating tags:   0%|          | 0/12876 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  arr = asarray(arr)


Adjusting tags to encodings: 0it [00:00, ?it/s]

Adjusting tags to encodings: 0it [00:00, ?it/s]

Adjusting tags to encodings: 0it [00:00, ?it/s]

In [7]:
# Make sure the checkpoint root exists. exist_ok=True turns the call into a
# no-op when the folder is already there, so no separate existence check
# (and no check-then-create race) is needed.
os.makedirs("/content/drive/MyDrive/history/checkpoints/", exist_ok=True)

In [8]:
# Number of training epochs; passed as `max_epochs` to the Trainer of both
# experiments below (multi-task model and BERT classifier).
max_epochs = 10

### Multi-Task Learning model

In [9]:
# Checkpointing for the multi-task run: files go to checkpoints/mtl and the
# callback is triggered every epoch. No `monitor` metric is set here (the BERT
# classifier section further down does set one), so checkpoints are not ranked
# by a validation metric.
checkpoint_callback = ModelCheckpoint(
    dirpath="/content/drive/MyDrive/history/checkpoints/mtl",
    every_n_epochs=1
)

# Project callback from paragraph_models. The training log further down shows it
# keeping the base model frozen for epochs 0-3 and unfreezing it from epoch 4 on.
freeze_callback = FreezeCallback()

In [10]:
# Data module over the wiki dataset folder, with mini-batches of 4.
# Presumably it reads the .pkl files written by the preprocessing cell — confirm in WikiDataModule.
dm = WikiDataModule(
    batch_size=4,
    data_dir="/content/drive/MyDrive/history/datasets/wiki/"
)
# Multi-task model built with its default arguments. The summary printed by
# trainer.fit lists a BERT base model plus a sequence head and a token head.
model = MultiTaskLearningModel()
# TensorBoard logs for this experiment are written under logs/mtl.
logger = TensorBoardLogger(
    save_dir="/content/drive/MyDrive/history/logs", name="mtl"
)

  rank_zero_deprecation("DataModule property `dims` was deprecated in v1.5 and will be removed in v1.7.")


Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  stream(template_mgs % msg_args)


In [11]:
# deterministic=True ensures that the results are reproducible.
# The trainer runs for `max_epochs` epochs on whatever accelerator is available,
# logs to the TensorBoard logger and uses the checkpoint and freeze callbacks.
trainer = Trainer(
    max_epochs=max_epochs,
    accelerator="auto",
    deterministic=True,
    num_sanity_val_steps=0,
    logger=logger,
    callbacks=[checkpoint_callback, freeze_callback]
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [12]:
# Train the multi-task model on the wiki data module.
trainer.fit(model, dm)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type          | Params
------------------------------------------------
0 | base_model    | BertModel     | 108 M 
1 | seq_clf       | Sequential    | 591 K 
2 | tokens_clf    | Sequential    | 3.8 K 
3 | multi_loss    | MultiTaskLoss | 2     
4 | seqc_accuracy | Accuracy      | 0     
5 | tokc_accuracy | Accuracy      | 0     
6 | seqc_f1       | F1            | 0     
7 | tokc_f1       | F1            | 0     
8 | seqc_prec     | Precision     | 0     
9 | seqc_recall   | Recall        | 0     
------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.622   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Epoch number 0, freezing base.
Base frozen.


Validation: 0it [00:00, ?it/s]

Epoch number 1, freezing base.
Base frozen.


Validation: 0it [00:00, ?it/s]

Epoch number 2, freezing base.
Base frozen.


Validation: 0it [00:00, ?it/s]

Epoch number 3, freezing base.
Base frozen.


Validation: 0it [00:00, ?it/s]

Epoch number 4, unfreezing base.
Base unfrozen.


Validation: 0it [00:00, ?it/s]

Epoch number 5, unfreezing base.
Base unfrozen.


Validation: 0it [00:00, ?it/s]

Epoch number 6, unfreezing base.
Base unfrozen.


Validation: 0it [00:00, ?it/s]

Epoch number 7, unfreezing base.
Base unfrozen.


Validation: 0it [00:00, ?it/s]

Epoch number 8, unfreezing base.
Base unfrozen.


Validation: 0it [00:00, ?it/s]

Epoch number 9, unfreezing base.
Base unfrozen.


Validation: 0it [00:00, ?it/s]

Epoch number 10, unfreezing base.
Base unfrozen.


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [15]:
# Checkpoint written at the end of the multi-task training run (epoch 9).
MTL_CKPT = "/content/drive/MyDrive/history/checkpoints/mtl/epoch=9-step=6920.ckpt"

dm = WikiDataModule(
    batch_size=4,
    data_dir="/content/drive/MyDrive/history/datasets/wiki/"
)
# load_from_checkpoint is a classmethod, so call it on the class itself.
# Calling it on a fresh instance first builds (and discards) a whole extra model,
# which is why the BERT initialisation warning was printed twice.
model = MultiTaskLearningModel.load_from_checkpoint(MTL_CKPT)
# NOTE(review): this logger is not attached to anything in this cell; `trainer`
# below is the one created in the training section and keeps its own logger.
logger = TensorBoardLogger(
    save_dir="/content/drive/MyDrive/history/logs", name="mtl"
)

# The metrics on the test set are computed here.
# NOTE(review): relies on `trainer` from the training section still being alive
# in the kernel; on a fresh kernel the Trainer has to be created first.
trainer.test(
    model,
    dm,
    ckpt_path=MTL_CKPT
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls

Testing: 0it [00:00, ?it/s]

[{'test_seqc_acc_epoch': 0.8522907495498657,
  'test_tokc_acc_epoch': 0.3574407398700714,
  'test_seqc_f1_epoch': 0.7763034701347351,
  'test_tokc_f1_epoch': 0.37700697779655457,
  'test_seqc_prec_epoch': 0.7527645826339722,
  'test_seqc_recall_epoch': 0.8522907495498657}]

### BERT classifier model

In [9]:
# save_top_k makes the callback keep the K best models.
# mode specifies whether to keep the models with the highest or the lowest value
# of the monitored metric.
# monitor specifies the metric being considered.
checkpoint_callback = ModelCheckpoint(
    dirpath="/content/drive/MyDrive/history/checkpoints/bert_clf",
    every_n_epochs=1,
    save_top_k=1,
    monitor="valid_f1",
    mode="max"
)

In [10]:
# Data module over the wiki dataset folder, with mini-batches of 4.
dm = WikiDataModule(
    batch_size=4,
    data_dir="/content/drive/MyDrive/history/datasets/wiki"
)
# BERT classifier with dropout 0.5; freeze_bert=False presumably leaves the BERT
# weights trainable (the model summary reports no non-trainable parameters).
model = Bert_clf(
    dropout_rate=0.5,
    freeze_bert=False
)
# TensorBoard logs for this experiment are written under logs/bert_clf.
logger = TensorBoardLogger(
    save_dir="/content/drive/MyDrive/history/logs", name="bert_clf"
)
# precision specifies the numeric precision used. With 16, half precision is used.
# With half precision, a combination of 16-bit and 32-bit floating point is used
# to reduce memory usage.
# max_epochs specifies the number of training epochs. By default it is 1000.
# gradient_clip_val specifies the value at which gradients are clipped.
trainer = Trainer(
    max_epochs=max_epochs,
    accelerator="auto",
    deterministic=True,
    precision=16,
    gradient_clip_val=1,
    num_sanity_val_steps=0,
    logger=logger,
    callbacks=[checkpoint_callback]
)

  rank_zero_deprecation("DataModule property `dims` was deprecated in v1.5 and will be removed in v1.7.")


Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  stream(template_mgs % msg_args)
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilit

In [11]:
# Train the BERT classifier on the wiki data module.
trainer.fit(model, dm)

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type      | Params
---------------------------------------
0 | bert     | BertModel | 108 M 
1 | dropout1 | Dropout   | 0     
2 | clf      | Linear    | 769   
3 | accuracy | Accuracy  | 0     
4 | f1       | F1        | 0     
5 | prec     | Precision | 0     
6 | recall   | Recall    | 0     
---------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
216.622   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [12]:
# Checkpoint kept by the ModelCheckpoint callback of the BERT classifier run
# (save_top_k=1 on "valid_f1").
BERT_CLF_CKPT = "/content/drive/MyDrive/history/checkpoints/bert_clf/epoch=0-step=692.ckpt"

dm = WikiDataModule(
    batch_size=4,
    data_dir="/content/drive/MyDrive/history/datasets/wiki"
)
# load_from_checkpoint is a classmethod, so call it on the class itself and pass
# the constructor arguments as keyword overrides. Instantiating Bert_clf first
# builds (and discards) an extra model, which is why the BERT initialisation
# warning was printed twice.
model = Bert_clf.load_from_checkpoint(
    BERT_CLF_CKPT,
    dropout_rate=0.5,
    freeze_bert=False
)
# NOTE(review): this logger is not attached to anything in this cell; `trainer`
# below is the one created before training and keeps its own logger.
logger = TensorBoardLogger(
    save_dir="/content/drive/MyDrive/history/logs", name="bert_clf"
)

# The metrics on the test set are computed here.
# NOTE(review): relies on `trainer` from the training cell still being alive in
# the kernel; on a fresh kernel the Trainer has to be created first.
trainer.test(
    model,
    dm,
    ckpt_path=BERT_CLF_CKPT
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.

Testing: 0it [00:00, ?it/s]

[{'test_acc_epoch': 0.5308057069778442,
  'test_f1_epoch': 0.5221619009971619,
  'test_prec_epoch': 0.5967614054679871,
  'test_recall_epoch': 0.5308057069778442}]