Inspired by [this notebook](https://github.com/ClimateBert/training-example)

In [2]:
import csv

import numpy as np
import pandas as pd

import torch

from sklearn.model_selection import train_test_split

from transformers import EarlyStoppingCallback
from transformers import RobertaTokenizerFast
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_utils import set_seed

from datasets import load_dataset

In [3]:
# Fix the random seed first (via transformers' set_seed) so reruns are comparable.
set_seed(42)

# Number of distinct TCFD disclosure labels the classifier has to separate.
num_classes = 11

# Labelled TCFD disclosure paragraphs, fetched by dataset id.
ds = load_dataset("rexarski/TCFD_disclosure")

Downloading readme:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/rexarski___parquet/rexarski--TCFD_disclosure-612a20a038979535/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/593 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/rexarski___parquet/rexarski--TCFD_disclosure-612a20a038979535/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Distinct label strings present in the (only) "train" split.
{label for label in ds["train"]["label"]}

{'Governance a)',
 'Governance b)',
 'Metrics and Targets a)',
 'Metrics and Targets b)',
 'Metrics and Targets c)',
 'Risk Management a)',
 'Risk Management b)',
 'Risk Management c)',
 'Strategy a)',
 'Strategy b)',
 'Strategy c)'}

In [5]:
# Integer class id for every TCFD label string; ids are assigned in the
# order the labels are listed here (alphabetical).
label_to_id = {
    label: class_id
    for class_id, label in enumerate(
        [
            "Governance a)",
            "Governance b)",
            "Metrics and Targets a)",
            "Metrics and Targets b)",
            "Metrics and Targets c)",
            "Risk Management a)",
            "Risk Management b)",
            "Risk Management c)",
            "Strategy a)",
            "Strategy b)",
            "Strategy c)",
        ]
    )
}

In [6]:
train_split = ds["train"]

# Paragraph texts and their integer class ids as two parallel NumPy arrays.
texts = np.array(train_split["text"])
labels = np.array([label_to_id[name] for name in train_split["label"]])

assert len(labels) == len(texts)  # 593

In [7]:
# Two-stage split using scikit-learn's `train_test_split`:
#   1. hold out 5% of all samples as the final test set;
#   2. split the remaining samples 70/30 into training and validation sets.
# The intermediate `rest_*` names keep the inputs of stage 2 distinct from its
# outputs, so the cell reads top-down and can be re-run safely.
rest_texts, test_texts, rest_labels, test_labels = train_test_split(
    texts, labels, test_size=0.05, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    rest_texts, rest_labels, test_size=0.3, random_state=42
)

# Invariants: every sample lands in exactly one split, and each split keeps its
# texts and labels aligned.
assert len(train_texts) + len(val_texts) + len(test_texts) == len(texts)
assert len(train_texts) == len(train_labels)
assert len(val_texts) == len(val_labels)
assert len(test_texts) == len(test_labels)

In [8]:
# Report how many samples each split received.
for split_name, split_texts in (
    ("Train", train_texts),
    ("Validation", val_texts),
    ("Test", test_texts),
):
    print(f"{split_name} samples: {len(split_texts)}")

Train samples: 394
Validation samples: 169
Test samples: 30


In [9]:
# Tokenizer matching the distilroberta-base checkpoint fine-tuned further down.
tokenizer = RobertaTokenizerFast.from_pretrained("distilroberta-base")

# Encode the training and validation texts with identical settings
# (truncation and padding both enabled).
train_encodings, val_encodings = (
    tokenizer(list(split_texts), padding=True, truncation=True)
    for split_texts in (train_texts, val_texts)
)


class TCFDDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name to an indexable collection holding
            one entry per sample.
        labels: indexable collection with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors.

        The dict holds one tensor per encoding field plus a "labels" tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


# Pair each split's encodings with its labels so the Trainer can index them.
train_dataset = TCFDDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TCFDDataset(encodings=val_encodings, labels=val_labels)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

In [10]:
# Use the first CUDA GPU when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [19]:
# Fine-tune distilroberta-base on the TCFD training split, evaluating and
# checkpointing once per epoch and stopping early when validation stalls.
#
# NOTE(review): push_to_hub=True presumably needs Hugging Face credentials, so
# `huggingface-cli login` must have been run before this cell on a fresh kernel.
training_args = TrainingArguments(
    output_dir="rexarski/distilroberta-tcfd-disclosure",  # output directory
    # overwrite_output_dir=True,
    push_to_hub=True,
    num_train_epochs=20,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_steps=50,  # number of warmup steps for learning rate scheduler
    weight_decay=0.02,  # strength of weight decay
    logging_dir="./logs",  # directory for storing logs
    logging_steps=10,
    # Mixed precision only when a CUDA GPU is present: a hard-coded True makes
    # the cell fail on the CPU fallback that the device cell explicitly allows.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=5,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Store the human-readable label names in the model config so the saved/pushed
# checkpoint reports TCFD categories instead of generic LABEL_<n> names.
id_to_label = {class_id: label for label, class_id in label_to_id.items()}

model = RobertaForSequenceClassification.from_pretrained(
    "distilroberta-base",
    num_labels=num_classes,
    id2label=id_to_label,
    label2id=label_to_id,
)
model.to(device)

# Stop once the monitored metric has failed to improve for 3 evaluations.
early_stop = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stop],
)

trainer.train()

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

Epoch,Training Loss,Validation Loss
1,No log,2.39868
2,2.402600,2.391503
3,2.402600,2.380177
4,2.379000,2.365997
5,2.379000,2.330592
6,2.311700,2.221888
7,2.311700,2.04847
8,1.988300,1.909535
9,1.988300,1.811157
10,1.597100,1.725075


TrainOutput(global_step=80, training_loss=1.664175134897232, metrics={'train_runtime': 346.8625, 'train_samples_per_second': 22.718, 'train_steps_per_second': 0.288, 'total_flos': 569312052423744.0, 'train_loss': 1.664175134897232, 'epoch': 16.0})

In [20]:
# Encode the held-out test texts with the same settings as the other splits.
test_encodings = tokenizer(list(test_texts), padding=True, truncation=True)
test_dataset = TCFDDataset(encodings=test_encodings, labels=test_labels)

# Keep only the raw per-class scores (first field of the prediction output).
x = trainer.predict(test_dataset).predictions

In [21]:
# Persist one tab-separated row per test sample:
#   text, gold label id, then one score per class (every column of `x`).
#
# - Mode "w" (not "a") keeps the cell idempotent: re-running it no longer
#   appends a second copy of every row to output.txt.
# - csv.writer quotes any text containing tabs, newlines or double quotes, so
#   such texts cannot shift columns when the file is read back.
# - Rows end in a plain "\n"; the previous "\n " terminator prefixed every
#   following row's text with a stray space.
with open("output.txt", "w", encoding="utf-8", newline="") as fd:
    writer = csv.writer(fd, delimiter="\t", lineterminator="\n")
    for text, label, scores in zip(test_texts, test_labels, x):
        writer.writerow([text, label, *scores])

In [22]:
# Column layout of output.txt: text, gold label id, then one score per class.
# The score column names are derived from num_classes so they are defined once
# and stay in step with the model's output width.
pred_columns = [f"pred_{class_id}" for class_id in range(num_classes)]

df = pd.read_csv(
    "output.txt",
    sep="\t",
    header=None,
    names=["text", "label", *pred_columns],
)

# Predicted class = position of the largest score within each row.
df["pred_class"] = df[pred_columns].to_numpy().argmax(axis=1)

# NOTE(review): X and y are not used anywhere in the visible notebook; kept in
# case a later cell relies on them.
X = []
y = []

In [23]:
# Preview the first five rows: text, gold label, per-class scores, prediction.
df.head(n=5)

Unnamed: 0,text,label,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_class
0,"In the first half of 2019, we ran pilot portfo...",4,0.079346,-0.141479,0.430908,-0.222168,-1.168945,2.810547,1.28125,0.059387,-1.760742,-0.810059,-0.623535,5
1,ATP has also chosen to calculate the equity p...,3,-0.685059,-0.585938,2.099609,2.578125,-0.010925,0.246338,-1.539062,-1.542969,-1.261719,-0.214111,0.770508,3
2,The Group constantly monitors developments in...,6,0.182861,0.053558,-1.0625,-1.419922,-1.745117,3.355469,2.429688,0.236572,-1.054688,-0.16687,-0.439209,5
3,It is essential that risk assessment and risk...,7,-0.136475,-0.103394,-1.859375,-1.93457,-1.625977,1.939453,2.447266,2.253906,-0.337402,-0.072021,-0.391357,6
4,Sustainability is not only our core value and...,9,-0.630859,-1.203125,-0.012466,-1.451172,0.28833,-0.716309,0.94873,0.316162,0.320068,2.849609,-0.571289,9


In [24]:
# Fraction of test rows whose predicted class equals the gold label.
accuracy = (df["label"] == df["pred_class"]).mean()
print(f"The accuracy in testing dataset is {accuracy:.4f}")

The accuracy in testing dataset is 0.3667


In [25]:
# Gold label next to the predicted class for every test row.
df.loc[:, ["label", "pred_class"]]

Unnamed: 0,label,pred_class
0,4,5
1,3,3
2,6,5
3,7,6
4,9,9
5,2,2
6,9,9
7,9,8
8,6,5
9,1,0


In [18]:
# Interactive Hugging Face Hub login (prompts for an access token).
# NOTE(review): the execution count In [18] shows this cell ran *before* the
# training cell (In [19], which sets push_to_hub=True) even though it appears
# after it here; on a fresh kernel, run this cell before training.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

In [26]:
# Push the trainer's model to the Hub; the captured output below shows the
# commits landing in rexarski/distilroberta-tcfd-disclosure.
trainer.push_to_hub()

To https://huggingface.co/rexarski/distilroberta-tcfd-disclosure
   35054b3..e58dc21  main -> main

   35054b3..e58dc21  main -> main

To https://huggingface.co/rexarski/distilroberta-tcfd-disclosure
   e58dc21..fe4bc8e  main -> main

   e58dc21..fe4bc8e  main -> main



'https://huggingface.co/rexarski/distilroberta-tcfd-disclosure/commit/e58dc216bb17de17ecc5faa5504ec07fa6cc1760'