In [19]:
import csv
import json

import numpy as np
import pandas as pd
import requests
import torch
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback
from transformers import RobertaTokenizerFast
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_utils import set_seed

In [3]:
# Label vocabulary: label name as it appears in the data -> integer class id.
# Ids follow the order in which the names are listed here.
label_to_id = dict(
    zip(
        ("Governance", "Metrics and Targets", "Risk Management", "Strategy", "None"),
        range(5),
    )
)

# Number of output classes: one per entry in the label vocabulary.
num_classes = len(label_to_id)

# Seed the random number generators once, before any data handling or model
# construction, so repeated runs start from the same state.
set_seed(0)  # helper imported from transformers.trainer_utils
np.random.seed(0)  # NumPy's global generator

In [4]:
class TCFDDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-sample encodings with their labels.

    Args:
        encodings: Mapping from feature name to an indexable collection
            holding one entry per sample.
        labels: Indexable collection with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the feature dict for sample ``idx``.

        Every encoding field is converted to a tensor, and the sample's
        label is stored (also as a tensor) under the ``"labels"`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [22]:
# Download the labelled samples and turn them into two parallel arrays:
# `texts` (the raw strings) and `labels` (integer class ids from label_to_id).
url = "https://raw.githubusercontent.com/ClimateBert/training-example/main/training_data.json"

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP failures (404, 5xx, ...) right here instead
# of letting an error page reach the JSON parser and fail with a confusing
# decoding error.
response = requests.get(url, timeout=60)
response.raise_for_status()
training_data = json.loads(response.text)

# Each sample is read through its "text" and "label" keys. A label missing from
# label_to_id raises KeyError rather than being silently skipped.
texts = np.array([sample["text"] for sample in training_data])
labels = np.array([label_to_id[sample["label"]] for sample in training_data])

assert len(texts) == len(labels)

In [24]:
# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [25]:
# Two-stage split with a fixed random_state so the partition is repeatable.
# `train_test_split` comes from sklearn.model_selection and must be imported
# in the notebook's import cell for this cell to run on a fresh kernel.

# Stage 1: hold out 5% of all samples as the final test set.
trainval_texts, test_texts, trainval_labels, test_labels = train_test_split(
    texts, labels, test_size=0.05, random_state=42
)

# Stage 2: carve 30% of the remainder off as the validation set. The stage-1
# output gets its own name so `train_texts` only ever means the final
# training split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    trainval_texts, trainval_labels, test_size=0.3, random_state=42
)

# Invariants: nothing lost or duplicated, and texts/labels stay aligned.
assert len(train_texts) + len(val_texts) + len(test_texts) == len(texts)
assert len(train_texts) == len(train_labels)
assert len(val_texts) == len(val_labels)
assert len(test_texts) == len(test_labels)

In [26]:
# Report how many samples ended up in each split, one line per split.
print(
    f"Train samples: {len(train_texts)}",
    f"Validation samples: {len(val_texts)}",
    f"Test samples: {len(test_texts)}",
    sep="\n",
)

Train samples: 34886
Validation samples: 14952
Test samples: 2624


In [27]:
# Tokenizer for the same "distilroberta-base" checkpoint the model is loaded from.
tokenizer = RobertaTokenizerFast.from_pretrained("distilroberta-base")

# Tokenize each split in a single batched call, with truncation and padding
# enabled so every row within a split comes back with the same length.
train_encodings = tokenizer(
    list(train_texts),
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    list(val_texts),
    padding=True,
    truncation=True,
)

In [28]:
# Wrap the tokenized splits and their labels in TCFDDataset objects.
train_dataset = TCFDDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TCFDDataset(encodings=val_encodings, labels=val_labels)

In [38]:
# Training configuration. Note that running this cell creates a brand-new model
# and Trainer, so re-running it after training discards the fine-tuned weights
# held by the previous `trainer`.
training_args = TrainingArguments(
    # Where results go.
    output_dir="rexarski/distilroberta-tcfd-disclosure-5",
    push_to_hub=True,
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching: 24 samples per device, gradients accumulated over 4 batches
    # before each optimizer step.
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=4,
    fp16=True,  # mixed-precision training
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Pretrained encoder with a classification head sized for `num_classes` labels.
model = RobertaForSequenceClassification.from_pretrained(
    "distilroberta-base", num_labels=num_classes
)

model.to(device)

# Early stopping with a patience of 2.
early_stop = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stop],
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

In [30]:
# Run fine-tuning. The call is left as a bare expression so its return value
# (the TrainOutput summary) is displayed as the cell's result.
trainer.train()

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

Epoch,Training Loss,Validation Loss
0,0.5919,0.613401
2,0.5703,0.518818
2,0.4197,0.480462
4,0.3114,0.512656
4,0.2729,0.542843


TrainOutput(global_step=1817, training_loss=0.4900701623994306, metrics={'train_runtime': 3307.9393, 'train_samples_per_second': 105.461, 'train_steps_per_second': 1.097, 'total_flos': 2.31075245511168e+16, 'train_loss': 0.4900701623994306, 'epoch': 5.0})

In [31]:
# Tokenize the held-out test split with the same tokenizer settings and wrap it
# together with its labels.
test_encodings = tokenizer(
    list(test_texts),
    padding=True,
    truncation=True,
)
test_dataset = TCFDDataset(encodings=test_encodings, labels=test_labels)

# Trainer.predict returns a named tuple; its first field (`predictions`) holds
# the model outputs, one row per test sample.
x = trainer.predict(test_dataset).predictions

In [33]:
# Dump one tab-separated row per test sample to output.txt:
#   text, true label id, then the prediction values from `x` for that sample.
#
# - Mode "w": the file is rewritten on every run. Appending would duplicate
#   rows whenever the cell is re-run and skew anything computed from the file.
# - csv.writer quotes fields that contain tabs, newlines or quote characters,
#   so free-form text cannot break the column layout. pandas.read_csv with
#   sep="\t" reads this quoting back with its default settings.
# - Rows end with a plain "\n" and no trailing space, so no row picks up a
#   leading blank and the file does not end in a whitespace-only line.
# - zip() walks the arrays directly; iterating the Dataset would build tensors
#   for every sample only to obtain a running index.
with open("output.txt", "w", encoding="utf-8", newline="") as fd:
    writer = csv.writer(fd, delimiter="\t", lineterminator="\n")
    for text, label, scores in zip(test_texts, test_labels, x):
        # Format the numbers explicitly so their text form does not depend on
        # how the csv module stringifies NumPy scalars.
        writer.writerow([str(text), f"{label}", *(f"{score}" for score in scores)])

In [34]:
# Read the tab-separated prediction dump back in. The file has no header row,
# so the column names are supplied here: sample text, true label id, then one
# prediction column per class.
df = pd.read_csv(
    "output.txt",
    sep="\t",
    header=None,
    names=["text", "label", "pred_0", "pred_1", "pred_2", "pred_3", "pred_4"],
)

# Predicted class = position of the largest value among the pred_* columns.
df["pred_class"] = (
    df[["pred_0", "pred_1", "pred_2", "pred_3", "pred_4"]].values.argmax(axis=1)
)

# Empty placeholders; no visible cell reads them.
X, y = [], []

In [35]:
# Accuracy = share of rows whose arg-max prediction equals the true label.
is_correct = df["label"] == df["pred_class"]
print(f"The accuracy in testing dataset is {np.mean(is_correct):.4f}")

The accuracy in testing dataset is 0.8155


In [36]:
# Interactive login to the Hugging Face Hub via the CLI (prompts for a token).
# NOTE(review): the TrainingArguments cell sets push_to_hub=True, so the login
# presumably has to be in place before that cell runs — confirm the cell order.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

In [39]:
# Upload the model held by the current `trainer` to the Hub repository; the
# returned commit URL is displayed as the cell result.
# NOTE(review): the execution counts suggest the Trainer setup cell (In [38])
# was re-run after training (In [30]). That cell re-creates the model from
# "distilroberta-base", so this push may have uploaded a freshly initialised
# model rather than the fine-tuned one — confirm, and re-run top to bottom.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/313M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.50k [00:00<?, ?B/s]

To https://huggingface.co/rexarski/distilroberta-tcfd-disclosure-5
   3943a33..52c5634  main -> main

   3943a33..52c5634  main -> main

To https://huggingface.co/rexarski/distilroberta-tcfd-disclosure-5
   52c5634..fb47cee  main -> main

   52c5634..fb47cee  main -> main



'https://huggingface.co/rexarski/distilroberta-tcfd-disclosure-5/commit/52c5634387da0f836293605ef4975f8a374a0d9b'