In [2]:
from datasets import load_dataset
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AdamW,
)
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import time
from tqdm import tqdm
from sklearn.metrics import f1_score

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
base_model = "bert-base-uncased"
# model_checkpoint = 'climatebert/distilroberta-base-climate-f'
ds = load_dataset("rexarski/climate_fever_fixed")

Downloading readme:   0%|          | 0.00/914 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/rexarski___parquet/rexarski--climate_fever_fixed-967e3bdb8fd2c62b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/331k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/763k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/279k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating valid split:   0%|          | 0/1842 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/4298 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1535 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/rexarski___parquet/rexarski--climate_fever_fixed-967e3bdb8fd2c62b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
ds

DatasetDict({
    valid: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1842
    })
    train: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 4298
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1535
    })
})

In [22]:
class climate_fever_f_bert(Dataset):
    def __init__(self, ds, base_model):
        self.label_dict = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2}

        self.train_df = ds["train"]
        self.val_df = ds["valid"]
        self.test_df = ds["test"]

        # pretrained base model tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(base_model, do_lower_case=True)
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.init_data()

    def init_data(self):
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)
        self.test_data = self.load_data(self.test_df)

    def load_data(self, df):
        MAX_LEN = 512
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        claim_list = df["claim"]
        evidence_list = df["evidence"]
        label_list = df["label"]

        for claim, evidence, label in zip(claim_list, evidence_list, label_list):
            claim_id = self.tokenizer.encode(
                claim, add_special_tokens=False, truncation=True, max_length=MAX_LEN
            )
            evidence_id = self.tokenizer.encode(
                evidence, add_special_tokens=False, truncation=True, max_length=MAX_LEN
            )
            pair_token_ids = (
                [self.tokenizer.cls_token_id]
                + claim_id
                + [self.tokenizer.sep_token_id]
                + evidence_id
                + [self.tokenizer.sep_token_id]
            )
            # if len(pair_token_ids) > MAX_LEN:
            #     pair_token_ids = pair_token_ids[:MAX_LEN]
            claim_len = len(claim_id)
            evidence_len = len(evidence_id)

            segment_ids = torch.tensor(
                [0] * (claim_len + 2) + [1] * (evidence_len + 1)
            )  # sentence 0 and sentence 1
            attention_mask_ids = torch.tensor(
                [1] * (claim_len + evidence_len + 3)
            )  # mask padded values

            # if len(segment_ids) > MAX_LEN:
            #     segment_ids = segment_ids[:MAX_LEN]
            # if len(attention_mask_ids) > MAX_LEN:
            #     attention_mask_ids = attention_mask_ids[:MAX_LEN]

            token_ids.append(torch.tensor(pair_token_ids))
            seg_ids.append(segment_ids)
            mask_ids.append(attention_mask_ids)
            # y.append(self.label_dict[label])
            y.append(label)

        print(f"token_ids length: {len(token_ids)}")
        print(f"seg_ids length: {len(seg_ids)}")
        print(f"mask_ids length: {len(mask_ids)}")
        print(f"y length: {len(y)}")

        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print(len(dataset))
        return dataset

    def get_data_loaders(self, batch_size=32, shuffle=True):
        train_loader = DataLoader(
            self.train_data, shuffle=shuffle, batch_size=batch_size, drop_last=True
        )

        val_loader = DataLoader(
            self.val_data, shuffle=shuffle, batch_size=batch_size, drop_last=True
        )

        test_loader = DataLoader(
            self.val_data, shuffle=shuffle, batch_size=batch_size, drop_last=True
        )

        return train_loader, val_loader, test_loader

In [23]:
climate_dataset = climate_fever_f_bert(ds, base_model)
train_loader, val_loader, test_loader = climate_dataset.get_data_loaders(batch_size=8)

token_ids length: 4298
seg_ids length: 4298
mask_ids length: 4298
y length: 4298
4298
token_ids length: 1842
seg_ids length: 1842
mask_ids length: 1842
y length: 1842
1842
token_ids length: 1535
seg_ids length: 1535
mask_ids length: 1535
y length: 1535
1535


In [8]:
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(base_model)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
special_tokens = [
    "CO2",
    "emissions",
    "temperature",
    "environmental",
    "soil",
    "increase",
    "conditions",
    "potential",
    "increased",
    "areas",
    "degrees",
    "across",
    "systems",
    "emission",
    "precipitation",
    "impacts",
    "compared",
    "countries",
    "sustainable",
    "provide",
    "reduction",
    "annual",
    "reduce",
    "greenhouse",
    "approach",
    "processes",
    "factors",
    "observed",
    "renewable",
    "temperatures",
    "distribution",
    "studies",
    "variability",
    "significantly",
    "–",
    "further",
    "regions",
    "addition",
    "showed",
    '"',
    "industry",
    "consumption",
    "regional",
    "risks",
    "atmospheric",
    "supply",
    "companies",
    "plants",
    "biomass",
    "electricity",
    "respectively",
    "activities",
    "communities",
    "climatic",
    "solar",
    "investment",
    "spatial",
    "rainfall",
    "•",
    "sustainability",
    "costs",
    "reduced",
    "2021",
    "influence",
    "vegetation",
    "sources",
    "possible",
    "ecosystem",
    "scenarios",
    "summer",
    "drought",
    "structure",
    "economy",
    "considered",
    "various",
    "atmosphere",
    "several",
    "technologies",
    "transition",
    "assessment",
    "dioxide",
    "ocean",
    "fossil",
    "patterns",
    "waste",
    "solutions",
    "transport",
    "strategy",
    "CH4",
    "policies",
    "understanding",
    "concentration",
    "customers",
    "methane",
    "applied",
    "increases",
    "estimated",
    "flood",
    "measured",
    "thermal",
    "concentrations",
    "decrease",
    "greater",
    "following",
    "proposed",
    "trends",
    "basis",
    "provides",
    "operations",
    "differences",
    "hydrogen",
    "adaptation",
    "methods",
    "capture",
    "variation",
    "reducing",
    "N2O",
    "parameters",
    "ecosystems",
    "investigated",
    "yield",
    "strategies",
    "indicate",
    "caused",
    "dynamics",
    "obtained",
    "efforts",
    "coastal",
    "become",
    "agricultural",
    "decreased",
    "GHG",
    "materials",
    "mainly",
    "relationship",
    "+/-",
    "challenges",
    "nitrogen",
    "forests",
    "trend",
    "estimates",
    "towards",
    "Committee",
    "seasonal",
    "developing",
    "particular",
    "importance",
    "tropical",
    "ratio",
    "2030",
    "composition",
    "employees",
    "characteristics",
    "scenario",
    "measurements",
    "plans",
    "fuels",
    "infrastructure",
    "overall",
    "responses",
    "presented",
    "least",
    "assess",
    "diversity",
    "periods",
    "delta",
    "included",
    "already",
    "targets",
    "achieve",
    "affect",
    "conducted",
    "operating",
    "populations",
    "variations",
    "studied",
    "additional",
    "construction",
    "northern",
    "variables",
    "soils",
    "ensure",
    "recovery",
    "combined",
    "decision",
    "practices",
    "however",
    "determined",
    "resulting",
    "mitigation",
    "conservation",
    "estimate",
    "identify",
    "observations",
    "losses",
    "productivity",
    "agreement",
    "monitoring",
    "investments",
    "pollution",
    "contribution",
    "opportunities",
    "simulations",
    "gases",
    "statements",
    "planning",
    "shares",
    "sediment",
    "flux",
    "requirements",
    "trees",
    "temporal",
    "determine",
    "southern",
    "previous",
    "integrated",
    "relatively",
    "analyses",
    "means",
    "2050",
    '"',
    "uncertainty",
    "pandemic",
    "fluxes",
    "findings",
    "moisture",
    "consistent",
    "decades",
    "snow",
    "performed",
    "contribute",
    "crisis",
]

num_added_toks = tokenizer.add_tokens(special_tokens)

print(f"Number of tokens added: {num_added_toks}")

Number of tokens added: 11


In [10]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "gamma", "beta"]
optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay_rate": 0.01,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay_rate": 0.0,
    },
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)


def multi_acc(y_pred, y_test):
    acc = (
        torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test
    ).sum().float() / float(y_test.size(0))
    return acc



In [11]:
EPOCHS = 5


def train(model, train_loader, val_loader, optimizer):
    total_step = len(train_loader)
    best_val_acc = 0

    for epoch in range(EPOCHS):
        start = time.time()
        model.train()

        total_train_acc = 0
        total_train_loss = 0

        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(
            enumerate(train_loader)
        ):
            optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)
            # print(f"pair_token_ids shape: {pair_token_ids.shape}")
            # print(f"mask_ids shape: {mask_ids.shape}")
            # print(f"seg_ids shape: {seg_ids.shape}")
            # print(f"y shape: {y.shape}")

            loss, prediction = model(
                pair_token_ids,
                token_type_ids=seg_ids,
                attention_mask=mask_ids,
                labels=labels,
            ).values()

            acc = multi_acc(prediction, labels)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            total_train_acc += acc.item()

        train_acc = total_train_acc / len(train_loader)
        train_loss = total_train_loss / len(train_loader)

        model.eval()

        total_val_acc = 0
        total_val_f1 = 0
        total_val_loss = 0

        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(
                val_loader
            ):
                optimizer.zero_grad()
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                seg_ids = seg_ids.to(device)
                labels = y.to(device)

                loss, prediction = model(
                    pair_token_ids,
                    token_type_ids=seg_ids,
                    attention_mask=mask_ids,
                    labels=labels,
                ).values()

                acc = multi_acc(prediction, labels)

                total_val_loss += loss.item()
                total_val_acc += acc.item()

        val_acc = total_val_acc / len(val_loader)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")
        val_loss = total_val_loss / len(val_loader)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(
            f"Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}"
        )
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

In [12]:
train(model, train_loader, val_loader, optimizer)

537it [06:01,  1.48it/s]


Epoch 1: train_loss: 0.8906 train_acc: 0.6338 | val_loss: 0.8580 val_acc: 0.6484
00:06:44.19


537it [05:58,  1.50it/s]


Epoch 2: train_loss: 0.8843 train_acc: 0.6362 | val_loss: 0.8681 val_acc: 0.6484
00:06:39.72


537it [05:58,  1.50it/s]


Epoch 3: train_loss: 0.8820 train_acc: 0.6373 | val_loss: 0.8605 val_acc: 0.6495
00:06:41.18


537it [05:59,  1.50it/s]


Epoch 4: train_loss: 0.8828 train_acc: 0.6373 | val_loss: 0.8609 val_acc: 0.6489
00:06:39.91


537it [05:58,  1.50it/s]


Epoch 5: train_loss: 0.8786 train_acc: 0.6350 | val_loss: 0.8574 val_acc: 0.6489
00:06:39.92


In [13]:
# Pick the best model and push it to huggingface

In [14]:
# load the saved state dict
state_dict = torch.load("best_model.pth")

# assign the state dict to the model
model.load_state_dict(state_dict)

<All keys matched successfully>

In [15]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

In [16]:
training_args = TrainingArguments(
    output_dir="rexarski/bert-base-climate-fever-fixed", push_to_hub=True
)
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer)

Cloning https://huggingface.co/rexarski/bert-base-climate-fever-fixed into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.0k/418M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Clean file training_args.bin:  28%|##8       | 1.00k/3.56k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/418M [00:00<?, ?B/s]

In [17]:
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/418M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.56k [00:00<?, ?B/s]

To https://huggingface.co/rexarski/bert-base-climate-fever-fixed
   e27dab6..2331fa1  main -> main

   e27dab6..2331fa1  main -> main

To https://huggingface.co/rexarski/bert-base-climate-fever-fixed
   2331fa1..114379c  main -> main

   2331fa1..114379c  main -> main



'https://huggingface.co/rexarski/bert-base-climate-fever-fixed/commit/2331fa153679199feebb2195ff6d661e97f03111'

In [24]:
total_test_acc = 0
total_test_loss = 0

with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(test_loader):
        optimizer.zero_grad()
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        loss, prediction = model(
            pair_token_ids,
            token_type_ids=seg_ids,
            attention_mask=mask_ids,
            labels=labels,
        ).values()

        acc = multi_acc(prediction, labels)

        total_test_loss += loss.item()
        total_test_acc += acc.item()

test_acc = total_test_acc / len(test_loader)
test_loss = total_test_loss / len(test_loader)

In [26]:
print(f"The accuracy of testing split is: {test_acc:.4f}")
print(f"The loss of testing split is: {test_loss:.4f}")

The accuracy of testing split is: 0.6484
The loss of testing split is: 0.8613
