In [13]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from transformers import AdamW
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [14]:
# Use the first CUDA device when torch reports one is available; otherwise run on the CPU.
# Later cells move the model and every batch onto this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [1]:
# Load the dataset by its hub identifier; later cells read its "train", "test" and "valid" splits.
ds = load_dataset('rexarski/climate_fever_fixed')

Downloading readme: 100%|██████████| 914/914 [00:00<00:00, 435kB/s]


Downloading and preparing dataset None/None to /home/codespace/.cache/huggingface/datasets/rexarski___parquet/rexarski--climate_fever_fixed-967e3bdb8fd2c62b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 763k/763k [00:00<00:00, 41.1MB/s]
Downloading data: 100%|██████████| 279k/279k [00:00<00:00, 55.4MB/s]]
Downloading data: 100%|██████████| 331k/331k [00:00<00:00, 29.4MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  4.46it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1203.88it/s]
                                                                       

Dataset parquet downloaded and prepared to /home/codespace/.cache/huggingface/datasets/rexarski___parquet/rexarski--climate_fever_fixed-967e3bdb8fd2c62b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 376.16it/s]


In [4]:
# Step 1: copy every row of each split into a plain dict.
train_split_dict_list = [dict(item.items()) for item in ds["train"]]
test_split_dict_list = [dict(item.items()) for item in ds["test"]]
val_split_dict_list = [dict(item.items()) for item in ds["valid"]]

# Step 2: build one DataFrame per split from those row dicts.
train_df = pd.DataFrame(train_split_dict_list)
test_df = pd.DataFrame(test_split_dict_list)
val_df = pd.DataFrame(val_split_dict_list)

In [5]:
# Show the column dtypes as loaded, before any casting.
print(train_df.dtypes)

# Force both text columns to str on the train and validation frames
# (test_df is intentionally left as loaded, exactly as before).
for frame in (train_df, val_df):
    for column in ('claim', 'evidence'):
        frame[column] = frame[column].astype(str)

claim_id     int64
claim       object
evidence    object
label        int64
category    object
dtype: object


In [6]:
def trim_text(df, threshold=512):
    """Drop rows whose claim and evidence are, together, longer than ``threshold``.

    Lengths are measured with ``len()`` on each value, i.e. in characters,
    not in tokens. The input frame is not modified.

    Args:
        df: DataFrame with ``claim`` and ``evidence`` columns.
        threshold: Largest allowed ``len(claim) + len(evidence)``; inclusive.

    Returns:
        A new DataFrame with ``claim_length``, ``evidence_length`` and
        ``total_length`` columns added, sorted by ``total_length`` in
        descending order and restricted to rows within the threshold.
    """
    claim_chars = df['claim'].apply(len)
    evidence_chars = df['evidence'].apply(len)

    with_lengths = df.assign(
        claim_length=claim_chars,
        evidence_length=evidence_chars,
        total_length=claim_chars + evidence_chars,
    )
    longest_first = with_lengths.sort_values('total_length', ascending=False)

    within_limit = longest_first['total_length'] <= threshold
    return longest_first[within_limit]

# Keep only the pairs whose claim + evidence is at most 128 characters long
# (trim_text measures with len(), so this is a character budget, not a token budget).
# The returned frames also carry the three *_length helper columns.
train_df = trim_text(train_df, 128)
val_df = trim_text(val_df, 128)

In [7]:
# trim_text returns rows sorted by total_length descending, so these are the longest surviving pairs.
train_df.head()

Unnamed: 0,claim_id,claim,evidence,label,category,claim_length,evidence_length,total_length
8,2083,Climategate CRU emails suggest conspiracy,"""'Conspiracy theories finally laid to rest' by...",1,Phil Jones (climatologist),41,87,128
4165,2168,IPCC graph showing accelerating trends is misl...,The IPCC needs to look at this trend in the er...,0,Intergovernmental Panel on Climate Change,52,76,128
753,2401,Mother Earth has clearly ruled that CO2 is not...,Its soil is utterly barren and its atmosphere ...,2,Earth in science fiction,60,68,128
1609,2330,Arctic sea ice has been retreating over the pa...,"""What drove the dramatic arctic sea ice retrea...",2,Global warming,58,69,127
1045,1014,Final data for 2016 sea level rise have yet to...,"Between 1900 and 2016, the sea level rose by 1...",1,Sea level rise,60,67,127


In [8]:
# Number of rows and columns that survived the length filter in each frame.
print(train_df.shape)
print(val_df.shape)

(151, 8)
(64, 8)


In [11]:
# base_model is the name the climateFever class loads its tokenizer from;
# model_checkpoint is the name the classification model weights are loaded from.
# NOTE(review): tokenizer and weights come from different checkpoints — confirm they share a vocabulary.
base_model = "distilroberta-base"
model_checkpoint = "climatebert/distilroberta-base-climate-f"

In [21]:
class climateFever(Dataset):
    """Tokenise claim/evidence pairs and expose them as PyTorch data loaders.

    On construction both frames are encoded once into ``TensorDataset``
    objects (``train_data`` / ``val_data``). Each item of those datasets is the
    4-tuple ``(token_ids, mask_ids, seg_ids, y)``.

    Although this class derives from ``Dataset`` it does not implement
    ``__len__`` / ``__getitem__``; use :meth:`get_data_loaders` to consume it.
    """

    def __init__(self, train_df, val_df):
        """Store the frames, load the tokenizer and encode both frames.

        Args:
            train_df: DataFrame with ``claim``, ``evidence`` and ``label`` columns.
            val_df: DataFrame with the same columns, used for validation.
        """
        # Kept for reference only: load_data uses the ``label`` column as-is
        # and never looks labels up in this mapping.
        self.label_dict = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2}

        self.train_df = train_df
        self.val_df = val_df

        # Not read anywhere in this class.
        self.base_path = '/content/'
        # The tokenizer name comes from the notebook-level ``base_model``.
        # NOTE(review): confirm do_lower_case has any effect for this tokenizer.
        self.tokenizer = AutoTokenizer.from_pretrained(base_model, do_lower_case=True)
        self.train_data = None
        self.val_data = None
        self.init_data()

    def init_data(self):
        """Encode the stored train and validation frames into tensor datasets."""
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)

    def load_data(self, df):
        """Encode every (claim, evidence) row of ``df`` into one ``TensorDataset``.

        The tokenizer is asked to encode each claim/evidence pair itself, so
        the special tokens follow the model's own pair layout, padding uses the
        tokenizer's pad id, and sequences longer than ``MAX_LEN`` tokens are
        truncated rather than passed on.

        Args:
            df: DataFrame with ``claim``, ``evidence`` and integer ``label`` columns.

        Returns:
            ``TensorDataset(token_ids, mask_ids, seg_ids, y)``. ``seg_ids`` are
            the tokenizer's ``token_type_ids`` when it produces them and all
            zeros otherwise.
        """
        MAX_LEN = 512

        premise_list = df['claim'].to_list()
        hypothesis_list = df['evidence'].to_list()
        label_list = df['label'].to_list()

        encoded = self.tokenizer(
            premise_list,
            hypothesis_list,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"]
        mask_ids = encoded["attention_mask"]

        # Tokenizers for models without segment embeddings (the model loaded in
        # this notebook has a single token-type embedding row) do not return
        # token_type_ids. Zeros are the only index that is always valid there;
        # the former hand-built 0/1 segments were out of range for that table.
        if "token_type_ids" in encoded:
            seg_ids = encoded["token_type_ids"]
        else:
            seg_ids = torch.zeros_like(token_ids)

        y = torch.tensor(label_list)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print(len(dataset))
        return dataset

    def get_data_loaders(self, batch_size=32, shuffle=True):
        """Wrap the encoded datasets in ``DataLoader`` objects.

        Args:
            batch_size: Number of examples per batch for both loaders.
            shuffle: Passed to both the training and the validation loader.

        Returns:
            ``(train_loader, val_loader)``.
        """
        train_loader = DataLoader(
            self.train_data,
            shuffle=shuffle,
            batch_size=batch_size
        )

        val_loader = DataLoader(
            self.val_data,
            shuffle=shuffle,
            batch_size=batch_size
        )

        return train_loader, val_loader


    # def __init__(self, ds, base_model):
    #     self.label_dict = {"entailment": 0, "contradiction": 1, "neutral": 2}

    #     # self.train_df = ds["train"]
    #     # self.val_df = ds["valid"]
    #     # # or:
    #     train_split_dict_list = [{k: v for k, v in item.items()} for item in ds["train"]]
    #     val_split_dict_list = [{k: v for k, v in item.items()} for item in ds["valid"]]
    #     self.train_df = pd.DataFrame(train_split_dict_list)[:500]
    #     self.val_df = pd.DataFrame(val_split_dict_list)[:100]
        
    #     self.base_path = "/content/"
    #     self.tokenizer = AutoTokenizer.from_pretrained(base_model, do_lower_case=True)
    #     self.train_data = None
    #     self.val_data = None
    #     self.init_data()

    # def init_data(self):
    #     self.train_data = self.load_data(self.train_df)
    #     self.val_data = self.load_data(self.val_df)
    
    # def load_data(self, df):
    #     MAX_LEN = 512
    #     token_ids = []
    #     mask_ids = []
    #     seg_ids = []
    #     y = []

    #     premise_list = df["claim"]#.to_list()
    #     hypothesis_list = df["evidence"]#.to_list()
    #     label_list = df["label"]#.to_list()

    #     for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
    #         premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
    #         hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
    #         pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
            
    #         premise_len = len(premise_id)
    #         hypothesis_len = len(hypothesis_id)

    #         segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
    #         attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

    #         token_ids.append(torch.tensor(pair_token_ids))
    #         seg_ids.append(segment_ids)
    #         mask_ids.append(attention_mask_ids)
    #         y.append(self.label_dict[label])
        
    #     token_ids = pad_sequence(token_ids, batch_first=True)
    #     mask_ids = pad_sequence(mask_ids, batch_first=True)
    #     seg_ids = pad_sequence(seg_ids, batch_first=True)
    #     y = torch.tensor(y)
    #     dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    #     print(len(dataset))
    #     return dataset
    
    # def get_data_loaders(self, batch_size=32, shuffle=True):
    #     train_loader = DataLoader(
    #         self.train_data,
    #         shuffle=shuffle,
    #         batch_size=batch_size
    #     )

    #     val_loader = DataLoader(
    #         self.val_data,
    #         shuffle=shuffle,
    #         batch_size=batch_size
    #     )

    #     return train_loader, val_loader


In [22]:
# Building the object tokenises both frames immediately and prints each dataset's size.
# processed_dataset = climateFever(ds, base_model)
processed_dataset = climateFever(train_df, val_df)
# Both loaders use batches of 8; shuffle is left at its default (True) for both.
train_loader, val_loader = processed_dataset.get_data_loaders(batch_size=8)

151
64


In [23]:
# Load the checkpoint with a 3-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)
# model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, num_labels=3)

# Move the model to the selected device; as the cell's last expression this also displays the module.
model.to(device)

Downloading (…)lve/main/config.json: 100%|██████████| 752/752 [00:00<00:00, 337kB/s]
Downloading pytorch_model.bin: 100%|██████████| 329M/329M [00:04<00:00, 77.5MB/s] 
Some weights of the model checkpoint at climatebert/distilroberta-base-climate-f were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initializ

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50500, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

: 

In [1]:
# Build two AdamW parameter groups: regular weights get weight decay, while
# biases and LayerNorm weights are excluded from it.
#
# The per-group option torch.optim.AdamW reads is "weight_decay". The former
# "weight_decay_rate" key was not one of its options, so both groups silently
# fell back to the optimizer's default decay. Likewise the former "gamma"/"beta"
# patterns do not occur in this model's parameter names; its normalisation
# layers are registered as "LayerNorm", so "LayerNorm.weight" is matched instead
# ("LayerNorm.bias" is already covered by "bias").
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5)

def count_parameters(model):
    """Return the total number of elements across ``model``'s trainable parameters.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    trainable_elements = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_elements += parameter.numel()
    return trainable_elements

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

def multi_acc(y_pred, y_test):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        y_pred: 2-D tensor of per-class scores, one row per example.
        y_test: 1-D tensor of target class indices, one per example.

    Returns:
        A 0-dimensional float tensor holding the accuracy for this batch.
    """
    # argmax is taken on the raw scores: log_softmax does not change which
    # class scores highest, so applying it first was wasted work.
    predicted = y_pred.argmax(dim=1)
    acc = (predicted == y_test).sum().float() / float(y_test.size(0))
    return acc

NameError: name 'model' is not defined

In [None]:
import time

EPOCHS = 5


def train(model, train_loader, val_loader, optimizer, epochs=None):
    """Fine-tune ``model`` and print loss/accuracy for both loaders after each epoch.

    Every batch must be the 4-tuple ``(pair_token_ids, mask_ids, seg_ids, y)``.
    Batches are moved to the notebook-level ``device`` and accuracy is computed
    with the notebook-level ``multi_acc``.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
            returning an object with ``.loss`` and ``.logits``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of passes over ``train_loader``. ``None`` (default)
            uses the module-level ``EPOCHS``.

    Returns:
        None. One summary line and one elapsed-time line are printed per epoch.
    """
    num_epochs = EPOCHS if epochs is None else epochs

    # Only forward segment ids to models that have more than one token-type
    # embedding. With a single-row table (as in the model this notebook loads)
    # any segment id other than 0 is an out-of-range index and the forward
    # pass fails, so the ids are withheld there.
    type_vocab_size = getattr(getattr(model, "config", None), "type_vocab_size", 1)
    use_segment_ids = bool(type_vocab_size) and type_vocab_size > 1

    def run_batch(batch):
        """Run one forward pass; return (loss, accuracy, number of examples)."""
        pair_token_ids, mask_ids, seg_ids, y = batch
        labels = y.to(device)
        inputs = {
            "input_ids": pair_token_ids.to(device),
            "attention_mask": mask_ids.to(device),
            "labels": labels,
        }
        if use_segment_ids:
            inputs["token_type_ids"] = seg_ids.to(device)
        outputs = model(**inputs)
        # Read the fields by name instead of relying on their order in the output.
        return outputs.loss, multi_acc(outputs.logits, labels), labels.size(0)

    for epoch in range(num_epochs):
        start = time.time()

        model.train()
        train_loss_sum = 0.0
        train_acc_sum = 0.0
        train_seen = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss, acc, batch_size = run_batch(batch)

            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the epoch average.
            train_loss_sum += loss.item() * batch_size
            train_acc_sum += acc.item() * batch_size
            train_seen += batch_size

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_acc = train_acc_sum / max(train_seen, 1)
        train_loss = train_loss_sum / max(train_seen, 1)

        model.eval()
        val_loss_sum = 0.0
        val_acc_sum = 0.0
        val_seen = 0
        with torch.no_grad():
            for batch in val_loader:
                loss, acc, batch_size = run_batch(batch)

                val_loss_sum += loss.item() * batch_size
                val_acc_sum += acc.item() * batch_size
                val_seen += batch_size

        val_acc = val_acc_sum / max(val_seen, 1)
        val_loss = val_loss_sum / max(val_seen, 1)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f"Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}")
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))


# Run the training loop defined above; per-epoch metrics and timings are printed as it goes.
train(model, train_loader, val_loader, optimizer)

In [None]:
# Inference Example

# model = AutoModelForSequenceClassification.from_pretrained(
#     "amandakonet/climatebert-fact-checking", use_auth_token=True
# )
# tokenizer = AutoTokenizer.from_pretrained(
#     "amandakonet/climatebert-fact-checking", use_auth_token=True
# )

# features = tokenizer(
#     ["Beginning in 2005, however, polar ice modestly receded for several years"],
#     ['Polar Discovery "Continued Sea Ice Decline in 2005'],
#     padding="max_length",
#     truncation=True,
#     return_tensors="pt",
#     max_length=512,
# )

# model.eval()
# with torch.no_grad():
#     scores = model(**features).logits
#     label_mapping = ["contradiction", "entailment", "neutral"]
#     labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
#     print(labels)