In [92]:
%load_ext autoreload
%autoreload

from src.early_stopping import EarlyStopping
from src.common import get_device, read_json, DATA_PATH, format_time
from src.quantemp_processor import QuantempProcessor

import time
import transformers
import torch
import logging
import os
from typing import Dict, Tuple
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch import nn

# Record the library versions in the cell output so a run can be reproduced later.
print("Transformers version:", transformers.__version__)
print("PyTorch version:", torch.__version__)

# os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes CUDA
# kernel launches synchronous (slower, but clearer stack traces). Confirm it is
# still wanted; presumably it has no effect when the selected device is not CUDA.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Only log errors from this point on.
logging.basicConfig(level=logging.ERROR)

# Device used for the model and for every batch in the training cell.
device = get_device()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Transformers version: 4.47.1
PyTorch version: 2.5.1
MPS: 1


In [94]:
# Load the raw train / validation claims from JSON files under DATA_PATH.
train_data = read_json(f"{DATA_PATH}/raw_data/train_claims_quantemp.json")
val_data = read_json(f"{DATA_PATH}/raw_data/val_claims_quantemp.json")

# Number of records in each split.
print(len(train_data))
print(len(val_data))

# Show the first training record to inspect the available fields
# (the notebook reads 'claim', 'doc' and presumably 'label' from each record).
train_data[0]

9935
3084


{'crawled_date': '2019-07-06',
 'country_of_origin': 'india',
 'label': 'False',
 'url': 'https://www.indiatoday.in/fact-check/story/fact-check-nirmala-sitharaman-35000-crore-led-bulbs-1563569-2019-07-06',
 'lang': 'en',
 'claim': 'In her budget speech, Nirmala Sitharaman claimed that the Government distributed 35,000 crore LED bulbs in the country.',
 'doc': 'Did Finance Minister Nirmala Sitharaman claim the government distributed 35,000 crore LED bulbs under the Ujala scheme? This would imply the Modi govt gave about 300 bulbs to every person in India. At least this is what is being claimed by some social media users who are sharing a screenshot from a news segment on business channel CNBC Awaaz. The photo shows Sitharaman delivering her budget speech while a caption at the bottom reads - "35,000 crore LED bulb baantein gaye" (35,000 crore LED bulbs were distributed). The snapshot gives the impression that Sitharaman said this sentence in her speech. Netizens are displaying shock at 

In [10]:
%autoreload

# Load the pretrained GPT-2 tokenizer.
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that `encode` below
# can pad every example with `padding='max_length'`.
tokenizer.pad_token = tokenizer.eos_token


def get_feature(fact: Dict[str, str]) -> str:
    """Build the model input text for one fact-check record.

    Args:
        fact: Record that must contain the keys ``"claim"`` and ``"doc"``.

    Returns:
        The claim followed by the evidence document, each introduced by its
        own textual marker, concatenated without any extra separator.
    """
    pieces = ["[Claim]:", fact["claim"], "[Evidences]:", fact["doc"]]
    return "".join(pieces)


def encode(feature: str) -> Tuple[torch.Tensor, torch.Tensor]:
    """Tokenize one feature string with the notebook-level ``tokenizer``.

    The text is truncated and padded to exactly 256 tokens and returned as
    PyTorch tensors.

    Args:
        feature: Text to tokenize.

    Returns:
        A pair ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoding = tokenizer.encode_plus(feature, **tokenizer_options)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    return input_ids, attention_mask


# Processor that is given the feature builder and the tokenizing function defined above.
data_processor = QuantempProcessor(get_feature, encode)

In [11]:
%autoreload
# Fit the processor on the training records and convert them into a dataset.
# NOTE(review): presumably this also fits `data_processor.label_encoder_` — confirm in QuantempProcessor.
train_dataset = data_processor.fit_transform(train_data)

100%|██████████| 9935/9935 [00:39<00:00, 248.82it/s]


In [12]:
%autoreload
# Convert the validation records with the already-fitted processor (no re-fitting).
val_dataset = data_processor.transform(val_data)

100%|██████████| 3084/3084 [00:11<00:00, 266.95it/s]


In [13]:
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# Validation batches are read sequentially; order does not matter for evaluation.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=validation_sampler)

In [40]:
class MultiClassClassifier(nn.Module):
    """GPT-2 backbone followed by a small MLP classification head.

    Args:
        model_path: Name or path passed to ``GPT2Model.from_pretrained``.
        labels_count: Number of output classes (size of the logits vector).
        mlp_dim: Width of the hidden layer of the MLP head.
        dropout: Dropout probability applied to the pooled representation.
        freeze_backbone: When True, the GPT-2 parameters are excluded from
            gradient updates and only the MLP head is trained.
    """

    def __init__(self, model_path, labels_count, mlp_dim, dropout=0, freeze_backbone=False):
        super().__init__()

        # NOTE(review): `forward` only reads the last hidden state; returning all
        # attentions and hidden states costs extra memory per batch. Kept as-is
        # in case they are inspected through `self.gpt2` elsewhere.
        self.gpt2 = transformers.GPT2Model.from_pretrained(
            model_path, output_attentions=True,
            output_hidden_states=True,
            attn_implementation='eager')

        hidden_dim = self.gpt2.config.hidden_size
        self.mlp = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, labels_count)
        )

        if freeze_backbone:
            for param in self.gpt2.parameters():
                param.requires_grad = False

    def forward(self, tokens, masks):
        """Return class logits of shape [batch_size, labels_count].

        Args:
            tokens: Token ids, shape [batch_size, seq_len].
            masks: Attention mask with non-zero entries for real tokens and
                zeros for padding, shape [batch_size, seq_len]. May be None,
                in which case the final position of every sequence is pooled.
        """
        backbone_output = self.gpt2(tokens, attention_mask=masks)
        last_hidden_state = backbone_output[0]  # Shape: [batch_size, seq_len, hidden_size]
        batch_size, seq_len = last_hidden_state.shape[0], last_hidden_state.shape[1]
        target_device = last_hidden_state.device

        # GPT-2 is a causal model: position i only attends to positions <= i, so
        # the first position never sees the rest of the text. Pool the hidden
        # state of the LAST attended token instead, which has seen the whole
        # (unpadded) sequence.
        if masks is None:
            last_token_idx = torch.full(
                (batch_size,), seq_len - 1, dtype=torch.long, device=target_device)
        else:
            positions = torch.arange(seq_len, device=target_device)
            attended = (masks.to(target_device) != 0).long()
            # Largest position whose mask is set; works for both right and left
            # padding and falls back to 0 for an all-padding row.
            last_token_idx = (attended * positions).max(dim=1).values

        batch_idx = torch.arange(batch_size, device=target_device)
        pooled = last_hidden_state[batch_idx, last_token_idx]  # Shape: [batch_size, hidden_size]

        mlp_output = self.mlp(pooled)
        return mlp_output


# One output logit per class known to the fitted label encoder.
num_classes = len(data_processor.label_encoder_.classes_)
model = MultiClassClassifier("gpt2", num_classes, mlp_dim=1024, dropout=0.1)

# Smoke test on the first 16 training examples (run before moving the model to
# `device`, so model and inputs are on the same device). Call the module itself
# rather than `.forward` so hooks run, and skip autograd bookkeeping because the
# result is only inspected, never back-propagated.
with torch.no_grad():
    output = model(train_dataset.tensors[0][:16], train_dataset.tensors[1][:16])
labels = train_dataset.tensors[2][:16]

model = model.to(device)

In [26]:
# Upper bound on the number of training epochs (early stopping may end the run sooner).
epochs = 20

# NOTE(review): the optimizer captures `model.parameters()` at the time this cell
# runs; re-run this cell whenever the model cell is re-run, otherwise the
# optimizer keeps updating the parameters of the previous model object.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# One scheduler step per training batch over all epochs, with no warm-up.
# NOTE(review): the training cell currently has `scheduler.step()` commented out,
# so this schedule is created but never applied.
total_steps = len(train_dataloader) * epochs
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

In [42]:
# for param in model.gpt2.encoder.layer[0:5].parameters():
#     param.requires_grad=False

In [43]:
# Cross-entropy loss applied to the classifier outputs and the label tensor of each batch.
loss_func = nn.CrossEntropyLoss()


In [46]:
import random
import os
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose argmax equals the label.

    Defined here because the notebook uses it below but never defines or
    imports it, so the cell failed with NameError on a fresh kernel.

    Args:
        preds: NumPy array of per-class scores, shape [batch_size, num_classes].
        labels: NumPy array of integer class ids, one per row of `preds`.

    Returns:
        Accuracy in [0, 1]; 0.0 for an empty batch.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    if len(labels_flat) == 0:
        return 0.0
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def _run_training_epoch(model, dataloader, optimizer, loss_func, device, log_every=40):
    """Train `model` for one pass over `dataloader`; return (avg loss, avg batch accuracy)."""
    epoch_t0 = time.time()
    total_loss = 0
    total_accuracy = 0

    # Put the model into training mode. Don't be misled--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    for step, batch in enumerate(dataloader):

        # Progress update every `log_every` batches.
        if step % log_every == 0 and step != 0:
            elapsed = format_time(time.time() - epoch_t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        # Copy each of them to the target device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Forward pass: the model returns raw (unnormalised) class logits.
        logits = model(b_input_ids, b_input_mask)

        # `loss` is a Tensor containing a single value; `.item()` returns the
        # Python number so it can be accumulated without keeping the graph.
        loss = loss_func(logits, b_labels)
        total_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Gradient clipping is intentionally left disabled, as in the original cell.
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters using the computed gradients.
        optimizer.step()

        # The learning-rate schedule is intentionally left disabled, as in the
        # original cell, so the learning rate stays constant.
        # scheduler.step()

        total_accuracy += flat_accuracy(logits.detach().cpu().numpy(), b_labels.to('cpu').numpy())

    batch_count = max(len(dataloader), 1)  # avoid ZeroDivisionError on an empty loader
    return total_loss / batch_count, total_accuracy / batch_count


def _run_validation_epoch(model, dataloader, loss_func, device):
    """Evaluate `model` on `dataloader`; return (avg loss, avg batch accuracy)."""
    total_loss = 0
    total_accuracy = 0

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No compute graph is needed here, since nothing is back-propagated.
        with torch.no_grad():
            logits = model(b_input_ids, b_input_mask)
            loss = loss_func(logits, b_labels)
        total_loss += loss.item()

        total_accuracy += flat_accuracy(logits.detach().cpu().numpy(), b_labels.to('cpu').numpy())

    batch_count = max(len(dataloader), 1)  # avoid ZeroDivisionError on an empty loader
    return total_loss / batch_count, total_accuracy / batch_count


# Set the seed value all over the place to make this reproducible.
# NOTE(review): the model and dataloaders are created in earlier cells, before
# these seeds are set, so the classifier head initialisation is not covered.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# Directory that receives the tokenizer and the model weights after each epoch.
output_dir = 'finqa_roberta_claimdecomp_continued/'

# Measure the total training time for the whole run.
total_t0 = time.time()
early_stopping = EarlyStopping(patience=3, verbose=True)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    avg_train_loss, avg_train_accuracy = _run_training_epoch(
        model, train_dataloader, optimizer, loss_func, device)
    print(" Train Accuracy: {0:.2f}".format(avg_train_accuracy))

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    avg_val_loss, avg_val_accuracy = _run_validation_epoch(
        model, validation_dataloader, loss_func, device)
    validation_time = format_time(time.time() - t0)

    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record the statistics BEFORE the early-stopping check so that the epoch
    # that triggers the stop is not missing from `training_stats`.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # As in the original cell, weights are written after every epoch that does
    # not trigger early stopping (the file is overwritten each time).
    os.makedirs(output_dir, exist_ok=True)

    print("Saving model to %s" % output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(model.state_dict(), os.path.join(output_dir, 'model_weights'))

    # !rm -rf "/content/drive/My Drive/ecir_compnumfacts/finqa_roberta_claimdecomp_continued"
    # !mv finqa_roberta_claimdecomp_continued "/content/drive/My Drive/ecir_compnumfacts/"

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))


Training...
  Batch    40  of    621.    Elapsed: 1:14:24.
  Batch    80  of    621.    Elapsed: 2:31:59.
  Batch   120  of    621.    Elapsed: 3:55:02.
  Batch   160  of    621.    Elapsed: 5:29:24.
  Batch   200  of    621.    Elapsed: 7:01:18.


KeyboardInterrupt: 

In [49]:
# Persist the tokenizer and the current model weights.
# NOTE(review): the tokenizer goes to "models/..." while the weights go to
# "../models/..." — confirm which location is intended and align the two.
tokenizer.save_pretrained("models/gpt2-ft-tokenizer")

# torch.save does not create missing directories, so make sure the target exists.
weights_dir = "../models/"
os.makedirs(weights_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(weights_dir, 'model_weights'))

# !rm -rf "/content/drive/My Drive/ecir_compnumfacts/finqa_roberta_claimdecomp_continued_early_stop"
# !mv finqa_roberta_claimdecomp_continued_early_stop "/content/drive/My Drive/ecir_compnumfacts/"

In [None]:
# Map the integer class ids back to their original label names. `LE` was never
# defined in this notebook; the fitted encoder lives on the data processor.
data_processor.label_encoder_.inverse_transform([0, 1, 2])

In [None]:


from huggingface_hub import login

In [None]:
# Authenticate against the Hugging Face Hub using `huggingface_hub.login`.
login()

In [None]:
# Label names known to the fitted encoder. `LE` was never defined in this
# notebook; the encoder is reached through the data processor.
data_processor.label_encoder_.classes_