In [None]:
import os
import torch
import transformers
import logging
from transformers import AutoTokenizer, AutoModel, RobertaTokenizer
import json
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from torch import nn
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup, AutoModelForSeq2SeqLM
import numpy as np
import time
import datetime
import random

# Set up model

In [None]:
# Force synchronous CUDA kernel launches (a debugging aid) and keep library
# logging down to errors only.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
logging.basicConfig(level=logging.ERROR)

# Select the compute device once: the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Load the tokenizer that ships with the google/flan-t5-base checkpoint; the
# same checkpoint name is used for the classifier backbone further down.
print('Loading t5 tokenizer...')

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

Loading t5 tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

# Import data

In [None]:
# This dataset is for the decomp training later!
# with open("../../data/raw_data/train_claimdecomp_evidence_question_mapping.json") as f:
#   train_data = json.load(f)

# len(train_data), train_data[2]

In [None]:
# Read the training claims. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's default locale.
with open("train_claims_quantemp.json", encoding="utf-8") as f:
  train_data = json.load(f)

# Show the number of records and the last one as a sanity check.
len(train_data), train_data[-1]

(9935,
 {'crawled_date': '2014-03-26T10:38:09',
  'country_of_origin': 'ukraine',
  'label': 'False',
  'url': 'https://www.stopfake.org/en/fake-commandos-from-berkut-who-refused-to-kneel-have-been-burned-alive-in-lviv/',
  'lang': 'en',
  'claim': 'FAKE:  Commandos from &#8220;Berkut&#8221; who refused to kneel have been burned alive in Lviv',
  'doc': 'The Russian TV channel ‚ÄúRussia 1‚Äù aired a program called ‚ÄúEvil spirits of Maydan: mystic of Ukrainian mayhem‚Äù. The program, among other things, referred to the claim that two soldiers of ‚ÄúBerkut‚Äù, who refused to kneel in front of Lviv Maydan and recognize the current government, allegedly were burned alive. https://www.youtube.com/watch?v=SUDH0Qbjuao This was reported by the head of the so-called Russian community of Dnepropetrovsk Victor Trukhov. He says, two ‚ÄúBerkut‚Äù solders were put on their knees publicly and then burned in Lviv. However, contrary to this claim, a fire occured in Lviv on February 20, 2014 where peop

In [None]:
# Read the validation claims. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's default locale.
with open("val_claims_quantemp.json", encoding="utf-8") as f:
  val_data = json.load(f)
# Show the number of records and the last one as a sanity check.
len(val_data), val_data[-1]

(3084,
 {'crawled_date': '2022-10-06T21:00:06',
  'country_of_origin': 'usa',
  'label': 'True',
  'url': 'https://www.politifact.com/factchecks/2021/oct/28/randy-feenstra/biden-administration-predicted-liquid-fuel-cars-ou/',
  'lang': 'en',
  'claim': 'The Biden administration "published a study concluding 4 (of) 5 new cars on the road by 2050 will still require liquid fuels."',
  'doc': 'President Joe Biden was in Michigan‚Äôs auto industry hub on Oct. 5 when he said, "the whole world knows that the future of the auto industry is electric." Rep. Randy Feenstra, R-Iowa, had a quick response, writing on Twitter: ".@POTUS no it‚Äôs not ‚Äî in fact, your own administration published a study concluding 4/5 new cars on the road by 2050 will still require liquid fuels ... "It‚Äôs past time Biden lives up to his promise to expand clean-burning #biofuels. Don‚Äôt mess with the RFS!" Feenstra is correct about the share of cars in the United States projected to use liquid fuels. The U.S. Energy

# Extract Claims, Evidence, & Labels from data

In [None]:
# Shared encoder that maps the string labels to integer class ids; it is fitted
# on the training labels and reused for the validation labels below.
LE = LabelEncoder()

In [None]:
def get_features(data):
  """Build one model-input string per fact.

  Args:
    data: iterable of dicts, each providing a "claim" and a "doc" string.

  Returns:
    A list with one string per fact, in input order: the text "[Claim]:"
    followed by the claim, a newline, "[Evidences]:" and the document text.
  """
  return [
    "[Claim]:" + fact["claim"] + "\n[Evidences]:" + fact["doc"]
    for fact in data
  ]

In [None]:
# Turn every training record into a single "[Claim]:...\n[Evidences]:..." string.
train_features = get_features(train_data)
len(train_features), train_features[-1]

(9935,
 '[Claim]:FAKE:  Commandos from &#8220;Berkut&#8221; who refused to kneel have been burned alive in Lviv\n[Evidences]:The Russian TV channel ‚ÄúRussia 1‚Äù aired a program called ‚ÄúEvil spirits of Maydan: mystic of Ukrainian mayhem‚Äù. The program, among other things, referred to the claim that two soldiers of ‚ÄúBerkut‚Äù, who refused to kneel in front of Lviv Maydan and recognize the current government, allegedly were burned alive. https://www.youtube.com/watch?v=SUDH0Qbjuao This was reported by the head of the so-called Russian community of Dnepropetrovsk Victor Trukhov. He says, two ‚ÄúBerkut‚Äù solders were put on their knees publicly and then burned in Lviv. However, contrary to this claim, a fire occured in Lviv on February 20, 2014 where people from security forces were caught in a fire. The fire started after a powerful explosion in the security forces basis, after which one officer in uniform and one in civilian clothes were pulled from the rubble. Commandos from ‚ÄúB

In [None]:
# Same feature construction for the validation records.
val_features = get_features(val_data)
len(val_features), val_features[-1]

(3084,
 '[Claim]:The Biden administration "published a study concluding 4 (of) 5 new cars on the road by 2050 will still require liquid fuels."\n[Evidences]:President Joe Biden was in Michigan‚Äôs auto industry hub on Oct. 5 when he said, "the whole world knows that the future of the auto industry is electric." Rep. Randy Feenstra, R-Iowa, had a quick response, writing on Twitter: ".@POTUS no it‚Äôs not ‚Äî in fact, your own administration published a study concluding 4/5 new cars on the road by 2050 will still require liquid fuels ... "It‚Äôs past time Biden lives up to his promise to expand clean-burning #biofuels. Don‚Äôt mess with the RFS!" Feenstra is correct about the share of cars in the United States projected to use liquid fuels. The U.S. Energy Information Administration‚Äôs 2021 Annual Energy Outlook report, which projects the nation‚Äôs environmental plans through 2050, says about 79% of new vehicle sales will be powered by liquid fuels ‚Äî gasoline and blends that include 

In [None]:
# Pull the raw string label out of every record, keeping record order so the
# labels stay aligned with the feature lists built above.
train_labels = [fact["label"] for fact in train_data]
val_labels = [fact["label"] for fact in val_data]
len(train_labels), train_labels[-1], len(val_labels), val_labels[-1]

(9935, 'False', 3084, 'True')

In [None]:
# Fit the label -> integer id mapping on the training labels and encode them.
train_labels_final = LE.fit_transform(train_labels)
len(train_labels_final), train_labels_final[:20]

(9935, array([1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1, 0, 1, 2, 2, 1, 1, 0]))

In [None]:
# Encode the validation labels with the mapping fitted on the training labels
# (`transform`, not `fit_transform`), so both splits share the same class ids.
val_labels_final = LE.transform(val_labels)
len(val_labels_final), val_labels_final[:20]

(3084, array([1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1]))

In [None]:
# Convert the encoded label arrays to tensors so they can be packed into the
# TensorDatasets built later in the notebook.
train_labels_final = torch.tensor(train_labels_final)
val_labels_final = torch.tensor(val_labels_final)
train_labels_final.shape, val_labels_final.shape

(torch.Size([9935]), torch.Size([3084]))

In [None]:
# Count the distinct label strings seen in the training split and show them.
num_classes = len(set(train_labels))
num_classes, list(set(train_labels))

(3, ['False', 'True', 'Conflicting'])

# Tokenize the data

In [None]:
# Tokenize every training example in a single batched call. The tokenizer will:
#   (1) split each text into sub-word tokens and map them to ids,
#   (2) add the model's special tokens (for T5 this is the end-of-sequence
#       token; there is no [CLS]/[SEP]),
#   (3) truncate to `max_length` and pad shorter inputs up to `max_length`,
#   (4) build the attention mask that separates real tokens from padding.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encoded_dict = tokenizer(
    train_features,                 # All training texts at once.
    add_special_tokens=True,
    max_length=256,                 # Pad & truncate every example to 256 ids.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',            # One (num_examples, 256) tensor per field.
)

input_ids = encoded_dict['input_ids']
attention_masks = encoded_dict['attention_mask']


# Print sentence 0, now as a list of IDs.
print('Original: ', train_features[0])
print('Token IDs:', input_ids[0])



Original:  [Claim]:In her budget speech, Nirmala Sitharaman claimed that the Government distributed 35,000 crore LED bulbs in the country.
[Evidences]:Did Finance Minister Nirmala Sitharaman claim the government distributed 35,000 crore LED bulbs under the Ujala scheme? This would imply the Modi govt gave about 300 bulbs to every person in India. At least this is what is being claimed by some social media users who are sharing a screenshot from a news segment on business channel CNBC Awaaz. The photo shows Sitharaman delivering her budget speech while a caption at the bottom reads - "35,000 crore LED bulb baantein gaye" (35,000 crore LED bulbs were distributed). The snapshot gives the impression that Sitharaman said this sentence in her speech. Netizens are displaying shock at this whopping number believing that the finance minister's statement is true. Some Congress leaders are also trolling her by sharing the screenshot of the news channel. But, India Today Anti Fake News War Room (A

In [None]:
# Tokenize every validation example in a single batched call, with exactly the
# same settings as the training split. The tokenizer will:
#   (1) split each text into sub-word tokens and map them to ids,
#   (2) add the model's special tokens (for T5 this is the end-of-sequence
#       token; there is no [CLS]/[SEP]),
#   (3) truncate to `max_length` and pad shorter inputs up to `max_length`,
#   (4) build the attention mask that separates real tokens from padding.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encoded_dict = tokenizer(
    val_features,                   # All validation texts at once.
    add_special_tokens=True,
    max_length=256,                 # Pad & truncate every example to 256 ids.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',            # One (num_examples, 256) tensor per field.
)

val_input_ids = encoded_dict['input_ids']
val_attention_masks = encoded_dict['attention_mask']


# Print sentence 0, now as a list of IDs.
print('Original: ', val_features[0])
print('Token IDs:', val_input_ids[0])

Original:  [Claim]:Amit Shah said Narendra Modi sleeps for 24 hours for the welfare of the poor.
[Evidences]:The India Today Anti-Fake News War Room found the viral video of Amit Shah's statement was clipped and presented out of context. A short video clip of Union Home Minister Amit Shah has gone viral with the claim that at a political rally, he said that Prime Minister Narendra Modi sleeps 24 hours for the welfare of the poor. Several Twitter and Facebook users shared this video clip with captions like, ‚ÄúModi ji sleeps for 24 hours‚Äù. The India Today Anti-Fake News War Room ( AFWA) found the viral video was clipped and presented out of context to give it a different meaning. In the original video, Shah can be heard saying that PM Modi thinks about the welfare of the poor 24 hours a day while ‚ÄúDidi‚Äù (Mamta Banerjee) wonders when her nephew would become the Chief Minister. Shah made the statement while addressing a public meeting in Chapra, West Bengal, in April 2021. The viral

# Create the DataLoaders from tokenized data

In [None]:
# train_poincare_tensor = torch.tensor(poincare_embeddings_final,dtype=torch.float)
# difficulty_tensor = torch.tensor(difficulty_level_vectors,dtype=torch.float)
# Bundle (input ids, attention mask, label) per example for each split; the
# training loop unpacks batches in exactly this order.
dataset = TensorDataset(input_ids, attention_masks, train_labels_final)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels_final)

In [None]:
# Batch both splits with the same batch size: training batches are drawn in
# random order, validation batches in dataset order.
batch_size = 16
train_dataloader = DataLoader(
            dataset,  # The training samples.
            sampler = RandomSampler(dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

# Set up the model architecture

In [None]:
class MultiClassClassifier(nn.Module):
    """Transformer backbone followed by a small MLP classification head.

    When the model path contains "t5", only the encoder stack of the loaded
    model is kept and the sequence representation is the attention-mask-weighted
    mean of the encoder's token states. Any other backbone contributes its
    `pooler_output`.

    Args:
        bert_model_path: Hugging Face model name or local path of the backbone.
        labels_count: number of output classes (size of the returned logit vector).
        hidden_dim: width of the backbone representation fed into the MLP.
        mlp_dim: width of the MLP's single hidden layer.
        extras_dim: unused; kept so existing call sites keep working.
        dropout: dropout probability applied to the pooled representation.
        freeze_bert: if True, every backbone parameter is frozen.
    """

    def __init__(self, bert_model_path, labels_count, hidden_dim=768, mlp_dim=500, extras_dim=100, dropout=0.1, freeze_bert=False):
        super().__init__()
        self.base_model = bert_model_path
        self.roberta = AutoModel.from_pretrained(bert_model_path, output_hidden_states=True, output_attentions=True)
        if "t5" in self.base_model:
            # Only the encoder half of the loaded model is used for classification.
            self.roberta = self.roberta.encoder
        self.dropout = nn.Dropout(dropout)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, labels_count)
        )
        if freeze_bert:
            print("Freezing layers")
            for param in self.roberta.parameters():
                param.requires_grad = False

    @staticmethod
    def _masked_mean(hidden_states, masks):
        """Average token states over the sequence axis, ignoring masked-out positions."""
        if masks is None:
            return hidden_states.mean(dim=1)
        weights = masks.unsqueeze(-1).to(hidden_states.dtype)
        # Guard against an all-zero mask so the division stays finite.
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        return (hidden_states * weights).sum(dim=1) / token_counts

    def forward(self, tokens, masks):
        """Return raw (un-normalised) class logits for a batch.

        Args:
            tokens: token-id tensor passed to the backbone.
            masks: attention mask passed to the backbone (1 = real token,
                0 = padding), also used to weight the T5 mean pooling.

        Returns:
            The output of the MLP head, one row of `labels_count` logits per example.
        """
        output = self.roberta(tokens, attention_mask=masks)
        if "t5" in self.base_model:
            # Padding positions are excluded from the mean, so a padded batch
            # (training) and an unpadded single input (inference) produce the
            # same pooled vector for the same text.
            # NOTE: weights trained with the previous plain mean over padded
            # batches saw slightly different pooled inputs for short texts.
            pooled = self._masked_mean(output[0], masks)
        else:
            pooled = output["pooler_output"]
        return self.mlp(self.dropout(pooled))

In [None]:
# Build the flan-t5 encoder + MLP classifier with one logit per label class
# found in the training data.
model = MultiClassClassifier('google/flan-t5-base', num_classes)

# model.load_state_dict(torch.load("model_bert_difficulty_prediction/model_weights"))

# Move the model to the device selected at the top of the notebook, so the
# CPU fallback keeps working when no GPU is present.
model.to(device)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

MultiClassClassifier(
  (roberta): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=7

# Set up the training optimizer and learning rate schedule

In [None]:
# PyTorch's own AdamW replaces the deprecated `transformers.AdamW`.
# `weight_decay = 0.0` keeps the behaviour of the previous optimizer, whose
# default was no weight decay (torch's default would be 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 1e-4, # t5 needs a higher learning rate than normal
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
                  weight_decay = 0.0
                )



In [None]:
# Upper bound on the number of passes over the training set; early stopping in
# the training loop may end the run sooner.
epochs = 20

# Total number of training steps is [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

# Linear decay from the initial learning rate to 0 over `total_steps`, no warmup.
# NOTE: the training loop has its `scheduler.step()` call commented out, so this
# schedule is constructed but never advanced there.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Set up training code

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class ids; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string,
    rounded to the nearest whole second.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` is the spelling that exists in every NumPy release; the
        # `np.Inf` alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result.

        Saves a checkpoint when the loss is at least as good as the best seen
        so far (shifted by `delta`); otherwise increments the patience counter
        and sets `early_stop` once it reaches `patience`.

        Args:
            val_loss (float): validation loss of the epoch just finished.
            model: module whose `state_dict()` is checkpointed on improvement.
        """
        # Work with the negated loss so that "larger is better".
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
# Freeze encoder blocks 0-4 so only the later blocks and the MLP head receive
# gradient updates during fine-tuning.
for param in model.roberta.block[0:5].parameters():
    param.requires_grad=False # freeze the first 5 layers of the t5 model

In [None]:
# Cross-entropy over the raw logits returned by the classifier (the model
# applies no softmax itself) against the integer class ids.
loss_func = nn.CrossEntropyLoss()

# Fine-tune the model

In [None]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator used from this point on (batch shuffling,
# dropout) so the run is repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# The tokenizer and the most recent weights are written here after each epoch.
# The directory is created once, up front, instead of being re-checked per epoch.
output_dir = 'flan-t5-base/'
os.makedirs(output_dir, exist_ok=True)

# Measure the total training time for the whole run.
total_t0 = time.time()

# Keeps the best-validation-loss weights in its own checkpoint file and stops
# the run after 2 consecutive epochs without improvement.
early_stopping = EarlyStopping(patience=2, verbose=True, path="checkpoint_flan-t5-base.pt")

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the running totals for this epoch.
    total_train_accuracy = 0
    total_train_loss = 0

    # Put the model into training mode. Don't be misled--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Time elapsed since the start of this epoch, as h:mm:ss.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader and copy each tensor
        # to the selected device.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # Despite the name, `probas` holds raw logits.
        probas = model(b_input_ids, b_input_mask)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        loss = loss_func(probas, b_labels)
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Gradient clipping is currently disabled:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # The learning-rate schedule is currently disabled, so the learning
        # rate stays at its initial value:
        # scheduler.step()

        # Accumulate the per-batch training accuracy.
        logits = probas.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logits, label_ids)

    # Mean of the per-batch accuracies.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print(" Train Accuracy: {0:.2f}".format(avg_train_accuracy))

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Same batch layout as in training:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions and the batch loss.
            logits = model(b_input_ids, b_input_mask)
            loss = loss_func(logits, b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches and let the
    # early-stopping helper checkpoint the model if it improved.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    early_stopping(avg_val_loss, model)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch. This happens before the
    # early-stop check so the final epoch is logged and recorded as well.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Save the tokenizer and the weights of the epoch just finished. This file
    # always holds the latest epoch; the best-validation-loss weights are in
    # the early-stopping checkpoint.
    print("Saving model to %s" % output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(model.state_dict(), os.path.join(output_dir, 'model_weights'))

    # !rm -rf "/content/drive/My Drive/DSAIT4090_FinalProject/code/flan-t5/flan-t5"
    # !mv flan-t5 "/content/drive/My Drive/DSAIT4090_FinalProject/code/flan-t5"

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    621.    Elapsed: 0:00:32.
  Batch    80  of    621.    Elapsed: 0:01:01.
  Batch   120  of    621.    Elapsed: 0:01:31.
  Batch   160  of    621.    Elapsed: 0:02:02.
  Batch   200  of    621.    Elapsed: 0:02:33.
  Batch   240  of    621.    Elapsed: 0:03:04.
  Batch   280  of    621.    Elapsed: 0:03:36.
  Batch   320  of    621.    Elapsed: 0:04:07.
  Batch   360  of    621.    Elapsed: 0:04:38.
  Batch   400  of    621.    Elapsed: 0:05:09.
  Batch   440  of    621.    Elapsed: 0:05:40.
  Batch   480  of    621.    Elapsed: 0:06:11.
  Batch   520  of    621.    Elapsed: 0:06:42.
  Batch   560  of    621.    Elapsed: 0:07:13.
  Batch   600  of    621.    Elapsed: 0:07:44.
 Train Accuracy: 0.65

  Average training loss: 0.76
  Training epcoh took: 0:08:00

Running Validation...
  Accuracy: 0.70
Validation loss decreased (inf --> 0.653976).  Saving model ...
  Validation Loss: 0.65
  Validation took: 0:00:56
Saving model to flan-t5-base/

Training...


# Run inference

In [None]:
class VeracityClassifier:
    """Predicts a veracity verdict for a claim paired with its evidence text."""

    # Class id -> verdict name. The ids are the ones produced by the label
    # encoding used for training (0 = Conflicting, 1 = False, 2 = True).
    _LABELS = {0: "CONFLICTING", 1: "REFUTES", 2: "SUPPORTS"}

    def __init__(self, base_model, model_name: str = None) -> None:
        """Load the tokenizer, build the classifier and restore its weights.

        Args:
            base_model: the backbone model to load from.
            model_name (str): path to the saved `state_dict` of the classifier.
        """
        # NOTE: the tokenizer is fixed to flan-t5-base regardless of `base_model`.
        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = MultiClassClassifier(base_model, 3)
        print(self.model)
        # `map_location` lets weights saved on a GPU be restored on a CPU-only machine.
        self.model.load_state_dict(torch.load(model_name, map_location=self.device))
        self.model.to(self.device)
        # Inference mode: without this the classifier head's dropout stays
        # active and predictions become non-deterministic.
        self.model.eval()

    def predict(self, input: str, max_length: int = 256) -> "tuple[str, torch.Tensor]":
        """Predict the veracity label for one claim-with-evidence string.

        Args:
            input (str): claim with evidences.
            max_length (int, optional): max length of sequence; longer inputs
                are truncated. Defaults to 256.

        Returns:
            A `(label, probs)` pair: `label` is one of "SUPPORTS", "REFUTES"
            or "CONFLICTING", and `probs` is the softmax over the class logits
            as a CPU tensor.
        """

        print("claim", input)

        x = self.tokenizer.encode_plus(
            input,
            return_tensors="pt",
            return_attention_mask=True,
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            logits = self.model(
                x["input_ids"].to(self.device),
                x["attention_mask"].to(self.device),
            )
            print(logits.shape)

        # Bring the probabilities back to the CPU so callers get the same kind
        # of tensor whichever device ran the model.
        probs = logits.softmax(dim=1).cpu()
        print(probs)
        label_index = int(probs.argmax(dim=1).item())

        return self._LABELS[label_index], probs


In [None]:
import sys
import json
import argparse
from typing import Dict, List
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import os
from torch import Tensor
from sentence_transformers import SentenceTransformer

In [None]:
def get_verification(config):
    """Classify every test claim and persist the predicted verdicts.

    Reads the claims from ``config["test_path"]``, builds a
    ``VeracityClassifier`` from ``config["base_model"]`` and
    ``config["model_path"]``, predicts one verdict per claim while printing
    the running accuracy against ``fact["label"]``, and finally writes
    ``<output_path>.csv`` (claim/verdict table) and ``<output_path>.json``
    (per-claim result records).
    """
    with open(config["test_path"]) as test_file:
        facts = json.load(test_file)

    weights_path = config["model_path"]
    nli_model = VeracityClassifier(
        base_model=config["base_model"], model_name=weights_path
    )

    # Classifier label -> verdict string compared against fact["label"].
    label_to_verdict = {
        "SUPPORTS": "True",
        "REFUTES": "False",
        "CONFLICTING": "Conflicting",
    }

    records = []
    claims = []
    predicted = []
    correct = 0
    seen = 0
    for fact in facts:
        record = {"evidences": [], "claim": fact["claim"]}
        claims.append(fact["claim"])

        prompt = "[Claim]: " + fact["claim"]
        prompt = prompt + "\n[Evidences]:" + fact["doc"]

        pred_label, _ = nli_model.predict(prompt, max_length=256)
        print("pred_label", pred_label)
        # Only overwrite the verdict for a recognised label.
        if pred_label in label_to_verdict:
            verdict = label_to_verdict[pred_label]

        print("Verdict:", verdict)
        predicted.append(verdict)
        records.append(record)

        seen += 1
        if verdict == fact["label"]:
            correct += 1
        print("accuracy", correct / seen)

    verdict_1 = pd.DataFrame({"claim": claims, "verdict": predicted})
    print(verdict_1)

    output_path = config["output_path"]
    verdict_1.to_csv(f"{output_path}.csv", index=False)
    with open(f"{output_path}.json", "w") as out_file:
        json.dump(records, out_file, indent=4, sort_keys=True)

In [None]:
# Settings consumed by get_verification: backbone identifier, saved weights,
# test claims file, and the stem used for the .csv/.json outputs.
config = dict(
    base_model="google/flan-t5-base",
    model_path="model_weights",
    test_path="test_claims_quantemp.json",
    output_path="flan-t5-base-eval",
)

get_verification(config)

MultiClassClassifier(
  (roberta): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=7

  self.model.load_state_dict(torch.load(model_name))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor([[0.6040, 0.1155, 0.2806]])
pred_label CONFLICTING
Verdict: Conflicting
accuracy 0.6848582129481006
claim [Claim]: 80 doctors who received Covid-19 shots died in last 60 days
[Evidences]:"80 doctors in Canada between the ages of 25 and 55 have died in the last 60 days," says text above a video shared in a December 21, 2022 Instagram post from Michelle Lindsay, a leader for the People's Party of Canada in Halifax who ran for Parliament in 2021. Other posts sharing the same video can be found on Instagram -- the latest in a series of unproven claims attempting to tie the deaths of Canadian doctors to Covid-19 shots. The posts do not identify the speaker in the video, but AFP searched the phrase "80 Canadian doctors died" and found a tweet indicating the man is Chris Shoemaker, a family doctor in Ontario who regularly posts videos making inaccurate claims about Covid-19 vaccinations. The video was filmed at a protest 

In [None]:
# model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
# dir_path = os.path.dirname(os.path.realpath(os.getcwd()))

# def get_top_k_similar_instances(
#     sentence: str, data_emb: Tensor, data: List[Dict],
#     k: int, threshold: float
# ) -> List[Dict]:
#     """get top k neighbours for a sentence.

#     Args:
#         sentence (str): input
#         data_emb (Tensor): corpus embeddings
#         data (List[Dict]): corpus
#         k (int): top_k to return
#         threshold (float):

#     Returns:
#         List[Dict]: list of top_k data points
#     """
#     sent_emb = model.encode(sentence)
#     # data_emb = self.get_embeddings_for_data(transfer_questions)
#     print("new_emb", sent_emb.shape, data_emb.shape)
#     text_sims = cosine_similarity(data_emb, [sent_emb]).tolist()
#     results_sims = zip(range(len(text_sims)), text_sims)
#     sorted_similarities = sorted(
#         results_sims, key=lambda x: x[1], reverse=True)
#     print("text_sims", sorted_similarities[:2])
#     top_questions = []
#     for idx, item in sorted_similarities[:k]:
#         if item[0] > threshold:
#             top_questions.append(list(data)[idx])
#     return top_questions

# def get_verification(config):
#     """Get veracity predictions."""
#     with open(config["bm25_evidence_path"]) as f:
#         data = json.load(f)
#     with open (config["test_path"]) as f:
#         facts = json.load(f)

#     decomposed_questions = pd.read_csv(
#         config["questions_path"], sep="@"
#     )

#     model_name = config["model_path"]

#     nli_model = VeracityClassifier(
#         base_model=config["base_model"], model_name=model_name
#     )
#     results = []
#     matches = 0
#     unmatches = 0
#     verdicts = {"claim": [], "verdict": []}
#     print("Questions:",decomposed_questions)
#     for index, fact in enumerate(facts):
#         assert decomposed_questions.shape[0] == len(facts)
#         assert data[index]["claim"] == fact["claim"]
#         if decomposed_questions.iloc[index]["claims"] != fact["claim"]:
#             print(
#                 "not equal", decomposed_questions.iloc[index]["claims"], fact["claim"]
#             )

#         questions = decomposed_questions.iloc[index]["questions"]
#         questions = questions.lower().split("next question:")
#         print("questions", questions)
#         result = {"evidences": []}
#         result["claim"] = fact["claim"]
#         top_100_docs = data[index]["docs"]
#         print("top_100_docs", len(top_100_docs), len(list(set(top_100_docs))))
#         doc_embeddings = model.encode(top_100_docs)
#         top_k_docs = []
#         for question in questions:
#             print("question", question)
#             top_1_docs = get_top_k_similar_instances(
#                 question, doc_embeddings, top_100_docs, 1, 0.5
#             )
#             top_k_docs.extend(top_1_docs)
#         if len(top_k_docs) == 0 and len(questions) > 0:
#             top_k_docs = get_top_k_similar_instances(
#                 questions[0], doc_embeddings, top_100_docs, 1, 0.5
#             )
#         print(len(top_k_docs), len(list(set(top_k_docs))))
#         top_k_docs = list(set(top_k_docs))
#         questions = list(set(questions))
#         print("top_k_docs", len(top_k_docs))
#         verdicts["claim"].append(fact["claim"])
#         if len(top_k_docs) > 0:
#             for doc in top_k_docs:
#                 result["evidences"].append(doc)
#             input = (
#                 "[Claim]: "
#                 + fact["claim"]
#                 + "[Questions]: "
#                 + " ".join(questions)
#                 + "[Evidences]:"
#                 + " ".join(top_k_docs)
#             )
#             pred_label, _ = nli_model.predict(input, max_length=256)
#         elif len(top_k_docs) == 0:
#             print("No documents retrieved verifying claim directly")
#             pred_label, _ = nli_model.predict(fact["claim"])
#         # pred_label = pred_label if abs(probs[1]-probs[0]) > 0.2 else "NONE"
#         print("pred_label", pred_label)
#         if pred_label == "SUPPORTS":
#             verdict = "True"
#         elif pred_label == "REFUTES":
#             verdict = "False"
#         elif pred_label == "CONFLICTING":
#             verdict = "Conflicting"

#         print("Verdict:", verdict)
#         verdicts["verdict"].append(verdict)
#         results.append(result)
#         verdict_1 = pd.DataFrame(verdicts)
#         print(verdict_1)
#         output_path = config["output_path"]
#         verdict_1.to_csv(f"{output_path}.csv", index=False)
#         print(f"{fact['claim']}\t{fact['label']}\t{verdict}")
#         if verdict == fact["label"]:
#             matches += 1
#         else:
#             unmatches += 1
#         print("accuracy", matches / (matches + unmatches))
#         with open(f"{output_path}.json", "w") as f:
#             json.dump(results, f, indent=4, sort_keys=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]