In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from rank_bm25 import BM25Okapi
import numpy as np
import nltk
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer 
import heapq
import pandas as pd
from ast import literal_eval

In [None]:
# # A dependency of the preprocessing for BERT inputs
# !pip install -q -U "tensorflow-text==2.11.*"
# !pip install -U "tensorflow==2.11.*"
# !pip install -q tf-models-official==2.11.0
!pip install torch torchvision transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
df = pd.read_json('/content/drive/MyDrive/Sem 2/NLP /Assignment 3/project-data/test-claims-unlabelled.json', orient='index')
# df = pd.read_json('/content/drive/MyDrive/Sem 2/NLP /Assignment 3/project-data/train-claims.json', orient='index')
df.reset_index(inplace=True)
df.rename(columns={"index":"claim_id"},inplace=True)

evidence_data = pd.read_json('/content/drive/MyDrive/Sem 2/NLP /Assignment 3/project-data/evidence.json', orient = 'index')
evidence_data.columns = ['evidence_text']
evidence_data.reset_index(inplace=True)
evidence_data.rename(columns={"index":"evidence_id"},inplace=True)

In [None]:
evidence_list = evidence_data.evidence_text.values.tolist()

In [None]:
print(len(evidence_list))

28951


In [None]:
tokenized_corpus = [doc.split(" ") for doc in evidence_list]

In [None]:
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
new_test_data = pd.DataFrame(columns=['claim_id','claim_text','evidences'])
for index, row in df.iterrows():
  claim_id = row['claim_id']
  claim = df.loc[index, 'claim_text']
  tokenized_query = claim.split(" ")
  doc_scores = bm25.get_scores(tokenized_query)
  result = bm25.get_top_n(tokenized_query, evidence_list, n=7)
  for i in range(7):
    evidence = result[i]
    evidence_id = evidence_data.loc[evidence_data['evidence_text'] == evidence].evidence_id.values[0]
    new_row = {'claim_id':claim_id,'claim_text': claim ,'evidences':evidence_id}
    new_test_data.loc[len(new_test_data)] = new_row

In [None]:
print(doc_scores)

[ 0.37743993  2.70600119 10.81267953 ...  6.25689944  5.92141801
 13.49597297]


In [None]:
print(new_test_data.head())

     claim_id                                         claim_text  \
0  claim-1001  ‘This study goes beyond statistical correlatio...   
1  claim-1001  ‘This study goes beyond statistical correlatio...   
2  claim-1001  ‘This study goes beyond statistical correlatio...   
3  claim-1001  ‘This study goes beyond statistical correlatio...   
4  claim-1001  ‘This study goes beyond statistical correlatio...   

         evidences  
0  evidence-368618  
1  evidence-399850  
2  evidence-929308  
3  evidence-277435  
4   evidence-78930  


In [None]:
training_data = pd.read_json('/content/drive/MyDrive/Sem 2/NLP /Assignment 3/project-data/train-claims.json', orient='index')
training_data = training_data.assign(evidences = training_data['evidences']).explode('evidences')
training_data.reset_index(inplace=True)
training_data.rename(columns={"index":"claim_id"},inplace=True)

In [None]:
dev_data = pd.read_json('/content/drive/MyDrive/Sem 2/NLP /Assignment 3/project-data/dev-claims.json', orient='index')
dev_data = dev_data.assign(evidences = dev_data['evidences']).explode('evidences')
dev_data.reset_index(inplace=True)
dev_data.rename(columns={"index":"claim_id"},inplace=True)

In [None]:
full_evidence_data = pd.read_json('/content/drive/MyDrive/Sem 2/NLP /Assignment 3/project-data/evidence.json', orient = 'index')
full_evidence_data.columns = ['evidence_text']
full_evidence_data.index.name = 'evidence_id'

In [None]:
#load pretrained bert base model
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-large-cased',return_dict=True)

print("Done loading BERT model.")

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done loading BERT model.


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import tensorflow as tf
class ClaimDataset(Dataset):

  def __init__(self, training_data, test_data, full_evidence_data, maxlen, mode='train'):

    if mode != 'train' and mode != 'test':
      raise ValueError("mode should either be train or test")
    self.df = training_data
    self.evidence_data = full_evidence_data

    if mode == 'train':
      self.df = training_data
      possible_labels = self.df.claim_label.unique()
      label_dict = {'SUPPORTS': 0, 'NOT_ENOUGH_INFO': 1, 'REFUTES': 2, 'DISPUTED': 3}
      self.df['label'] = self.df['claim_label'].map(label_dict)

    elif mode == 'test':
      
      self.df = test_data
      self.df['claim_label'] = 'NO_LABEL'
      self.df['label'] = 1

    # print(self.df.head())
    #Initialize the BERT tokenizer
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    self.maxlen = maxlen

  def __len__(self):
      return len(self.df)

  def __getitem__(self, index):
    #Selecting the claim text and claim label at the specified index in the data frame
    claim = self.df.loc[index, 'claim_text']
    evidence_id = self.df.loc[index, 'evidences']
    evidence = self.evidence_data.loc[evidence_id, 'evidence_text']

    
    # label = self.df.label.values
    label = self.df.loc[index, 'label']

    # #Preprocessing the text to be suitable for BERT
    tokens = ['[CLS]'] + self.tokenizer.tokenize(claim) + ['[SEP]']
    segment_ids = [0] * len(tokens)
    if evidence:
      tokens += self.tokenizer.tokenize(evidence)  + ['[SEP]']
      segment_ids += [1] * (len(self.tokenizer.tokenize(evidence)) + 1)

    if len(tokens) < self.maxlen:
      tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
      segment_ids = segment_ids + [0 for _ in range(self.maxlen - len(segment_ids))]
    else:
      tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length
      segment_ids = segment_ids[:self.maxlen-1] + [0]

    tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
    tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor
    segment_ids_tensor = torch.tensor(segment_ids)
    labels_tensor = tf.keras.utils.to_categorical(label, num_classes=4)

    # Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
    attn_mask = (tokens_ids_tensor != 0).long()
    return tokens_ids_tensor, segment_ids_tensor, attn_mask, labels_tensor

In [None]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a se ntence can have
#any sentence longer than this length is truncated to the maxlen size

training_data_set = ClaimDataset(training_data, new_test_data, full_evidence_data, maxlen = 50, mode='train')
dev_data_set = ClaimDataset(dev_data, new_test_data, full_evidence_data, maxlen = 50, mode='train')

#Creating intsances of training and development dataloaders
train_loader = DataLoader(training_data_set, batch_size = 32, num_workers = 2)
dev_loader = DataLoader(dev_data_set, batch_size = 32, num_workers = 2)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [None]:
import torch.nn as nn
from transformers import BertModel, BertForSequenceClassification, AdamW, BertConfig,AutoModelForSequenceClassification
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
labels = training_data.claim_label.unique()
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 4,
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)
# Tell pytorch to run this model on the GPU.
model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# AdamW is an optimizer which is a Adam Optimzier with weight-decay-fix
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )




In [None]:
gpu = 0 #gpu ID
# print("Creating the evidence classifier, initialised with pretrained BERT-BASE parameters...")
# net = EvidenceClassifier()
# net.cuda(gpu)
# print("Done creating Evidence classifier")

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
from scipy.special import softmax
def flat_accuracy(preds, labels):
    softmax_pred = softmax(preds)
    new_preds = (softmax_pred == softmax_pred.max(axis=1, keepdims=1)).astype(float)
    pred_flat = new_preds.flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
import random
import numpy as np
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device("cuda")

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  t0 = time.time()
  total_train_loss = 0
  model.train()
  for step, batch in enumerate(train_loader):
    b_input_ids = batch[0].to(device)
    b_segment_ids = batch[1].to(device)
    b_input_mask = batch[2].to(device)
    b_labels = batch[3].to(device)

    model.zero_grad()

    output = model(b_input_ids, token_type_ids=b_segment_ids, attention_mask=b_input_mask, labels=b_labels)
    loss = output.loss
    logits = output.logits
    # print(output)


    total_train_loss += loss.item()
    loss.backward()
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

    if step % 25 == 0 and step != 0:
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      accuracy= flat_accuracy(logits, label_ids)
      elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}: Accuracy: {:>5,}   Loss: {:>5,}   Time Elapsed: {:}.'.format(step, len(train_loader), accuracy, loss.item(), elapsed))
  
  avg_train_loss = total_train_loss / len(train_loader)
  training_time = format_time(time.time() - t0)

  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))


  print("Running Validation...")

  t0 = time.time()
  model.eval()

  # Tracking variables 
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0

  for batch in dev_loader:
    b_input_ids = batch[0].to(device)
    b_segment_ids = batch[1].to(device)
    b_input_mask = batch[2].to(device)
    b_labels = batch[3].to(device)

    with torch.no_grad():
      output = model(b_input_ids, token_type_ids=b_segment_ids, attention_mask=b_input_mask, labels=b_labels)
      loss = output.loss
      logits = output.logits
    
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    total_eval_accuracy += flat_accuracy(logits, label_ids)
  
  avg_val_accuracy = total_eval_accuracy / len(dev_loader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
  avg_val_loss = total_eval_loss / len(dev_loader)

  validation_time = format_time(time.time() - t0)

  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))

  # Record all statistics from this epoch.
  training_stats.append(
      {
          'epoch': epoch_i + 1,
          'Training Loss': avg_train_loss,
          'Valid. Loss': avg_val_loss,
          'Valid. Accur.': avg_val_accuracy,
          'Training Time': training_time,
          'Validation Time': validation_time
      }
  )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
  



  Batch    25  of    129: Accuracy: 0.6875   Loss: 0.5435417890548706   Time Elapsed: 0:00:09.
  Batch    50  of    129: Accuracy: 0.625   Loss: 0.5630084276199341   Time Elapsed: 0:00:15.
  Batch    75  of    129: Accuracy: 0.640625   Loss: 0.5738755464553833   Time Elapsed: 0:00:22.
  Batch   100  of    129: Accuracy: 0.8125   Loss: 0.39551132917404175   Time Elapsed: 0:00:28.
  Batch   125  of    129: Accuracy:  0.75   Loss: 0.5275835990905762   Time Elapsed: 0:00:35.

  Average training loss: 0.52
  Training epcoh took: 0:00:36
Running Validation...
  Accuracy: 0.73
  Validation Loss: 0.50
  Validation took: 0:00:02
  Batch    25  of    129: Accuracy: 0.6875   Loss: 0.5153732299804688   Time Elapsed: 0:00:07.
  Batch    50  of    129: Accuracy: 0.625   Loss: 0.5563769340515137   Time Elapsed: 0:00:14.
  Batch    75  of    129: Accuracy: 0.765625   Loss: 0.5358949899673462   Time Elapsed: 0:00:20.
  Batch   100  of    129: Accuracy: 0.921875   Loss: 0.31677091121673584   Time Elapse

In [None]:
# TESTING PHASE
test_data_set = ClaimDataset(training_data, new_test_data, full_evidence_data, maxlen = 50, mode='test')

#Creating intsances of training and development dataloaders
test_loader = DataLoader(test_data_set, batch_size = 32, num_workers = 2)

In [None]:
model.eval()

test_prediction = []
label_dict = {0:'SUPPORTS', 1:'NOT_ENOUGH_INFO', 2:'REFUTES', 3:'DISPUTED'}

for batch in test_loader:
  b_input_ids = batch[0].to(device)
  b_segment_ids = batch[1].to(device)
  b_input_mask = batch[2].to(device)
  #    Dummy label values as we don't have ground truth
  b_labels = batch[3].to(device)

  with torch.no_grad():
    output = model(b_input_ids, token_type_ids=b_segment_ids, attention_mask=b_input_mask, labels=b_labels)
    logits = output.logits

    # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  softmax_pred = softmax(logits)
  new_preds = (softmax_pred == softmax_pred.max(axis=1, keepdims=1)).astype(float)

  for one_hot_array in new_preds:
    label = label_dict.get(np.argmax(one_hot_array), 'NOT_ENOUGH_INFO')
    test_prediction.append(label)

  

In [None]:
print(test_prediction)
print(len(test_prediction))
print(len(test_data_set))

['NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_I

In [None]:
print(len(new_test_data))
new_test_data['claim_label'] = test_prediction
print(new_test_data.head())

2295
     claim_id                                         claim_text  \
0  claim-1001  ‘This study goes beyond statistical correlatio...   
1  claim-1001  ‘This study goes beyond statistical correlatio...   
2  claim-1001  ‘This study goes beyond statistical correlatio...   
3  claim-1001  ‘This study goes beyond statistical correlatio...   
4  claim-1001  ‘This study goes beyond statistical correlatio...   

         evidences      claim_label  label  
0  evidence-368618  NOT_ENOUGH_INFO      1  
1  evidence-399850  NOT_ENOUGH_INFO      1  
2  evidence-929308  NOT_ENOUGH_INFO      1  
3  evidence-277435         SUPPORTS      1  
4   evidence-78930         SUPPORTS      1  


In [None]:
# test_json_df = new_test_data.groupby(['claim_id']).agg({'claim_text': 'first','claim_label': lambda x: x.mode().iloc[0],'evidences': lambda x: list(x)})

In [None]:
claim_id_list = new_test_data['claim_id'].unique()
test_json_df = pd.DataFrame(columns=['claim_id','claim_text','evidences','claim_label'])
for id in claim_id_list:
  claim = new_test_data.loc[new_test_data.claim_id== id, 'claim_text'].values[0]
  evidences = new_test_data.loc[new_test_data.claim_id== id, 'evidences'].values
  labels = new_test_data.loc[new_test_data.claim_id== id, 'claim_label'].values
  if 'SUPPORTS' in labels:
    label = 'SUPPORTS'
  elif 'REFUTES' in labels:
    label = 'REFUTES'
  elif 'DISPUTED' in labels:
    label = 'DISPUTED'
  else:
    label = 'NOT_ENOUGH_INFO'

  new_row = {'claim_id':id,'claim_text': claim ,'evidences':evidences,'claim_label':label}
  test_json_df.loc[len(test_json_df)] = new_row

In [None]:
test_json_df = test_json_df.set_index('claim_id')

In [None]:
print(test_json_df)

                                                   claim_text  \
claim_id                                                        
claim-1001  ‘This study goes beyond statistical correlatio...   
claim-1003  A recent study in Nature Geoscience, for insta...   
claim-1009  ‘Arctic ice conditions have been tracking at r...   
claim-1020  “The global reef crisis does not necessarily m...   
claim-1028  A second coat of paint has much less of an eff...   
...                                                       ...   
claim-910   The cement, iron and steel, and petroleum refi...   
claim-942   ‘We could be decades too fast, or decades too ...   
claim-952   The Alaskan tundra is warming so quickly it ha...   
claim-972   “Arctic land stores about twice as much carbon...   
claim-979   “Warm weather worsened the most recent five-ye...   

                                                    evidences      claim_label  
claim_id                                                                 

In [None]:
# TO JSON 
json_result = test_json_df.to_json(orient='index')

In [None]:
from google.colab import files
with open('test-claims-predictions.json', 'w') as f:
  f.write(str(json_result))

files.download('test-claims-predictions.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>