# Project: Automated Fact Checking For Climate Science Claims

In [1]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Student Name: Hongda Zhu

Student ID: 1259524

This notebook builds an Automated Fact Checking System for the 2023 COMP90042 Natural Language Processing Project.

Note:
1. Make sure all data files are loaded.
2. Change the Environment Variables block. Set the parameters to run.
3. Run eval.py to test the model: `python eval.py --predictions output.json --groundtruth check_answers.json`

In [None]:
# Print the Python interpreter version and the installed PyTorch version.
import sys
print(sys.version)

import torch
print(torch.__version__)


3.8.16 (default, Mar  1 2023, 21:19:10) 
[Clang 14.0.6 ]
1.13.1


# Load datasets

In [2]:
import json

def load_data(filename):
    """Load and return the decoded JSON content of ``filename``.

    Args:
        filename: Path to a JSON file.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when json.load raises.
    # JSON is UTF-8 by specification, so do not rely on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

# All dataset files live in a single Drive folder; change DATA_DIR to relocate them.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks"

train_data = load_data(f"{DATA_DIR}/train-claims.json")
dev_data = load_data(f"{DATA_DIR}/dev-claims.json")
evidence_data = load_data(f"{DATA_DIR}/evidence.json")
test_data = load_data(f"{DATA_DIR}/test-claims-unlabelled.json")

# Merge train_data and dev_data into one larger dataset. A new dict is built,
# so train_data and dev_data are left untouched; on a duplicate claim id the
# dev_data entry wins.
merged_data = {**train_data, **dev_data}

print("Number of train_data: ", len(train_data))
print("Number of dev_data: ", len(dev_data))
print("Number of evidence_data: ", len(evidence_data))
print("Number of test_data: ", len(test_data))
print("Number of merged_data: ", len(merged_data))

Number of train_data:  1228
Number of dev_data:  154
Number of evidence_data:  1208827
Number of test_data:  153
Number of merged_data:  1382


**Environment variables**

You can modify the variables below to change the hyperparameters or the pretrained model.

In [28]:
import torch

k = 3 # Number of top-ranked evidence passages retrieved per claim
max_length_token = 128 # Maximum tokenized sequence length (inputs are truncated/padded to this length)
batch_size = 2 # Number of samples per batch
model_type = "bert-base-uncased" # Pretrained checkpoint to fine-tune, e.g. "bert-base-uncased" or "roberta-base"
epochs = 3 # Number of epochs to train
lr = 3e-5 # Model learning rate
device = torch.device("cpu") # Device the model runs on; use "cuda" to run on a GPU
use_merged_data = True # If True, train on merged_data (train + dev claims) instead of train_data

# Preprocessing the data

Evidence retrieval

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# To retrieve top k relevant evidences for the claim
# fit_transform the evidence_text to TF-IDF matrix. transform claim_text to TF-IDF vector
# Calculate cosine_similarity on evidence_text and claim_text
# Return the top k relevant evidence_ids
evidence_texts = list(evidence_data.values())
# Evidence ids in the same order as evidence_texts, so a row index of the
# TF-IDF matrix maps straight back to its id. Built once here rather than
# rebuilding the full key list for every retrieved index.
evidence_id_list = list(evidence_data.keys())
vectorizer = TfidfVectorizer()
evidence_texts_tfidf_matrix = vectorizer.fit_transform(evidence_texts)

# Retrieve top-k evidence
def get_top_k_evidence(claim_text, top_k=None):
    """Return the ids of the evidence passages most similar to ``claim_text``.

    The claim is projected into the TF-IDF space fitted on the evidence texts
    and ranked by cosine similarity against every evidence passage.

    Args:
        claim_text: The claim string to retrieve evidence for.
        top_k: Number of evidence ids to return. Defaults to the module-level
            ``k`` when omitted.

    Returns:
        A list of evidence ids ordered from most to least similar. Empty when
        the requested count is zero or negative.
    """
    if top_k is None:
        top_k = k
    if top_k <= 0:
        # A [-0:] slice selects every row and a negative count slices from
        # the wrong end, so handle non-positive counts explicitly.
        return []

    claim_text_tfidf_vector = vectorizer.transform([claim_text])
    similarities = cosine_similarity(claim_text_tfidf_vector, evidence_texts_tfidf_matrix)
    # argsort is ascending: take the last top_k indices and reverse them so the
    # best match comes first.
    top_k_indices = similarities.argsort()[0][-top_k:][::-1]
    return [evidence_id_list[index] for index in top_k_indices]

# for dataset in [train_data, dev_data]:
#     for claim_id, claim in dataset.items():
#         claim['evidences'] = get_top_k_evidence(claim['claim_text'])

Create PyTorch Dataset and DataLoader for claim_label

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Load the tokenizer for the pretrained checkpoint named by model_type.
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Mapping labels to numbers
# Each label's integer id is its position in this tuple.
label_mapping = {
    label: index
    for index, label in enumerate(
        ("SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED")
    )
}

class convertPytorchDataset(Dataset):
    """Map-style dataset yielding tokenized claim + retrieved-evidence inputs.

    Each item pairs one claim with the evidence ids returned by
    ``get_top_k_evidence`` and tokenizes the claim text concatenated with the
    corresponding evidence texts.

    Args:
        claim_data: Dict mapping claim id -> claim record. Each record must
            hold "claim_text" and may hold "claim_label".
        evidence_data: Dict mapping evidence id -> evidence text.
        tokenizer: Tokenizer object exposing ``encode_plus``.
    """

    def __init__(self, claim_data, evidence_data, tokenizer):
        self.claim_data = claim_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer
        # Snapshot the (id, record) pairs once so indexing does not rebuild
        # the whole item list on every access.
        self._items = list(claim_data.items())
        # claim id -> retrieved evidence ids. Retrieval does not depend on the
        # epoch, so each claim is searched once instead of once per epoch.
        self._evidence_cache = {}

    def __len__(self):
        return len(self._items)

    def __getitem__(self, idx):
        """Return the model inputs and metadata for the claim at ``idx``.

        Returns:
            Dict with "input_ids" and "attention_mask" (long tensors),
            "claim_id", "claim_text", "labels" (scalar long tensor; -1 when
            the claim has no label) and "evidences" (list of evidence ids).
        """
        claim_id, claim_data = self._items[idx]
        claim_text = claim_data["claim_text"]
        claim_label = claim_data.get("claim_label", None)

        # Retrieve top k evidences for the claim (cached after the first lookup)
        evidence_ids = self._evidence_cache.get(claim_id)
        if evidence_ids is None:
            evidence_ids = get_top_k_evidence(claim_text)
            self._evidence_cache[claim_id] = evidence_ids

        # Concatenate the claim text with the evidence texts, space separated.
        # Read from the dataset's own evidence_data, not the module global.
        evidence_suffix = "".join(" " + self.evidence_data[eid] for eid in evidence_ids)
        claim_and_evidence_text = claim_text + evidence_suffix.rstrip()

        # Tokenize claim and evidence
        # encode_plus is called with truncation and max_length padding, so the
        # encoded sequence has length max_length_token
        tokens = self.tokenizer.encode_plus(claim_and_evidence_text, truncation=True, padding="max_length", max_length=max_length_token)

        # Convert label to a number if available; -1 marks an unlabelled claim
        if claim_label is not None:
            label_num = label_mapping[claim_label]
        else:
            label_num = -1

        item = {
            "input_ids": torch.tensor(tokens["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(tokens["attention_mask"], dtype=torch.long),
            "claim_id": claim_id,
            "claim_text": claim_text,
            "labels": torch.tensor(label_num, dtype=torch.long),
            # Hand out a copy so callers cannot mutate the cached list.
            "evidences": list(evidence_ids)
        }
        return item


# Packing data into batch for creating dataloader
def collate_fn(batch):
    """Collate dataset items into one zero-padded batch.

    Args:
        batch: List of item dicts with "input_ids", "attention_mask",
            "labels", "claim_id", "claim_text" and "evidences". ``None``
            entries are dropped.

    Returns:
        ``None`` when no usable item remains; otherwise a dict with
        "input_ids" and "attention_mask" (long tensors of shape
        (batch, longest sequence in this batch)), "claim_label" (long tensor
        of shape (batch,)), and the lists "claim_id", "claim_text" and
        "evidences".
    """
    batch_list = [b for b in batch if b is not None]
    if not batch_list:
        return None

    num_items = len(batch_list)
    max_len = max(b["input_ids"].shape[0] for b in batch_list)

    # Make sure each tensor is in the same shape: pre-fill with 0 up to the
    # longest sequence in this batch, then overwrite the leading positions.
    input_ids = torch.zeros(num_items, max_len, dtype=torch.long)
    attention_mask = torch.zeros(num_items, max_len, dtype=torch.long)
    labels = torch.zeros(num_items, dtype=torch.long)

    for i, b in enumerate(batch_list):
        cur_len = b["input_ids"].shape[0]
        # Slice assignment copies the whole row at once instead of one
        # element at a time.
        input_ids[i, :cur_len] = b["input_ids"]
        attention_mask[i, :cur_len] = b["attention_mask"][:cur_len]
        labels[i] = b["labels"]

    claim_ids = [b["claim_id"] for b in batch_list]
    claim_texts = [b["claim_text"] for b in batch_list]
    evidences = [b["evidences"] for b in batch_list]

    return {"input_ids": input_ids, "attention_mask": attention_mask, "claim_id": claim_ids, "claim_text": claim_texts, "claim_label": labels, "evidences": evidences}


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Training the model

In [31]:
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        dataloader: Iterable of batch dicts with "input_ids",
            "attention_mask" and "claim_label" tensors. ``None`` batches are
            skipped.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the processed batches, or NaN if no batch was processed.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader, desc="Training"):
        # The collate function yields None when a batch has no usable item.
        if batch is None:
            continue

        optimizer.zero_grad() # Resets the gradients from the previous step to prevent accumulation.

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["claim_label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward() # Calculates the gradients for each parameter
        optimizer.step() # Update parameters based on gradients
        scheduler.step() # Update learning rate

        total_loss += loss.item()
        num_batches += 1

    # Average over the batches actually processed; an empty epoch has no
    # meaningful loss, so report NaN rather than dividing by zero.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

# One output class per entry in label_mapping
num_labels = len(label_mapping)
# If use_merged_data is True, train on merged_data (train + dev claims).
# NOTE(review): in that case dev_data is part of the training set, so the dev
# predictions produced later in this notebook are not a held-out evaluation.
if use_merged_data:
    train_data = merged_data.copy()
    
train_dataset = convertPytorchDataset(train_data, evidence_data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Create the sequence-classification model from the pretrained checkpoint
claim_classification_model = AutoModelForSequenceClassification.from_pretrained(model_type, num_labels=num_labels)
claim_classification_model.to(device)
# Optimizer that updates the model parameters during training
optimizer = AdamW(claim_classification_model.parameters(), lr=lr)
# Linear learning-rate schedule with no warmup, spanning every batch of every epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

# Train for the configured number of epochs, reporting the mean loss of each
for epoch in range(epochs):
    print(f"Training Epoch {epoch+1}/{epochs}")
    train_loss = train_epoch(claim_classification_model, train_dataloader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training Epoch 1/3


Training: 100%|██████████| 691/691 [30:38<00:00,  2.66s/it]


Train Loss: 1.2731
Training Epoch 2/3


Training: 100%|██████████| 691/691 [26:17<00:00,  2.28s/it]


Train Loss: 1.2276
Training Epoch 3/3


Training: 100%|██████████| 691/691 [26:09<00:00,  2.27s/it]

Train Loss: 0.9650





# Prediction on test data

In [32]:
# Predicted class index -> label name; the index is the position in this tuple.
labelNum_mapping = dict(
    enumerate(("SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"))
)

# Prediction function for both dev and test data
def save_predictions(claim_classification_model, dataloader, output_file, has_labels=True):
    """Classify every claim in ``dataloader`` and write the results as JSON.

    Args:
        claim_classification_model: Model called as
            ``model(input_ids, attention_mask=...)`` whose output exposes
            ``.logits``.
        dataloader: Iterable of batch dicts with "input_ids",
            "attention_mask", "claim_id", "claim_text" and "evidences".
            ``None`` batches are skipped.
        output_file: Path of the JSON file to write.
        has_labels: Currently unused; kept so existing calls keep working.

    The output maps each claim id to its "claim_text", predicted
    "claim_label" and retrieved "evidences".
    """
    claim_classification_model.eval()
    predictions = {} # Store prediction output

    for batch in dataloader:
        # The collate function yields None when a batch has no usable item.
        if batch is None:
            continue

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad(): # Disable gradient computation to save memory
            outputs = claim_classification_model(input_ids, attention_mask=attention_mask)
            # Index of the highest logit per row = predicted class
            _, predicted_indices = torch.max(outputs.logits, 1)
        predicted_classes = predicted_indices.tolist()

        for claim_id, claim_text, top_evidences, class_index in zip(
            batch["claim_id"], batch["claim_text"], batch["evidences"], predicted_classes
        ):
            predictions[claim_id] = {
                "claim_text": claim_text,
                "claim_label": labelNum_mapping[class_index],
                "evidences": top_evidences
            }

    with open(output_file, "w") as outfile:
        json.dump(predictions, outfile, indent=2)

# Predictions on dev data
# Predictions on dev data
dev_dataset = convertPytorchDataset(dev_data, evidence_data, tokenizer)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
save_predictions(claim_classification_model, dev_dataloader, "dev_claims_out.json")

# Predict on test data. Use the configured batch_size (as the dev loader does)
# rather than a hard-coded value.
test_dataset = convertPytorchDataset(test_data, evidence_data, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
save_predictions(claim_classification_model, test_dataloader, "test-claims-predictions.json", has_labels=False)