# Project: Automated Fact Checking For Climate Science Claims

In [1]:
# Mount Google Drive so the dataset files under /content/drive can be read
# and the prediction files can be written back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Student Name: Hongda Zhu

Student ID: 1259524

In [2]:
# Record the Python and PyTorch versions this notebook was run with.
import sys
print(sys.version)

import torch
print(torch.__version__)


3.10.11 (main, Apr  5 2023, 14:15:10) [GCC 9.4.0]
2.0.0+cu118


# Load datasets

In [3]:
import json

def load_data(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (for the claim and evidence files used in
        this notebook, a dict keyed by id).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails, and the
    # explicit encoding keeps the result independent of the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

# Directory on the mounted Drive that holds every dataset file. Change it here
# instead of editing each path below.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks"

# Labelled claims for training and development, the evidence corpus, and the
# unlabelled test claims.
train_data = load_data(f"{DATA_DIR}/train-claims.json")
dev_data = load_data(f"{DATA_DIR}/dev-claims.json")
evidence_data = load_data(f"{DATA_DIR}/evidence.json")
test_data = load_data(f"{DATA_DIR}/test-claims-unlabelled.json")

# Quick sanity check on the size of each split.
print("Number of train_data: ", len(train_data))
print("Number of dev_data: ", len(dev_data))
print("Number of evidence_data: ", len(evidence_data))
print("Number of test_data: ", len(test_data))

# print("train_data: ", train_data)
# print("dev_data: ", dev_data)
# print("evidence_data: ", evidence_data)
# for id, info in train_data.items():
#     print("claim_id: ", id)
#     print("claim_text: ", info['claim_text'])
#     print("claim_label: ", info['claim_label'])
#     print("evidences: ", info['evidences'])
#     print()
# print("dev_data: ", dev_data)
# print("evidence_data: ", evidence_data)

Number of train_data:  1228
Number of dev_data:  154
Number of evidence_data:  1208827
Number of test_data:  153


**Environment variables**

You can modify the variables below to change the hyperparameters or the pretrained model.

In [4]:
import torch

# Hyperparameters and run settings read by the cells below.
k = 5 # Top k relevant evidences retrieved for each claim
max_length_token = 128 # The maximum token length; inputs are truncated/padded to it
batch_size = 2 # Number of claims per mini-batch in the DataLoaders
model_type = "bert-base-uncased" # Pretrained checkpoint used for the tokenizer and the model
epochs = 3 # Number of epochs to train
lr = 3e-5 # Model learning rate
device = torch.device("cpu") # Set cpu or cuda(gpu) to train the model.

# Preprocessing the data

Evidence retrieval

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build the TF-IDF index over the whole evidence corpus once. The id list and
# the text list come from the same dict, so row i of the matrix corresponds to
# evidence_id_list[i].
evidence_id_list = list(evidence_data.keys())
evidence_texts = list(evidence_data.values())
vectorizer = TfidfVectorizer(stop_words='english')
evidence_tfidf_matrix = vectorizer.fit_transform(evidence_texts)

# Retrieve top-k evidence
def get_top_k_evidence(claim, top_k=None):
    """Return the ids of the evidence passages most similar to ``claim``.

    Similarity is the cosine similarity between the TF-IDF vector of the claim
    and the TF-IDF vectors of the evidence passages.

    Args:
        claim: Claim text to search with.
        top_k: Number of ids to return. Defaults to the notebook-level ``k``,
            read at call time.

    Returns:
        A list of evidence ids ordered from most to least similar. Empty when
        the requested count is not positive.
    """
    if top_k is None:
        top_k = k
    # A count of 0 would turn the slice below into ``[-0:]``, i.e. the whole corpus.
    if top_k <= 0:
        return []

    claim_tfidf_vector = vectorizer.transform([claim])
    similarities = cosine_similarity(claim_tfidf_vector, evidence_tfidf_matrix)
    # argsort is ascending: keep the last top_k positions and reverse them.
    top_k_indices = similarities.argsort()[0][-top_k:][::-1]
    # Index into the precomputed id list rather than rebuilding the full list
    # of keys for every returned index.
    return [evidence_id_list[index] for index in top_k_indices]

# for dataset in [train_data, dev_data]:
#     for claim_id, claim in dataset.items():
#         claim['evidences'] = get_top_k_evidence(claim['claim_text'])

Create PyTorch Dataset and DataLoader for claim_label

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Load the tokenizer that matches the checkpoint named by model_type.
tokenizer = AutoTokenizer.from_pretrained(model_type)

class convertPytorchDataset(Dataset):
    """Map-style dataset that pairs each claim with its retrieved evidence.

    Each item is built by retrieving evidence ids for the claim with
    ``get_top_k_evidence``, joining the claim text and the evidence texts into
    one string, and tokenizing it to ``max_length_token`` tokens.

    Args:
        claim_data: Dict mapping claim id to a dict that holds "claim_text"
            and, optionally, "claim_label".
        evidence_data: Dict mapping evidence id to evidence text.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        label_mapping: Dict mapping a label string to its integer class index.
    """

    def __init__(self, claim_data, evidence_data, tokenizer, label_mapping):
        self.claim_data = claim_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer
        self.label_mapping = label_mapping
        # Snapshot the (claim_id, claim) pairs once so that item lookup does not
        # rebuild a list of the whole dict on every access.
        self._claim_items = list(claim_data.items())

    def __len__(self):
        """Return the number of claims in the dataset."""
        return len(self._claim_items)

    def __getitem__(self, idx):
        """Build the model input for the claim at position ``idx``.

        Returns:
            A dict with "input_ids" and "attention_mask" (long tensors),
            "claim_id", "claim_text", "labels" (a scalar long tensor, -1 when
            the claim has no "claim_label") and "evidences" (the retrieved
            evidence ids).
        """
        claim_id, claim = self._claim_items[idx]
        claim_text = claim["claim_text"]
        claim_label = claim.get("claim_label", None)

        # Retrieve top k evidences for the claim
        evidence_ids = get_top_k_evidence(claim_text)

        # Look the texts up in the corpus handed to this dataset, not in the
        # notebook-level global of the same name.
        evidence_texts = [self.evidence_data[eid] for eid in evidence_ids]

        # Join with a single space between every part, so the claim and the
        # first evidence passage are not fused into one word.
        claim_and_evidence_text = " ".join([claim_text, *evidence_texts])

        # Tokenize claim and evidence
        tokens = self.tokenizer.encode_plus(claim_and_evidence_text, truncation=True, padding="max_length", max_length=max_length_token)

        # Convert label to a number if available; -1 marks an unlabelled claim.
        if claim_label is not None:
            label = self.label_mapping[claim_label]
        else:
            label = -1

        return {
            "input_ids": torch.tensor(tokens["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(tokens["attention_mask"], dtype=torch.long),
            "claim_id": claim_id,
            "claim_text": claim_text,
            "labels": torch.tensor(label, dtype=torch.long),
            "evidences": evidence_ids
        }


# Convert labels to numbers: each label string gets the integer class index
# that the classifier is trained to predict.
label_mapping = {
    "SUPPORTS": 0,
    "REFUTES": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3,
}

# Wrap the training and development claims; both share the evidence corpus,
# tokenizer and label mapping.
train_dataset = convertPytorchDataset(train_data, evidence_data, tokenizer, label_mapping)
dev_dataset = convertPytorchDataset(dev_data, evidence_data, tokenizer, label_mapping)

def collate_fn(batch):
    """Merge a list of dataset items into one batch dict.

    ``None`` entries are dropped. "input_ids" and "attention_mask" are
    right-padded with zeros to the longest "input_ids" in the batch and stacked
    into long tensors; "labels" becomes a 1-D long tensor; "claim_id",
    "claim_text" and "evidences" are returned as plain lists.

    Returns:
        The batch dict, or ``None`` when no usable item is left.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None

    n_samples = len(samples)
    longest = max(sample["input_ids"].shape[0] for sample in samples)

    # Zero-initialised buffers double as the padding.
    padded_ids = torch.zeros(n_samples, longest, dtype=torch.long)
    padded_mask = torch.zeros(n_samples, longest, dtype=torch.long)
    label_tensor = torch.zeros(n_samples, dtype=torch.long)

    for row, sample in enumerate(samples):
        width = sample["input_ids"].shape[0]
        padded_ids[row, :width] = sample["input_ids"]
        padded_mask[row, :width] = sample["attention_mask"]
        label_tensor[row] = sample["labels"]

    return {
        "input_ids": padded_ids,
        "attention_mask": padded_mask,
        "claim_id": [sample["claim_id"] for sample in samples],
        "claim_text": [sample["claim_text"] for sample in samples],
        "labels": label_tensor,
        "evidences": [sample["evidences"] for sample in samples],
    }



# Batch the datasets with the custom collate function. Only the training
# loader is shuffled; the dev loader keeps the dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Training the model

In [7]:
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

# One output class per entry in label_mapping.
num_labels = len(label_mapping)
# NOTE(review): `model` is not referenced again in the visible notebook; the
# model that is trained and used for prediction is `claim_classification_model`,
# created further down in this cell. Confirm whether this first load is needed.
model = AutoModelForSequenceClassification.from_pretrained(model_type, num_labels=num_labels)
model.to(device)

# NOTE(review): `optimizer` and `scheduler` are rebound for
# `claim_classification_model` before the training loop runs, so these two
# objects are replaced before they are used.
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        dataloader: Iterable of batch dicts with "input_ids", "attention_mask"
            and "labels" tensors. ``None`` batches are skipped.
        optimizer: Optimizer stepped once per processed batch.
        scheduler: Learning-rate scheduler stepped once per processed batch.
        device: Device the batch tensors are moved to.

    Returns:
        The average loss over the batches that were processed, or 0.0 when no
        batch was processed.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader, desc="Training"):
        # The collate function returns None for a batch with no usable items.
        if batch is None:
            continue

        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        num_batches += 1

    # Average over the batches actually trained on. This also avoids a
    # division by zero when the loader yields nothing.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Training the claim classification model: a fresh sequence classifier with
# one output per label, moved to the configured device.
claim_classification_model = AutoModelForSequenceClassification.from_pretrained(model_type, num_labels=num_labels)
claim_classification_model.to(device)

# Optimizer and linear schedule (no warmup) sized for every batch of every epoch.
optimizer = AdamW(claim_classification_model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

# Run the configured number of epochs and report the mean training loss of each.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs} (Claim Classification)")
    train_loss = train_epoch(claim_classification_model, train_dataloader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3 (Claim Classification)


Training: 100%|██████████| 614/614 [17:31<00:00,  1.71s/it]


Train Loss: 1.2725
Epoch 2/3 (Claim Classification)


Training: 100%|██████████| 614/614 [17:31<00:00,  1.71s/it]


Train Loss: 1.1386
Epoch 3/3 (Claim Classification)


Training: 100%|██████████| 614/614 [17:27<00:00,  1.71s/it]

Train Loss: 0.7053





# Prediction on dev and test data

In [8]:
# Reverse lookup: integer class index -> label string.
label_mapping_inverse = {index: name for name, index in label_mapping.items()}

def save_predictions(claim_classification_model, dataloader, output_file, has_labels=True):
    """Predict a label for every claim in ``dataloader`` and write them as JSON.

    Used for both the dev and the test data. The output maps each claim id to
    its "claim_text", predicted "claim_label" and retrieved "evidences".

    Args:
        claim_classification_model: Model called as
            ``model(input_ids, attention_mask=...)`` whose output exposes ``.logits``.
        dataloader: Iterable of batch dicts with "input_ids", "attention_mask",
            "claim_id", "claim_text" and "evidences".
        output_file: Path of the JSON file to write.
        has_labels: Accepted for call-site compatibility; not read by this function.
    """
    claim_classification_model.eval()
    predictions = {}

    for batch in dataloader:
        ids_on_device = batch["input_ids"].to(device)
        mask_on_device = batch["attention_mask"].to(device)
        claim_ids = batch["claim_id"]
        claim_texts = batch["claim_text"]
        evidences = batch["evidences"]

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = claim_classification_model(ids_on_device, attention_mask=mask_on_device).logits
            predicted = logits.max(dim=1).indices

        for position in range(len(claim_ids)):
            predicted_label = label_mapping_inverse[predicted[position].item()]
            predictions[claim_ids[position]] = {
                "claim_text": claim_texts[position],
                "claim_label": predicted_label,
                "evidences": evidences[position],
            }

    with open(output_file, "w") as outfile:
        json.dump(predictions, outfile, indent=2)

# Save predictions on dev data
save_predictions(claim_classification_model, dev_dataloader, "/content/drive/MyDrive/Colab Notebooks/dev_claims_out.json")

# Save predictions on test data. The loader uses the configured batch_size,
# like the train and dev loaders, instead of a separate hard-coded value.
test_dataset = convertPytorchDataset(test_data, evidence_data, tokenizer, label_mapping)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
save_predictions(claim_classification_model, test_dataloader, "/content/drive/MyDrive/Colab Notebooks/test-claims-predictions.json", has_labels=False)