# Project: Automated Fact Checking For Climate Science Claims

Student Name: Hongda Zhu

Student ID: 1259524

In [3]:
import sys
print(sys.version)

import torch
print(torch.__version__)


3.8.16 (default, Mar  1 2023, 21:19:10) 
[Clang 14.0.6 ]
1.13.1


# Load datasets

In [4]:
import json

def load_data(filename):
    file = open(filename, 'r')
    data = json.load(file)
    file.close()
    return data

train_data = load_data("train-claims.json")
dev_data = load_data("dev-claims.json")
evidence_data = load_data("evidence.json")
test_data = load_data("test-claims-unlabelled.json")

print("Number of train_data: ", len(train_data))
print("Number of dev_data: ", len(dev_data))
print("Number of evidence_data: ", len(evidence_data))
print("Number of test_data: ", len(test_data))

# print("train_data: ", train_data)
# print("dev_data: ", dev_data)
# print("evidence_data: ", evidence_data)
# for id, info in train_data.items():
#     print("claim_id: ", id)
#     print("claim_text: ", info['claim_text'])
#     print("claim_label: ", info['claim_label'])
#     print("evidences: ", info['evidences'])
#     print()
# print("dev_data: ", dev_data)
# print("evidence_data: ", evidence_data)

Number of train_data:  1228
Number of dev_data:  154
Number of evidence_data:  1208827
Number of test_data:  153


# Preprocessing the data

Evidence retrieval

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Retrieve top-k evidence
def retrieve_top_k_evidence(claim, evidence_data, k=5):
    evidence_ids = list(evidence_data.keys())
    evidence_texts = list(evidence_data.values())
    
    vectorizer = TfidfVectorizer(stop_words='english')
    evidence_matrix = vectorizer.fit_transform(evidence_texts)
    claim_vector = vectorizer.transform([claim])
    
    similarities = cosine_similarity(claim_vector, evidence_matrix)
    sorted_indices = similarities.argsort().flatten()[::-1]
    
    top_k_evidence_ids = [evidence_ids[i] for i in sorted_indices[:k]]
    return top_k_evidence_ids


Constant variables

In [6]:
max_length_token = 128 # The maximum token length
batch_size = 2 
model_type = "prajjwal1/bert-mini"
device = torch.device("cpu")

Create PyTorch Dataset and DataLoader for claim_label

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_type)

class convertPytorchDataset(Dataset):
    def __init__(self, claim_data, evidence_data, tokenizer, label_mapping):
        self.claim_data = claim_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer
        self.label_mapping = label_mapping

    def __len__(self):
        return len(self.claim_data)

    def __getitem__(self, idx):
        claim_id, claim_data = list(self.claim_data.items())[idx]
        claim_text = claim_data["claim_text"]
        claim_label = claim_data.get("claim_label", None)

        # Retrieve all evidence for the claim
        evidence_ids = claim_data.get("evidence", [])

        # Tokenize claim and evidence
        tokens = self.tokenizer.encode_plus(claim_text, truncation=True, padding="max_length", max_length=max_length_token)

        # Convert label to a number if available
        if claim_label is not None:
            label = self.label_mapping[claim_label]
        else:
            label = -1

        item = {
            "input_ids": torch.tensor(tokens["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(tokens["attention_mask"], dtype=torch.long),
            "claim_id": claim_id,
            "claim_text": claim_text,
            "labels": torch.tensor(label, dtype=torch.long),
            "evidences": evidence_ids
        }

        return item


# Convert labels to numbers
label_mapping = {
    "SUPPORTS": 0,
    "REFUTES": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3,
}

train_dataset = convertPytorchDataset(train_data, evidence_data, tokenizer, label_mapping)
dev_dataset = convertPytorchDataset(dev_data, evidence_data, tokenizer, label_mapping)

def collate_fn(batch):
    non_empty_batch = [x for x in batch if x is not None]
    if len(non_empty_batch) == 0:
        return None

    max_len = max([x["input_ids"].shape[0] for x in non_empty_batch])
    input_ids = torch.zeros(len(non_empty_batch), max_len, dtype=torch.long)
    attention_mask = torch.zeros(len(non_empty_batch), max_len, dtype=torch.long)
    labels = torch.zeros(len(non_empty_batch), dtype=torch.long)
    claim_ids = []
    claim_texts = []
    evidences = []

    for i, x in enumerate(non_empty_batch):
        cur_len = x["input_ids"].shape[0]
        input_ids[i, :cur_len] = x["input_ids"]
        attention_mask[i, :cur_len] = x["attention_mask"]
        labels[i] = x["labels"]
        claim_ids.append(x["claim_id"])
        claim_texts.append(x["claim_text"])
        evidences.append(x["evidences"])

    return {"input_ids": input_ids, "attention_mask": attention_mask, "claim_id": claim_ids, "claim_text": claim_texts, "labels": labels, "evidences": evidences}



train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


Create PyTorch dataset and dataloader for evidences

In [8]:
import random

class EvidenceRetrievalDataset(Dataset):
    def __init__(self, claim_data, evidence_data, tokenizer, num_negative_samples=2):
        self.claim_data = claim_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer
        self.num_negative_samples = num_negative_samples
        self.data = self.create_retrieval_data()

    def create_retrieval_data(self):
        retrieval_data = []

        for claim_id, claim_info in self.claim_data.items():
            claim_text = claim_info["claim_text"]
            relevant_evidences = set(claim_info.get("evidence", []))

            negative_samples = set(self.evidence_data.keys()) - relevant_evidences
            negative_samples = random.sample(negative_samples, min(self.num_negative_samples, len(negative_samples)))

            for evidence_id in relevant_evidences:
                retrieval_data.append({
                    "claim_text": claim_text,
                    "evidence_text": self.evidence_data[evidence_id],
                    "label": 1
                })

            for evidence_id in negative_samples:
                retrieval_data.append({
                    "claim_text": claim_text,
                    "evidence_text": self.evidence_data[evidence_id],
                    "label": 0
                })

        return retrieval_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        claim_text = example["claim_text"]
        evidence_text = example["evidence_text"]
        label = example["label"]

        tokens = self.tokenizer.encode_plus(claim_text, evidence_text, truncation=True, padding="max_length", max_length=max_length_token)

        item = {
            "input_ids": torch.tensor(tokens["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(tokens["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

        return item

train_evidence_dataset = EvidenceRetrievalDataset(train_data, evidence_data, tokenizer)
dev_evidence_dataset = EvidenceRetrievalDataset(dev_data, evidence_data, tokenizer)

def evidence_collate_fn(batch):
    non_empty_batch = [x for x in batch if x is not None]
    if len(non_empty_batch) == 0:
        return None

    max_len = max([x["input_ids"].shape[0] for x in non_empty_batch])
    input_ids = torch.zeros(len(non_empty_batch), max_len, dtype=torch.long)
    attention_mask = torch.zeros(len(non_empty_batch), max_len, dtype=torch.long)
    labels = torch.zeros(len(non_empty_batch), dtype=torch.long)

    for i, x in enumerate(non_empty_batch):
        cur_len = x["input_ids"].shape[0]
        input_ids[i, :cur_len] = x["input_ids"]
        attention_mask[i, :cur_len] = x["attention_mask"]
        labels[i] = x["labels"]

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_evidence_dataloader = DataLoader(train_evidence_dataset, batch_size=batch_size, shuffle=True, collate_fn=evidence_collate_fn)
dev_evidence_dataloader = DataLoader(dev_evidence_dataset, batch_size=batch_size, shuffle=False, collate_fn=evidence_collate_fn)


# Training the model

Fine-tune hyperparameter

In [None]:
search_space = {
    'learning_rate': [1e-4, 3e-5, 1e-5],
    'batch_size': [8, 16, 32],
    'epochs': [2, 3, 4]
}

def train_and_evaluate(params):
    batch_size = params['batch_size']
    lr = params['learning_rate']
    epochs = params['epochs']

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model = BertForSequenceClassification.from_pretrained(model_type, num_labels=num_labels)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)

    val_accuracy = evaluate(model, dev_dataloader, device)

    return val_accuracy


# import itertools

# best_val_accuracy = 0
# best_params = None

# for lr, batch_size, epochs in itertools.product(search_space['learning_rate'], search_space['batch_size'], search_space['epochs']):
#     params = {'learning_rate': lr, 'batch_size': batch_size, 'epochs': epochs}
#     print(f"Trying parameters: {params}")
#     val_accuracy = train_and_evaluate(params)
#     print(f"Validation Accuracy: {val_accuracy:.4f}")

#     if val_accuracy > best_val_accuracy:
#         best_val_accuracy = val_accuracy
#         best_params = params

# print(f"Best hyperparameters: {best_params}")
# print(f"Best validation accuracy: {best_val_accuracy:.4f}")



In [9]:
from transformers import AutoModelForSequenceClassification

num_labels = len(label_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_type, num_labels=num_labels)

model.to(device)


from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 3
lr = 3e-5

optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# for epoch in range(epochs):
#     print(f"Epoch {epoch+1}/{epochs}")
#     train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
#     print(f"Train Loss: {train_loss:.4f}")


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.2.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'vocab_transform.bias', 'distilbert.transformer.layer.2.ffn.lin2.weight', 'distilbert.transformer.layer.4.attention.out_lin.weight', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.sa_layer_norm.bias', 'distilbert.transformer.layer.2.attention.out_lin.bias', 'distilbert.transformer.layer.1.output_layer_norm.bias', 'vocab_projector.bias', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.1.attention.out_lin.bias', 'distilbert.embeddings.position_embeddings.weight', 'voca

Train label model

In [10]:
from transformers import AutoModelForSequenceClassification

# Training the claim classification model
claim_classification_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
claim_classification_model.to(device)

optimizer = AdamW(claim_classification_model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs} (Claim Classification)")
    train_loss = train_epoch(claim_classification_model, train_dataloader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")
claim_classification_model.to("cpu")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3 (Claim Classification)


Training: 100%|██████████| 614/614 [19:29<00:00,  1.91s/it]


Train Loss: 1.2541
Epoch 2/3 (Claim Classification)


Training: 100%|██████████| 614/614 [18:35<00:00,  1.82s/it]


Train Loss: 1.0685
Epoch 3/3 (Claim Classification)


Training: 100%|██████████| 614/614 [18:22<00:00,  1.80s/it]

Train Loss: 0.6474





BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
import torch

# Clear the GPU cache
torch.cuda.empty_cache()

from tensorflow.keras import backend as K

# Clear the GPU memory
K.clear_session()

Train evidence model

In [14]:
# Training the evidence retrieval model
evidence_retrieval_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
evidence_retrieval_model.to(device)

optimizer = AdamW(evidence_retrieval_model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_evidence_dataloader) * epochs)

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs} (Evidence Retrieval)")
    train_loss = train_epoch(evidence_retrieval_model, train_evidence_dataloader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3 (Evidence Retrieval)


Training: 100%|██████████| 1228/1228 [38:25<00:00,  1.88s/it]


Train Loss: 0.0030
Epoch 2/3 (Evidence Retrieval)


Training: 100%|██████████| 1228/1228 [39:18<00:00,  1.92s/it]


Train Loss: 0.0000
Epoch 3/3 (Evidence Retrieval)


Training:  28%|██▊       | 343/1228 [10:53<29:38,  2.01s/it]

Free GPU memory

In [10]:
import torch

# Clear the GPU cache
torch.cuda.empty_cache()

from tensorflow.keras import backend as K

# Clear the GPU memory
K.clear_session()

# Prediction on dev data

In [12]:
label_mapping_inverse = {v: k for k, v in label_mapping.items()}

def save_predictions(model, dataloader, output_file):
    model.eval()
    predictions = {}

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        claim_ids = batch["claim_id"]
        claim_texts = batch["claim_text"]

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

        for idx, claim_id in enumerate(claim_ids):
            predictions[claim_id] = {
                "claim_text": claim_texts[idx],
                "claim_label": label_mapping_inverse[predicted[idx].item()],
                "evidences": batch["evidences"][idx]
            }

    with open(output_file, "w") as outfile:
        json.dump(predictions, outfile, indent=2)

save_predictions(model, dev_dataloader, "dev_claims_out.json")



# Prediction on test data

In [13]:
# Prediction function for both dev and test data
def save_predictions(model, dataloader, output_file, has_labels=True):
    model.eval()
    predictions = {}

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        claim_ids = batch["claim_id"]
        claim_texts = batch["claim_text"]

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

        for idx, claim_id in enumerate(claim_ids):
            predictions[claim_id] = {
                "claim_text": claim_texts[idx],
                "claim_label": label_mapping_inverse[predicted[idx].item()],
                "evidences": batch["evidences"][idx]
            }

    json.dump(predictions, open(output_file, "w"))

# Save predictions on dev data
save_predictions(model, dev_dataloader, "dev_claims_out.json")

# Save predictions on test data
test_dataset = convertPytorchDataset(test_data, evidence_data, tokenizer, label_mapping)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
save_predictions(model, test_dataloader, "test_claims_out.json", has_labels=False)

New prediction and save

In [None]:
def save_predictions(claim_classification_model, evidence_retrieval_model, dataloader, output_file, evidence_data, has_labels=True):
    claim_classification_model.eval()
    evidence_retrieval_model.eval()
    predictions = {}

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        claim_ids = batch["claim_id"]
        claim_texts = batch["claim_text"]

        with torch.no_grad():
            outputs = claim_classification_model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

        for idx, claim_id in enumerate(claim_ids):
            claim_text = claim_texts[idx]
            evidence_scores = []
            for evidence_id, evidence_text in evidence_data.items():
                tokens = tokenizer.encode_plus(claim_text, evidence_text, truncation=True, padding="max_length", max_length=max_length_token)
                input_ids = torch.tensor([tokens["input_ids"]], dtype=torch.long).to(device)
                attention_mask = torch.tensor([tokens["attention_mask"]], dtype=torch.long).to(device)

                with torch.no_grad():
                    outputs = evidence_retrieval_model(input_ids, attention_mask=attention_mask)
                    _, predicted_evidence = torch.max(outputs.logits, 1)

                if predicted_evidence.item() == 1:
                    evidence_scores.append((evidence_id, outputs.logits[0, 1].item()))

            evidence_scores.sort(key=lambda x: x[1], reverse=True)
            top_evidence = [evidence_id for evidence_id, _ in evidence_scores[:5]]

            predictions[claim_id] = {
                "claim_text": claim_text,
                "claim_label": label_mapping_inverse[predicted[idx].item()],
                "evidences": top_evidence
            }

    with open(output_file, "w") as outfile:
        json.dump(predictions, outfile, indent=2)


# Save predictions on dev data
save_predictions(claim_classification_model, evidence_retrieval_model, dev_dataloader, "dev_claims_out.json", evidence_data)

# Save predictions on test data
test_dataset = convertPytorchDataset(test_data, evidence_data, tokenizer, label_mapping)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
save_predictions(claim_classification_model, evidence_retrieval_model, test_dataloader, "test_claims_out.json", evidence_data, has_labels=False)

               
