# Project: Automated Fact Checking For Climate Science Claims

Student Name: Hongda Zhu

Student ID: 1259524

In [11]:
import sys
print(sys.version)

3.8.16 (default, Mar  1 2023, 21:19:10) 
[Clang 14.0.6 ]


# Load datasets

In [12]:
import json

def load_data(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    return data

train_data = load_data("train-claims.json")
dev_data = load_data("dev-claims.json")
evidence_data = load_data("evidence.json")

# print("train_data: ", train_data)
# print("dev_data: ", dev_data)
# print("evidence_data: ", evidence_data)
# for id, info in train_data.items():
#     print("claim_id: ", id)
#     print("claim_text: ", info['claim_text'])
#     print("claim_label: ", info['claim_label'])
#     print("evidences: ", info['evidences'])
#     print()
# print("dev_data: ", dev_data)
# print("evidence_data: ", evidence_data)

# Preprocessing the data

Tokenize the data

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Replace claims evidence id with actual evidence content in evidence data
# Tokenize them and combine together as the preprocessed data
def preprocess_data(claim_data, evidence_data, tokenizer):
    preprocessed_data = {}
    for claim_id, claim_info in claim_data.items():
        claim_text = claim_info["claim_text"]
        evidence_ids = claim_info["evidences"]
        claim_label = claim_info["claim_label"]

        # Tokenize claim data
        claim_tokens = tokenizer.tokenize(claim_text)

        # Tokenize evidence data
        evidence_tokens = []
        for evidence_id in evidence_ids:
            evidence_text = evidence_data[evidence_id]
            tokens = tokenizer.tokenize(evidence_text)
            evidence_tokens.append(tokens)

        # Combine and store preprocessed data
        preprocessed_data[claim_id] = {
            "claim_tokens": claim_tokens,
            "evidence_tokens": evidence_tokens,
            "claim_label": claim_label,
        }
    return preprocessed_data

# Preprocess the train and dev data
preprocessed_train_data = preprocess_data(train_data, evidence_data, tokenizer)
preprocessed_dev_data = preprocess_data(dev_data, evidence_data, tokenizer)

# print("preprocessed_train_data: ", preprocessed_train_data)
# print("preprocessed_dev_data: ", preprocessed_dev_data)


Create PyTorch Dataset and DataLoader for preprocessed data

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader

class convertPytorchDataset(Dataset):
    def __init__(self, claim_data, evidence_data, tokenizer, label_mapping):
        self.claim_data = claim_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer
        self.label_mapping = label_mapping

    def __len__(self):
        return len(self.claim_data)

    def __getitem__(self, idx):
        claim_id, claim_info = list(self.claim_data.items())[idx]
        claim_text = claim_info["claim_text"]
        evidence_texts = [self.evidence_data[evidence_id] for evidence_id in claim_info["evidences"]]
        all_evidence_text = ' '.join(evidence_texts)

        # Tokenize the data, make them in the same length and not too long, and return as pytorch tensor
        tokenized_data = self.tokenizer(claim_text, all_evidence_text, padding=True, truncation=True, return_tensors="pt")

        # Convert labels to numbers
        tokenized_data["labels"] = torch.tensor(self.label_mapping[claim_info["claim_label"]])

        return tokenized_data

label_mapping = {
    "SUPPORTS": 0,
    "REFUTES": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3,
}

train_dataset = convertPytorchDataset(train_data, evidence_data, tokenizer, label_mapping)
dev_dataset = convertPytorchDataset(dev_data, evidence_data, tokenizer, label_mapping)


def collate_fn(batch):
    max_len = max([x["input_ids"].shape[1] for x in batch])
    input_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    attention_mask = torch.zeros(len(batch), max_len, dtype=torch.long)
    labels = torch.zeros(len(batch), dtype=torch.long)
    
    for i, x in enumerate(batch):
        cur_len = x["input_ids"].shape[1]
        input_ids[i, :cur_len] = x["input_ids"]
        attention_mask[i, :cur_len] = x["attention_mask"]
        labels[i] = x["labels"]
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)


Printing samples from the train_dataset:
Sample 1:
  input_ids:  tensor([[  101,  2025,  2069,  2003,  2045,  2053,  4045,  3350,  2008,  2522,
          2475,  2003,  1037,  8554, 13210,  3372,  1010,  3020,  2522,  2475,
         14061,  2941,  2393, 20440,  2490,  2062,  3269,  1998,  4111,  2166,
          1012,   102,  2012,  2200,  2152, 14061,  1006,  2531,  2335, 12483,
          6693,  1010,  2030,  3618,  1007,  1010,  6351, 14384,  2064,  2022,
         11704,  2000,  4111,  2166,  1010,  2061,  6274,  1996,  6693,  2000,
          2184,  1010,  2199,  4903,  2213,  1006,  1015,  1003,  1007,  2030,
          3020,  2005,  2195,  2847,  2097, 11027, 20739,  2015,  2107,  2004,
          2317, 24019,  1998,  6804, 10210,  2229,  1999,  1037, 16635,  1012,
           102]])
  attention_mask:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1

# Training the model

In [28]:
from transformers import BertForSequenceClassification

num_labels = len(label_mapping)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# If gpu available, use gpu.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 3
lr = 3e-5

optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3


Training:   4%|▍         | 6/154 [00:26<10:51,  4.40s/it]


KeyboardInterrupt: 

# Evaluation

In [20]:
from sklearn.metrics import accuracy_score

def evaluate(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    total_accuracy = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            _, predictions = torch.max(outputs.logits, dim=1)

            total_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            num_batches += 1

    return total_accuracy / num_batches

# Evaluate the model on the validation set
val_accuracy = evaluate(model, dev_dataloader, device)
print(f"Validation Accuracy: {val_accuracy:.4f}")


Validation Accuracy: 0.4938
