# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import nltk
import pandas as pd
import numpy as np
import json
import ijson
import torch
from torch import nn


In [2]:
# 3min to process
pure_evidence_data = pd.read_json("data/evidence.json", lines=True)

In [61]:
pure_dev_data_set = pd.read_json("data/dev-claims.json")
pure_train_data_set = pd.read_json("data/train-claims.json")
pure_test_data_set = pd.read_json("data/test-claims-unlabelled.json")

In [62]:
def structurelise_data_set(data_set):
    structured_data_set = []
    for claim_id, claim_info in data_set.items():
        claim_text = claim_info['claim_text']
        claim_label = ''
        if 'claim_label' in claim_info:
            claim_label = claim_info['claim_label']
        evidence_ids = []
        if 'evidences' in claim_info:
            evidence_ids = claim_info['evidences']
        evidence_texts = []
        for evidence_id in evidence_ids:
            evidence_texts.append(pure_evidence_data[evidence_id][0])
        structured_data_set.append({
            'claim_id' : claim_id,
            'claim_text': claim_text,
            'evidence_texts': evidence_texts,
            'label': claim_label
        })
    return structured_data_set

structured_dev_data_set = structurelise_data_set(pure_dev_data_set)
structured_train_data_set = structurelise_data_set(pure_train_data_set)
structured_test_data_set = structurelise_data_set(pure_test_data_set)
print(structured_dev_data_set[0:5])
print(structured_train_data_set[0:5])
print(structured_test_data_set[0:5])
    

[{'claim_id': 'claim-752', 'claim_text': '[South Australia] has the most expensive electricity in the world.', 'evidence_texts': ['[citation needed] South Australia has the highest retail price for electricity in the country.', '"South Australia has the highest power prices in the world".'], 'label': 'SUPPORTS'}, {'claim_id': 'claim-375', 'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod\xaduces 1.3 per cent of this 3 per cent, then no amount of emissions reductio\xadn here will have any effect on global climate.', 'evidence_texts': ['The 2011 UNEP Green Economy report states that "[a]agricultural operations, excluding land use changes, produce approximately 13 per cent of anthropogenic global GHG emissions.', 'With a market share of 30% and (potentially) clean electricity, heat pumps could reduce global CO 2 emissions by 8% annually.', 'In the modern era, emissions to the atmosphere from volcanoes are approximately 0.6

In [63]:
import re
def preprocess_text(text):
    if text:
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_data_set(data_set):
    for item in data_set:
        item['claim_text'] = preprocess_text(item['claim_text'])
        item['evidence_texts'] = [preprocess_text(evidence) for evidence in item['evidence_texts']]
    return data_set

processed_dev_data_set = preprocess_data_set(structured_dev_data_set)
processed_train_data_set = preprocess_data_set(structured_train_data_set)
processed_test_data_set = preprocess_data_set(structured_test_data_set)
print(processed_dev_data_set[0:5])
print(processed_train_data_set[0:5])
print(processed_test_data_set[0:5])

[{'claim_id': 'claim-752', 'claim_text': 'south australia has the most expensive electricity in the world', 'evidence_texts': ['citation needed south australia has the highest retail price for electricity in the country', 'south australia has the highest power prices in the world'], 'label': 'SUPPORTS'}, {'claim_id': 'claim-375', 'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and australia produces 13 per cent of this 3 per cent then no amount of emissions reduction here will have any effect on global climate', 'evidence_texts': ['the 2011 unep green economy report states that aagricultural operations excluding land use changes produce approximately 13 per cent of anthropogenic global ghg emissions', 'with a market share of 30 and potentially clean electricity heat pumps could reduce global co 2 emissions by 8 annually', 'in the modern era emissions to the atmosphere from volcanoes are approximately 0645 billion tonnes of co 2 per year

In [64]:
structured_evidence_data = []
for evidence_id, evidence_text  in pure_evidence_data.items():
    structured_evidence_data.append({
        'evidence_id' : evidence_id,
        'evidence_text': evidence_text[0],
    })
print(structured_evidence_data[0:5])
for item in structured_evidence_data:
    item['evidence_text'] = preprocess_text(str(item['evidence_text']))
print(structured_evidence_data[0:5])

[{'evidence_id': 'evidence-0', 'evidence_text': 'John Bennet Lawes, English entrepreneur and agricultural scientist'}, {'evidence_id': 'evidence-1', 'evidence_text': 'Lindberg began his professional career at the age of 16, eventually moving to New York City in 1977.'}, {'evidence_id': 'evidence-2', 'evidence_text': "``Boston (Ladies of Cambridge)'' by Vampire Weekend"}, {'evidence_id': 'evidence-3', 'evidence_text': 'Gerald Francis Goyer (born October 20, 1936) was a professional ice hockey player who played 40 games in the National Hockey League.'}, {'evidence_id': 'evidence-4', 'evidence_text': 'He detected abnormalities of oxytocinergic function in schizoaffective mania, post-partum psychosis and how ECT modified oxytocin release.'}]
[{'evidence_id': 'evidence-0', 'evidence_text': 'john bennet lawes english entrepreneur and agricultural scientist'}, {'evidence_id': 'evidence-1', 'evidence_text': 'lindberg began his professional career at the age of 16 eventually moving to new york 

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [65]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def text_preprocessing(text):
    words = text.lower().split()
    new_words = []
    for w in words:
        w = lemmatize(w)
        if w not in stopwords:
            new_words.append(w)

    processed_text = " ".join(new_words)
    return processed_text

# map label from string to number
label_dict = {'SUPPORTS': 0,
    'REFUTES': 1,
    'NOT_ENOUGH_INFO': 2,
    'DISPUTED': 3
}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Combind claim and envidence text

In [132]:
from sklearn.utils import resample
train_df = pd.DataFrame(processed_train_data_set)
dev_df = pd.DataFrame(processed_dev_data_set)

# change label from string to number
train_df['label'] = train_df['label'].map(label_dict)
dev_df['label'] = dev_df['label'].map(label_dict)

# combine all envidence texts
train_df['evidence_texts'] = train_df['evidence_texts'].apply(lambda texts: " ".join(texts))
dev_df['evidence_texts'] = dev_df['evidence_texts'].apply(lambda texts: " ".join(texts))

# preprocessing
train_df['claim_text'] = train_df['claim_text'].apply(text_preprocessing)
train_df['evidence_texts'] = train_df['evidence_texts'].apply(text_preprocessing)
dev_df['claim_text'] = dev_df['claim_text'].apply(text_preprocessing)
dev_df['evidence_texts'] = dev_df['evidence_texts'].apply(text_preprocessing)

print(train_df.head(1))
print(train_df.shape)
print(dev_df.shape)

     claim_id                                         claim_text  \
0  claim-1937  scientific evidence co2 pollutant higher co2 c...   

                                      evidence_texts  label  
0  high concentration 100 time atmospheric concen...      3  
(1228, 4)
(154, 4)


Resample to balance the dev data

In [99]:
print(dev_df['label'].value_counts())
# get data count by labels
df_SUPPORTS = dev_df[dev_df.label == 0]
df_REFUTES = dev_df[dev_df.label == 1]
df_NOT_ENOUGH_INFO = dev_df[dev_df.label == 2]
df_DISPUTED = dev_df[dev_df.label == 3]

# resample
df_SUPPORTS_sampled = resample(df_SUPPORTS, replace=True, n_samples=len(df_DISPUTED), random_state=1)
df_REFUTES_sampled = resample(df_REFUTES, replace=True, n_samples=len(df_DISPUTED), random_state=2)
df_NOT_ENOUGH_INFO_sampled = resample(df_NOT_ENOUGH_INFO, replace=True, n_samples=len(df_DISPUTED), random_state=3)

# concat them
df_sampled = pd.concat([df_SUPPORTS_sampled, df_REFUTES_sampled, df_NOT_ENOUGH_INFO_sampled, df_DISPUTED])
print(df_sampled['label'].value_counts())

label
0    68
2    41
1    27
3    18
Name: count, dtype: int64
label
0    18
1    18
2    18
3    18
Name: count, dtype: int64


Splite data

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer feature and get top 1000
vectorizer1 = TfidfVectorizer(max_features=1000)
vectorizer2 = TfidfVectorizer(max_features=1000)

# splite data into X_train, y_train, X_dev and y_dev
X_train_tfidf_claim = pd.DataFrame(vectorizer1.fit_transform(train_df['claim_text']).toarray(), columns=vectorizer1.get_feature_names_out())
X_train_tfidf_evidence = pd.DataFrame(vectorizer2.fit_transform(train_df['evidence_texts']).toarray(), columns=vectorizer2.get_feature_names_out())
X_train_tfidf = pd.concat([X_train_tfidf_claim, X_train_tfidf_evidence], axis=1)

X_dev_tfidf_claim = pd.DataFrame(vectorizer1.transform(dev_df['claim_text']).toarray(), columns=vectorizer1.get_feature_names_out())
X_dev_tfidf_evidence = pd.DataFrame(vectorizer2.transform(dev_df['evidence_texts']).toarray(), columns=vectorizer2.get_feature_names_out())
X_dev_tfidf = pd.concat([X_dev_tfidf_claim, X_dev_tfidf_evidence], axis=1)

y_train = train_df['label']
y_dev = dev_df['label']
print(X_train_tfidf.shape)
print(X_dev_tfidf.shape)

(1228, 2000)
(154, 2000)


Baseline model: Zero-rule

In [134]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_tfidf, y_train)
y_predict = dummy_clf.predict(X_dev_tfidf)
print(f'Accuracy: {accuracy_score(y_dev, y_predict):.4f}')
f1 = f1_score(y_dev, y_predict, average='macro')
print(f"F1-score: {f1:.4f}")


Accuracy: 0.4416
F1-score: 0.1532


In [135]:
X_train_tfidf = X_train_tfidf.values.reshape(1228, 2, 1000)
X_train_tfidf = torch.tensor(X_train_tfidf, dtype=torch.float32)

X_dev_tfidf = X_dev_tfidf.values.reshape(154, 2, 1000)
X_dev_tfidf = torch.tensor(X_dev_tfidf, dtype=torch.float32)

y_train = torch.tensor(y_train.values, dtype=torch.int64)
y_dev = torch.tensor(y_dev.values, dtype=torch.int64)

print(X_train_tfidf.shape)
print(X_dev_tfidf.shape)

torch.Size([1228, 2, 1000])
torch.Size([154, 2, 1000])


In [136]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out

input_size = 1000
hidden_size = 128
num_layers = 1
output_size = 4

# 
model = GRUModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


for epoch in range(num_epochs):

    model.train()
    optimizer.zero_grad()
    
    outputs = model(X_train_tfidf)
    loss = criterion(outputs, y_train)
    
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')



Epoch [1/5], Loss: 1.3925
Epoch [2/5], Loss: 1.3062
Epoch [3/5], Loss: 1.2390
Epoch [4/5], Loss: 1.2089
Epoch [5/5], Loss: 1.1904


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [139]:
from sklearn.metrics import accuracy_score, classification_report
model.eval()

with torch.no_grad():

    outputs = model(X_dev_tfidf)
    _, predicted = torch.max(outputs.data, 1)
    
    predictions = predicted.numpy()
    labels = y_dev.numpy()


accuracy = accuracy_score(labels, predictions)
print(f'Accuracy: {accuracy:.4f}')

class_report = classification_report(labels, predictions,zero_division=0)
print(class_report)

Accuracy: 0.4286
              precision    recall  f1-score   support

           0       0.44      0.91      0.59        68
           1       0.00      0.00      0.00        27
           2       0.31      0.10      0.15        41
           3       0.00      0.00      0.00        18

    accuracy                           0.43       154
   macro avg       0.19      0.25      0.19       154
weighted avg       0.28      0.43      0.30       154



## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*