# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [2]:
import nltk
import pandas as pd
import numpy as np
import json
import ijson
import torch
from torch import nn


In [3]:
# 3min to process
pure_evidence_data = pd.read_json("data/evidence.json", lines=True)

In [4]:
pure_dev_data_set = pd.read_json("data/dev-claims.json")
pure_train_data_set = pd.read_json("data/train-claims.json")
pure_test_data_set = pd.read_json("data/test-claims-unlabelled.json")

In [5]:
def structurelise_data_set(data_set):
    structured_data_set = []
    for claim_id, claim_info in data_set.items():
        claim_text = claim_info['claim_text']
        claim_label = ''
        if 'claim_label' in claim_info:
            claim_label = claim_info['claim_label']
        evidence_ids = []
        if 'evidences' in claim_info:
            evidence_ids = claim_info['evidences']
        evidence_texts = []
        for evidence_id in evidence_ids:
            evidence_texts.append(pure_evidence_data[evidence_id][0])
        structured_data_set.append({
            'claim_id' : claim_id,
            'claim_text': claim_text,
            'evidence_texts': evidence_texts,
            'label': claim_label
        })
    return structured_data_set

structured_dev_data_set = structurelise_data_set(pure_dev_data_set)
structured_train_data_set = structurelise_data_set(pure_train_data_set)
structured_test_data_set = structurelise_data_set(pure_test_data_set)
print(structured_dev_data_set[0:5])
print(structured_train_data_set[0:5])
print(structured_test_data_set[0:5])
    

[{'claim_id': 'claim-752', 'claim_text': '[South Australia] has the most expensive electricity in the world.', 'evidence_texts': ['[citation needed] South Australia has the highest retail price for electricity in the country.', '"South Australia has the highest power prices in the world".'], 'label': 'SUPPORTS'}, {'claim_id': 'claim-375', 'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod\xaduces 1.3 per cent of this 3 per cent, then no amount of emissions reductio\xadn here will have any effect on global climate.', 'evidence_texts': ['The 2011 UNEP Green Economy report states that "[a]agricultural operations, excluding land use changes, produce approximately 13 per cent of anthropogenic global GHG emissions.', 'With a market share of 30% and (potentially) clean electricity, heat pumps could reduce global CO 2 emissions by 8% annually.', 'In the modern era, emissions to the atmosphere from volcanoes are approximately 0.6

In [6]:
import re
def preprocess_text(text):
    if text:
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_data_set(data_set):
    for item in data_set:
        item['claim_text'] = preprocess_text(item['claim_text'])
        item['evidence_texts'] = [preprocess_text(evidence) for evidence in item['evidence_texts']]
    return data_set

processed_dev_data_set = preprocess_data_set(structured_dev_data_set)
processed_train_data_set = preprocess_data_set(structured_train_data_set)
processed_test_data_set = preprocess_data_set(structured_test_data_set)
print(processed_dev_data_set[0:5])
print(processed_train_data_set[0:5])
print(processed_test_data_set[0:5])

[{'claim_id': 'claim-752', 'claim_text': 'south australia has the most expensive electricity in the world', 'evidence_texts': ['citation needed south australia has the highest retail price for electricity in the country', 'south australia has the highest power prices in the world'], 'label': 'SUPPORTS'}, {'claim_id': 'claim-375', 'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and australia produces 13 per cent of this 3 per cent then no amount of emissions reduction here will have any effect on global climate', 'evidence_texts': ['the 2011 unep green economy report states that aagricultural operations excluding land use changes produce approximately 13 per cent of anthropogenic global ghg emissions', 'with a market share of 30 and potentially clean electricity heat pumps could reduce global co 2 emissions by 8 annually', 'in the modern era emissions to the atmosphere from volcanoes are approximately 0645 billion tonnes of co 2 per year

In [7]:
structured_evidence_data = []
for evidence_id, evidence_text  in pure_evidence_data.items():
    structured_evidence_data.append({
        'evidence_id' : evidence_id,
        'evidence_text': evidence_text[0],
    })
print(structured_evidence_data[0:5])
for item in structured_evidence_data:
    item['evidence_text'] = preprocess_text(str(item['evidence_text']))
print(structured_evidence_data[0:5])

[{'evidence_id': 'evidence-0', 'evidence_text': 'John Bennet Lawes, English entrepreneur and agricultural scientist'}, {'evidence_id': 'evidence-1', 'evidence_text': 'Lindberg began his professional career at the age of 16, eventually moving to New York City in 1977.'}, {'evidence_id': 'evidence-2', 'evidence_text': "``Boston (Ladies of Cambridge)'' by Vampire Weekend"}, {'evidence_id': 'evidence-3', 'evidence_text': 'Gerald Francis Goyer (born October 20, 1936) was a professional ice hockey player who played 40 games in the National Hockey League.'}, {'evidence_id': 'evidence-4', 'evidence_text': 'He detected abnormalities of oxytocinergic function in schizoaffective mania, post-partum psychosis and how ECT modified oxytocin release.'}]
[{'evidence_id': 'evidence-0', 'evidence_text': 'john bennet lawes english entrepreneur and agricultural scientist'}, {'evidence_id': 'evidence-1', 'evidence_text': 'lindberg began his professional career at the age of 16 eventually moving to new york 

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def text_preprocessing(text):
    words = text.lower().split()
    new_words = []
    for w in words:
        w = lemmatize(w)
        if w not in stopwords:
            new_words.append(w)

    processed_text = " ".join(new_words)
    return processed_text

label_dict = {'SUPPORTS': 0,
    'REFUTES': 1,
    'NOT_ENOUGH_INFO': 2,
    'DISPUTED': 3
}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
from sklearn.utils import resample
train_df = pd.DataFrame(processed_train_data_set)
dev_df = pd.DataFrame(processed_dev_data_set)

train_df['label'] = train_df['label'].map(label_dict)
dev_df['label'] = dev_df['label'].map(label_dict)

train_df['evidence_texts'] = train_df['evidence_texts'].apply(lambda texts: " ".join(texts))
dev_df['evidence_texts'] = dev_df['evidence_texts'].apply(lambda texts: " ".join(texts))

train_df['claim_text'] = train_df['claim_text'] + " " + train_df['evidence_texts']
dev_df['claim_text'] = dev_df['claim_text'] + " " + dev_df['evidence_texts']

train_df = train_df.drop('evidence_texts', axis=1)
dev_df = dev_df.drop('evidence_texts', axis=1)

train_df['claim_text'] = train_df['claim_text'].apply(text_preprocessing)
dev_df['claim_text'] = dev_df['claim_text'].apply(text_preprocessing)

print(train_df.shape)
print(dev_df.shape)

(1228, 3)
(154, 3)


In [181]:
df_majority = train_df[train_df.label != 3]
df_minority = train_df[train_df.label == 3]
print(df_majority.shape)
print(df_minority.shape)
df_majority_sampled = resample(df_majority, 
                                 replace=True,     # 允许重采样
                                 n_samples=3 * len(df_minority),    # 使少数类样本数等于多数类
                                 random_state=42) # 固定随机种子
df_upsampled = pd.concat([df_minority, df_majority_sampled])
print(df_upsampled.shape)
label_counts = df_upsampled.groupby('label').size()
print(label_counts)

(1104, 3)
(124, 3)
(496, 3)


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_df['claim_text']).toarray()
X_dev_tfidf = vectorizer.transform(dev_df['claim_text']).toarray()
y_train = train_df['label']
y_dev = dev_df['label']
print(X_train_tfidf.shape)
print(X_dev_tfidf.shape)

(1228, 1000)
(154, 1000)


In [32]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_tfidf, y_train)
y_predict = dummy_clf.predict(X_dev_tfidf)
print(f'Accuracy: {accuracy_score(y_dev, y_predict):.4f}')
f1 = f1_score(y_dev, y_predict, average='macro')
print(f"F1-score: {f1:.4f}")


Accuracy: 0.4416
F1-score: 0.1532


In [27]:
import torch.optim as optim

class GRU(nn.Module):
    def __init__(self):
        super(GRU, self).__init__()
        self.gru = nn.GRU(input_size=1000, hidden_size=50, num_layers=1, batch_first=True)
        self.fc = nn.Linear(50, 4)

    def forward(self, x):
        x, _ = self.gru(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

model = GRU()

In [28]:

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, X_train, y_train, epochs=10):
    model.train()
    for epoch in range(epochs):
        inputs = torch.tensor(X_train).float().unsqueeze(1)
        labels = torch.tensor(y_train).long()
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

train_model(model, X_train_tfidf, y_train.values)


Epoch 1, Loss: 1.4006847143173218
Epoch 2, Loss: 1.3956753015518188
Epoch 3, Loss: 1.3907018899917603
Epoch 4, Loss: 1.3857567310333252
Epoch 5, Loss: 1.3808352947235107
Epoch 6, Loss: 1.3759336471557617
Epoch 7, Loss: 1.371047019958496
Epoch 8, Loss: 1.366171956062317
Epoch 9, Loss: 1.3613041639328003
Epoch 10, Loss: 1.3564410209655762


In [63]:


class GRU_Cell(nn.Module):

    def __init__(self,in_dim,hidden_dim):
        super(GRU_Cell,self).__init__()
        # reset gate
        self.rx = nn.Linear(in_dim,hidden_dim)
        self.rh = nn.Linear(hidden_dim,hidden_dim)

        self.zx = nn.Linear(in_dim,hidden_dim)
        self.zh = nn.Linear(hidden_dim,hidden_dim)

        self.hx = nn.Linear(in_dim,hidden_dim)
        self.hh = nn.Linear(hidden_dim,hidden_dim)

    def forward(self,x,h_1):
        r = torch.sigmoid(self.rx(x)+self.rh(h_1))
        z = torch.sigmoid(self.zx(x)+self.zh(h_1))
        h_ = torch.tanh(self.hx(x)+self.hh(r*h_1))
        h = z*h_1+(1 - z)*h_
        return h

class GRU(nn.Module):

    def __init__(self,in_dim,hidden_dim):
        super(GRU,self).__init__()
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.gru_cell = GRU_Cell(in_dim,hidden_dim)

    def forward(self,x):
        outs = []
        h = None
        for seq_x in x:
            if h is None:
                h = torch.randn(x.shape[1],self.hidden_dim)
            h = self.gru_cell.forward(seq_x,h)
            outs.append(torch.unsqueeze(h,0))
        outs = torch.cat(outs)
        return outs

if __name__ == '__main__':
    x = torch.randn(24, 12)
    h = torch.randn(24,6)
    rc = GRU_Cell(12,6)
    h = rc(x,h)
    print(h.shape)

    gru = GRU(12,6)
    x = torch.randn(7,24,12)
    outs,h = gru(x)
    print(outs.shape)
    print(h.shape)


torch.Size([24, 6])


ValueError: too many values to unpack (expected 2)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
def evaluate_model(model, X_test, y_test):
    model.eval()
    inputs = torch.tensor(X_test).float().unsqueeze(1)
    labels = torch.tensor(y_test).long()
    with torch.no_grad():
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == labels).sum().item()
        print(f'Accuracy: {correct / len(labels):.4f}')
        f1 = f1_score(labels.cpu(), predicted.cpu(), average='macro')
        print(f"F1-score: {f1:.4f}")


evaluate_model(model, X_dev_tfidf, y_dev.values)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*