# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement your program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import re
import nltk
import pandas as pd
import numpy as np
import json
import ijson
import keras

In [2]:
#Pre-processing 

from nltk.corpus import stopwords

# Download the WordNet data and create the lemmatizer used by `lemmatize` below.
nltk.download('wordnet')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
# Download the stop-word lists and keep the English ones as a set.
# NOTE(review): this rebinds the name `stopwords` from the imported corpus
# reader to a plain set, so `stopwords.words(...)` is no longer reachable
# under this name after the assignment.
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def lemmatize(word):
    """Return the lemma of *word*, trying the verb reading before the noun one.

    The word is first lemmatized as a verb ('v'). If that changes the word,
    the verb lemma is returned; otherwise the noun ('n') lemma is returned.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')

def preprocess_text(text):
    """Normalize *text* into a space-separated string of lemmas.

    Falsy input (empty string, None) is returned unchanged. Otherwise the
    text is lower-cased, every character that is not a letter, digit or
    whitespace is removed, whitespace runs are collapsed, each token is
    lemmatized, and lemmas found in `stopwords` are dropped.
    """
    if not text:
        return text

    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # The stop-word test is applied to the lemma, not to the surface form.
    lemmas = (lemmatize(token) for token in cleaned.split())
    return " ".join(lemma for lemma in lemmas if lemma not in stopwords)

def text_preprocessing(data):
    """Apply `preprocess_text` to a string, a claim dict, or a list of either.

    * str  -> the preprocessed string.
    * dict -> the same dict, updated in place: 'claim_text' is preprocessed
      and 'evidence_texts' is replaced by a list of preprocessed strings.
    * list -> a new list with every item processed recursively.

    Raises:
        ValueError: if *data* is none of the above types.
    """
    if isinstance(data, str):
        return preprocess_text(data)

    if isinstance(data, list):
        return [text_preprocessing(entry) for entry in data]

    if not isinstance(data, dict):
        raise ValueError("Unsupported data type")

    # The caller's dict is modified, not copied.
    data['claim_text'] = preprocess_text(data['claim_text'])
    data['evidence_texts'] = [preprocess_text(passage) for passage in data['evidence_texts']]
    return data



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kaiya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kaiya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import json

# Read one JSON document from disk
def load_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Text preprocessing placeholder: returns its argument unchanged.
# NOTE(review): this reuses the name of the lemmatizing `preprocess_text`
# defined in the earlier pre-processing cell. Once this cell has run, every
# call to `preprocess_text` resolves to this identity version, so claims and
# evidence are passed on unprocessed. Confirm that this is intended.
def preprocess_text(text):
    return text  # Implement your actual preprocessing here

# Load data: the three claim files plus the evidence collection.
# NOTE(review): these are absolute paths on one specific Windows machine and
# must be adjusted before the notebook is run anywhere else.
train_claims = load_json("C:/Users/Kaiya/Desktop/COMP90042_2024-main/data/train-claims.json")
dev_claims = load_json("C:/Users/Kaiya/Desktop/COMP90042_2024-main/data/dev-claims.json")
test_claims = load_json("C:/Users/Kaiya/Desktop/COMP90042_2024-main/data/test-claims-unlabelled.json")
evidence_dict = load_json("C:/Users/Kaiya/Desktop/COMP90042_2024-main/data/evidence.json")

# Process evidence: three structures aligned with the iteration order of
# `evidence_dict` -- the ids, the preprocessed texts, and an id -> position map.
evidence_ids = list(evidence_dict)
evidence_texts = [preprocess_text(passage) for passage in evidence_dict.values()]
evidence_index = {passage_id: position for position, passage_id in enumerate(evidence_ids)}

# Helper that flattens a claims dict into parallel lists
def process_claims(claims, include_evidences=True):
    """Split *claims* into aligned lists of ids, texts, labels and evidence indices.

    Parameters:
    claims (dict): Maps claim id to a record with "claim_text" and, optionally,
        "claim_label" and "evidences".
    include_evidences (bool): When False, every claim gets an empty evidence list.

    Returns:
    tuple: (ids, texts, labels, evidences). A label is None when the record has
    no "claim_label". Evidence ids are translated to positions through
    `evidence_index`; a claim without "evidences" gets an empty list.
    """
    ids, texts, labels, evidences = [], [], [], []
    for claim_id, record in claims.items():
        ids.append(claim_id)
        texts.append(preprocess_text(record["claim_text"]))
        labels.append(record.get("claim_label"))  # None for test claims

        has_gold = include_evidences and "evidences" in record
        evidences.append(
            [evidence_index[e_id] for e_id in record["evidences"]] if has_gold else []
        )
    return ids, texts, labels, evidences

# Process datasets: each call yields (ids, texts, labels, evidence indices).
train_ids, train_texts, train_labels, train_evidences = process_claims(train_claims)
dev_ids, dev_texts, dev_labels, dev_evidences = process_claims(dev_claims)
# For the test split the labels and evidence lists are discarded.
test_ids, test_texts, _, _ = process_claims(test_claims, include_evidences=False)  # No labels or evidences for test data


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer with max features set to 50,000
vectorizer = TfidfVectorizer(max_features=50000, min_df=1)

# Fit the vectorizer on the combined corpus of evidence texts, train texts, and test texts
# NOTE(review): dev_texts is not part of the fitting corpus while test_texts
# is -- confirm that this asymmetry is intended.
combined_corpus = evidence_texts + train_texts + test_texts
vectorizer.fit(combined_corpus)

# Transform the train, dev, test, and evidence texts into TF-IDF features
train_tfidf = vectorizer.transform(train_texts)
dev_tfidf = vectorizer.transform(dev_texts)
test_tfidf = vectorizer.transform(test_texts)
evidence_tfidf = vectorizer.transform(evidence_texts)


# Print the shape of the train TF-IDF matrix
# This shows the number of documents in the training set and the number of features (terms)
print(train_tfidf.shape)

# Print the shape of the evidence TF-IDF matrix
# This shows the number of evidence documents and the number of features (terms)
print(evidence_tfidf.shape)


(1228, 50000)
(1208827, 50000)


In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between train texts and evidence texts
# This computes the cosine similarity for each pair of train and evidence documents
# NOTE(review): with the shapes printed above this is a 1228 x 1208827 matrix
# (about 1.5 billion scores). If the result is returned as a dense array it
# needs many gigabytes of RAM -- confirm it fits on the target machine.
train_cos_sims = cosine_similarity(train_tfidf, evidence_tfidf)

# Calculate cosine similarity between dev texts and evidence texts
# This computes the cosine similarity for each pair of dev and evidence documents
dev_cos_sims = cosine_similarity(dev_tfidf, evidence_tfidf)

# Calculate cosine similarity between test texts and evidence texts
# This computes the cosine similarity for each pair of test and evidence documents
test_cos_sims = cosine_similarity(test_tfidf, evidence_tfidf)


In [17]:
import numpy as np

# Function to evaluate the performance of a retrieval system using cosine similarity scores
def test_retrieval_topk(k, cur_scores, cur_labels):
    """
    Print the mean recall at top-k of a similarity-based retriever.

    For every query (one row of ``cur_scores``) the ``k`` highest-scoring
    document indices are compared with the gold indices in ``cur_labels``.

    Parameters:
    k (int): Number of top results to consider for each query. A value at or
        above the number of documents selects every document.
    cur_scores (np.ndarray): 2-D score matrix, one row per query and one
        column per document.
    cur_labels (list of list): True labels (indices of relevant documents) for each query.

    Returns:
    None: Prints the average recall at top-k. Queries without any gold
    document are skipped; ``nan`` is printed when no query can be scored.
    """
    recalls = []
    for i in range(cur_scores.shape[0]):
        gold = cur_labels[i]
        n_gold = len(gold)
        if n_gold == 0:
            # Recall is undefined without gold documents; skipping avoids a
            # division by zero.
            continue

        row = cur_scores[i]
        n_docs = row.shape[0]
        if k >= n_docs:
            # np.argpartition rejects kth >= n_docs; here the top-k is simply
            # every document.
            top_ids = set(range(n_docs))
        else:
            # Unordered indices of the k highest scores for the i-th query.
            top_ids = set(np.argpartition(-row, k)[:k].tolist())

        hits = sum(1 for doc_id in gold if doc_id in top_ids)
        recalls.append(hits / n_gold)

    # Print the average recall across all scored queries
    print(sum(recalls) / len(recalls) if recalls else float("nan"))


# The name starts with "test_" but this is a metric helper, not a unit test;
# this keeps pytest from collecting it if the code is ever run from a module.
test_retrieval_topk.__test__ = False

# Define the top-k value used for both recall checks below
topK = 2000

# Evaluate the retrieval performance on training data
# (prints the mean recall of the gold evidence within the top `topK` scores)
test_retrieval_topk(topK, train_cos_sims, train_evidences)

# Evaluate the retrieval performance on development data
test_retrieval_topk(topK, dev_cos_sims, dev_evidences)


0.6777958740499458
0.7348484848484849


Reconstruct Dataset

In [21]:
# Function to construct evidence candidates for claims based on cosine similarity scores
def construct_evidence_candidates(cos_sims, claim_text_data, claim_evidence_data, evidences_data, candi_num, train_mode=False):
    """
    Construct evidence candidates for claims based on cosine similarity scores.

    Parameters:
    cos_sims (np.ndarray): Cosine similarity scores, one row per claim and one
        column per evidence passage.
    claim_text_data (list of str): Claim texts.
    claim_evidence_data (list of list of int): True evidence indices for each
        claim, or None when no gold evidence is available (no labels are
        produced then). Must not be None in training mode.
    evidences_data (list of str): Evidence texts.
    candi_num (int): Number of candidates to retrieve.
    train_mode (bool): Whether the function is in training mode (default: False).

    Returns:
    tuple: (candis, retrieval_cls_data, retrieval_cls_label)
        - candis (list of list of int): Candidate indices for each claim.
        - retrieval_cls_data (list of str): Combined claim and evidence texts for classification.
        - retrieval_cls_label (list of int): Labels for classification (1 if true evidence, 0 otherwise).

    In training mode every gold evidence is emitted first with label 1, then
    the passages ranked candi_num*5 .. candi_num*6 - 1 by similarity are
    emitted as candidates. That window is not filtered: a gold passage that
    falls inside it is emitted again, labelled 1.

    In evaluation mode the candi_num best-scoring passages are returned in no
    particular order; when candi_num is at least the number of passages,
    every passage is returned in index order.
    """
    # Initialize lists to store results
    candis = []
    retrieval_cls_data = []
    retrieval_cls_label = []

    for i in range(cos_sims.shape[0]):
        claim = claim_text_data[i]
        scores = cos_sims[i]
        # Set lookup keeps the per-candidate membership test constant time.
        gold = set(claim_evidence_data[i]) if claim_evidence_data is not None else None

        if train_mode:
            # Add true evidences for training mode
            for k in claim_evidence_data[i]:
                retrieval_cls_data.append(f"<cls>{claim}<sep>{evidences_data[k]}")
                retrieval_cls_label.append(1)

            # Lower-ranked window used as additional (mostly negative) candidates
            cur_top_ids = np.argsort(-scores)[candi_num * 5 : candi_num * 6].tolist()
        elif candi_num >= scores.shape[0]:
            # np.argpartition rejects kth >= number of passages; the top-k is
            # then simply every passage.
            cur_top_ids = list(range(scores.shape[0]))
        else:
            # Retrieve top-k candidates for evaluation mode
            cur_top_ids = np.argpartition(-scores, candi_num)[:candi_num].tolist()

        candis.append(cur_top_ids)

        for j in cur_top_ids:
            retrieval_cls_data.append(f"<cls>{claim}<sep>{evidences_data[j]}")
            if gold is not None:
                retrieval_cls_label.append(1 if j in gold else 0)

    return candis, retrieval_cls_data, retrieval_cls_label

# Example usage
candi_num = 2000
train_mode = True  # or False, depending on your use case

# Assuming `train_cos_sims`, `train_texts`, `train_evidences`, and `evidence_texts` are already defined
# In training mode the sampled window is ranks candi_num*5 .. candi_num*6 - 1 of each claim's ranking.
train_candis, train_cls_data, train_cls_labels = construct_evidence_candidates(train_cos_sims, train_texts, train_evidences, evidence_texts, candi_num, train_mode)

# Same for the dev set, in evaluation mode (top candi_num candidates per claim)
# NOTE(review): train_candis and dev_candis are reassigned by the later cells
# that call this function with train_topK / dev_topK.
dev_candis, dev_cls_data, dev_cls_labels = construct_evidence_candidates(dev_cos_sims, dev_texts, dev_evidences, evidence_texts, candi_num, train_mode=False)


In [22]:
# Candidates per claim for the dev and test splits
dev_topK = 300
# Candidate window size per claim for the training split
train_topK = 500

In [23]:
# Training pairs: gold evidence plus a lower-ranked candidate window per claim.
# (`evidence_texts` is the list built when the evidence file is processed.)
train_candis, train_retrieval_cls_data, train_retrieval_cls_label = construct_evidence_candidates(train_cos_sims, train_texts, train_evidences, evidence_texts, train_topK, True)

In [24]:
# Dev pairs: top dev_topK candidates per claim, labelled against the gold evidence.
# (`evidence_texts` is the list built when the evidence file is processed.)
dev_candis, dev_retrieval_cls_data, dev_retrieval_cls_label = construct_evidence_candidates(dev_cos_sims, dev_texts, dev_evidences, evidence_texts, dev_topK, False)

In [25]:
# Test pairs: top dev_topK candidates per claim; no gold evidence, so no labels.
# (`evidence_texts` is the list built when the evidence file is processed.)
test_candis, test_retrieval_cls_data, _ = construct_evidence_candidates(test_cos_sims, test_texts, None, evidence_texts, dev_topK, False)

In [26]:
# Convert the pair labels to NumPy arrays
train_retrieval_cls_label = np.array(train_retrieval_cls_label)
dev_retrieval_cls_label = np.array(dev_retrieval_cls_label)

# Convert the pair texts to NumPy arrays
# NOTE(review): presumably this yields a fixed-width string array sized by the
# longest text, which can be memory-hungry for this many pairs -- confirm.
train_retrieval_cls_data = np.array(train_retrieval_cls_data)
dev_retrieval_cls_data = np.array(dev_retrieval_cls_data)


In [29]:
from collections import Counter
# Class balance of the retrieval pairs (label 0 = not gold evidence, 1 = gold evidence)
print(Counter(train_retrieval_cls_label))
print(Counter(dev_retrieval_cls_label))

Counter({0: 613924, 1: 4198})
Counter({0: 45957, 1: 243})


In [30]:
# Negatives per positive among the training pairs (counts printed above)
613924/4198

# Negatives per positive among the development pairs
45957/243

# Share of negatives among the development pairs; as the last expression of
# the cell this is the value the notebook displays
45957 / (45957+243)

0.9947402597402597

Keras Retrieval


In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word index from the training pairs only; "<UNK>" is the
# out-of-vocabulary token passed to the tokenizer.
# NOTE(review): the Tokenizer's default character filters presumably strip
# "<" and ">", turning the "<cls>" / "<sep>" markers into the plain words
# "cls" / "sep" -- confirm against the Keras documentation.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_retrieval_cls_data)

In [32]:
# Vocabulary size: one slot per indexed word plus one for index 0
vocab_size = len(tokenizer.word_index) + 1  # 0 is padding token
print(vocab_size)

225150


In [33]:
# Tokenise the input into sequences of word indices, using the index fitted on
# the training pairs for all three splits

xseq_train = tokenizer.texts_to_sequences(train_retrieval_cls_data)
xseq_dev = tokenizer.texts_to_sequences(dev_retrieval_cls_data)
xseq_test = tokenizer.texts_to_sequences(test_retrieval_cls_data)


In [34]:
# Longest token sequence in the training split (0 if there are no sequences)
max_i = max(map(len, xseq_train), default=0)
print(max_i)

# Longest token sequence in the development split
max_i = max(map(len, xseq_dev), default=0)
print(max_i)


452
259


In [36]:

from keras.preprocessing.sequence import pad_sequences
# from keras.preprocessing.sequence import pad_sequences

maxlen = 180
# Bring every token sequence to exactly `maxlen` ids, zero-filling at the end.
# NOTE(review): `truncating` is left at its default. Per the Keras docs the
# default removes tokens from the *front* of an over-long sequence, and the
# claim sits at the front of each "<cls>claim<sep>evidence" pair. The longest
# training sequence printed earlier is 452 tokens -- confirm that front
# truncation is intended.
xseq_train = pad_sequences(xseq_train, padding='post', maxlen=maxlen)
xseq_dev = pad_sequences(xseq_dev, padding='post', maxlen=maxlen)
xseq_test = pad_sequences(xseq_test, padding='post', maxlen=maxlen)


# The arrays above are already exactly `maxlen` wide, so a second
# pad_sequences pass would only copy them. The *_padded names are kept as
# aliases of the same arrays.
xseq_train_padded = xseq_train
xseq_dev_padded = xseq_dev
xseq_test_padded = xseq_test

# Print the padded sequences
print("Padded training sequences:")
print(xseq_train_padded)

print("\nPadded development sequences:")
print(xseq_dev_padded)

print("\nPadded test sequences:")
print(xseq_test_padded)

Padded training sequences:
[[   7   29   99 ...    0    0    0]
 [   7   29   99 ...    0    0    0]
 [   7   29   99 ...    0    0    0]
 ...
 [   7 3787 4634 ...    0    0    0]
 [   7 3787 4634 ...    0    0    0]
 [   7 3787 4634 ...    0    0    0]]

Padded development sequences:
[[   7  211  328 ...    0    0    0]
 [   7  211  328 ...    0    0    0]
 [   7  211  328 ...    0    0    0]
 ...
 [   7    2 1745 ...    0    0    0]
 [   7    2 1745 ...    0    0    0]
 [   7    2 1745 ...    0    0    0]]

Padded test sequences:
[[   7    2 1860 ...    0    0    0]
 [   7    2 1860 ...    0    0    0]
 [   7    2 1860 ...    0    0    0]
 ...
 [   7    5 2027 ...    0    0    0]
 [   7    5 2027 ...    0    0    0]
 [   7    5 2027 ...    0    0    0]]


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*