# 2024 COMP90042 Project
*Make sure you rename this file to include your group ID.*

# Readme
*If there is anything the marker should be aware of, please mention it here.*

*If you are planning to implement the program in an Object Oriented Programming style, please put those classes at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [46]:
import re
import nltk
import pandas as pd
import numpy as np
import json
import ijson

In [48]:
#Pre-processing 

from nltk.corpus import stopwords

# Fetch the NLTK resources used below: WordNet for the lemmatizer and the
# stopwords corpus for the English stopword list.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

# Shared lemmatizer instance used by lemmatize().
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of
# English stopwords; from here on `stopwords` is only usable for membership
# tests.
stopwords = set(stopwords.words('english'))

def lemmatize(word):
    """Return the lemma of *word*, preferring its verb form over its noun form.

    The word is lemmatized as a verb first; only when that leaves it
    unchanged is the noun lemma returned instead.
    """
    verb_lemma = lemmatizer.lemmatize(word, 'v')
    if verb_lemma != word:
        return verb_lemma
    return lemmatizer.lemmatize(word, 'n')

def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized content words.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is deleted, and the result is split on whitespace.
    Stopwords are dropped, the remaining tokens are lemmatized with
    lemmatize(), and any lemma that is itself a stopword is dropped too.

    Args:
        text: The string to clean. Falsy values (e.g. ``None`` or ``""``)
            are returned unchanged.

    Returns:
        The cleaned string (possibly empty), or the original falsy value.
    """
    if not text:
        return text

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    kept = []
    # split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no separate normalization pass
    # is needed.
    for token in text.split():
        # Check the surface form first: lemmatization can rewrite a stopword
        # into a token that is no longer in the stopword set (the noun rules
        # strip a trailing "s", e.g. "us" may become "u"), which would let it
        # slip through. This also avoids lemmatizing words we discard anyway.
        if token in stopwords:
            continue
        lemma = lemmatize(token)
        # An inflected form can still reduce to a stopword (e.g. a verb form
        # reducing to "be"), so filter the lemma as well.
        if lemma not in stopwords:
            kept.append(lemma)
    return " ".join(kept)

def text_preprocessing(data_set):
    """Clean the claim and evidence texts of every record in *data_set* in place.

    Each record must be a mapping with a 'claim_text' string and an
    'evidence_texts' iterable of strings; both are replaced by their
    preprocess_text() output. The same (mutated) *data_set* is returned.
    """
    for record in data_set:
        record['claim_text'] = preprocess_text(record['claim_text'])
        record['evidence_texts'] = list(map(preprocess_text, record['evidence_texts']))
    return data_set



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kaiya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kaiya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
# Load the raw claim and evidence JSON files and build the preprocessed
# text / id / label lists used by the rest of the notebook.
def _load_json(path):
    """Parse the UTF-8 JSON file at *path*, closing the file afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


_DATA_DIR = "C:/Users/Kaiya/Desktop/COMP90042_2024-main/data"

train_claims = _load_json(f"{_DATA_DIR}/train-claims.json")
dev_claims = _load_json(f"{_DATA_DIR}/dev-claims.json")
test_claims = _load_json(f"{_DATA_DIR}/test-claims-unlabelled.json")
evidences = _load_json(f"{_DATA_DIR}/evidence.json")

# Evidence corpus: parallel lists of ids and cleaned texts, plus a mapping
# from evidence id to its position in those lists.
# NOTE: each value here is a single string, so it must go through
# preprocess_text(); text_preprocessing() expects an iterable of claim dicts
# and raises TypeError when handed a plain string.
evidences_texts = []
evidences_ids = []
evidences_id_dict = {}
for idx, (evidence_id, evidence_text) in enumerate(evidences.items()):
    evidences_ids.append(evidence_id)
    evidences_texts.append(preprocess_text(evidence_text))
    evidences_id_dict[evidence_id] = idx


# Training claims: cleaned text, label, and the gold evidence ids translated
# to positions in the evidence lists.
train_texts = []
train_evidences = []
train_labels = []
train_ids = []
for train_id, data in train_claims.items():
    train_ids.append(train_id)
    train_texts.append(preprocess_text(data["claim_text"]))
    train_labels.append(data["claim_label"])
    train_evidences.append([evidences_id_dict[i] for i in data["evidences"]])


# Development claims: same layout as the training claims.
dev_texts = []
dev_evidences = []
dev_labels = []
dev_ids = []
for dev_id, data in dev_claims.items():
    dev_ids.append(dev_id)
    dev_texts.append(preprocess_text(data["claim_text"]))
    dev_labels.append(data["claim_label"])
    dev_evidences.append([evidences_id_dict[i] for i in data["evidences"]])


# Test claims: only the id and cleaned text are collected here.
test_ids = []
test_texts = []
for test_id, data in test_claims.items():
    test_ids.append(test_id)
    test_texts.append(preprocess_text(data["claim_text"]))





In [62]:
# Cache the preprocessed lists as JSON files in the temp_data directory so
# they can be reloaded without re-running the preprocessing.
import os

_TEMP_DIR = "C:/Users/Kaiya/Desktop/COMP90042_2024-main/temp_data"
# Create the cache directory if it is missing, so no manual `mkdir temp_data`
# is needed before running this cell.
os.makedirs(_TEMP_DIR, exist_ok=True)

_cache_payloads = {
    "train_ids": train_ids,
    "train_texts": train_texts,
    "train_evidences": train_evidences,
    "train_labels": train_labels,
    "dev_ids": dev_ids,
    "dev_texts": dev_texts,
    "dev_evidences": dev_evidences,
    "dev_labels": dev_labels,
    "test_ids": test_ids,
    "test_texts": test_texts,
    "evidences_texts": evidences_texts,
    "evidences_ids": evidences_ids,
    "evidences_id_dict": evidences_id_dict,
}
for _name, _payload in _cache_payloads.items():
    # The with-block flushes and closes each file before the next one is
    # opened, so the data is fully on disk when a later cell reads it back.
    with open(f"{_TEMP_DIR}/{_name}.json", "w", encoding="utf-8") as _handle:
        json.dump(_payload, _handle)

In [65]:
# Reload the raw claims and the cached preprocessing results, so a fresh
# session can start here without re-running the preprocessing cells.
def _load_json(path):
    """Parse the UTF-8 JSON file at *path*, closing the file afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


_DATA_DIR = "C:/Users/Kaiya/Desktop/COMP90042_2024-main/data"
_TEMP_DIR = "C:/Users/Kaiya/Desktop/COMP90042_2024-main/temp_data"

train_claims = _load_json(f"{_DATA_DIR}/train-claims.json")
dev_claims = _load_json(f"{_DATA_DIR}/dev-claims.json")
test_claims = _load_json(f"{_DATA_DIR}/test-claims-unlabelled.json")

train_ids = _load_json(f"{_TEMP_DIR}/train_ids.json")
train_texts = _load_json(f"{_TEMP_DIR}/train_texts.json")
train_evidences = _load_json(f"{_TEMP_DIR}/train_evidences.json")
train_labels = _load_json(f"{_TEMP_DIR}/train_labels.json")

dev_ids = _load_json(f"{_TEMP_DIR}/dev_ids.json")
dev_texts = _load_json(f"{_TEMP_DIR}/dev_texts.json")
dev_evidences = _load_json(f"{_TEMP_DIR}/dev_evidences.json")
dev_labels = _load_json(f"{_TEMP_DIR}/dev_labels.json")

test_ids = _load_json(f"{_TEMP_DIR}/test_ids.json")
test_texts = _load_json(f"{_TEMP_DIR}/test_texts.json")
evidences_texts = _load_json(f"{_TEMP_DIR}/evidences_texts.json")
evidences_ids = _load_json(f"{_TEMP_DIR}/evidences_ids.json")
evidences_id_dict = _load_json(f"{_TEMP_DIR}/evidences_id_dict.json")

In [80]:
# TF-IDF features via scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# The vocabulary is capped at 50000 terms and is learned from the evidence,
# training-claim and test-claim texts (the dev claims are not part of the fit).
vectorizer = TfidfVectorizer(min_df=1, max_features=50000).fit(
    evidences_texts + train_texts + test_texts
)

# Project every text collection into the fitted TF-IDF space.
train_tfidf, dev_tfidf, test_tfidf, evidence_tfidf = map(
    vectorizer.transform,
    (train_texts, dev_texts, test_texts, evidences_texts),
)

In [81]:
# Show the shapes of the training-claim and evidence TF-IDF matrices, one per
# line.
print(train_tfidf.shape, evidence_tfidf.shape, sep="\n")

(1228, 50000)
(1208827, 50000)


In [84]:
# Similarity of every claim to every evidence passage, as dense arrays of
# shape (number of claims, number of evidences).
# With TfidfVectorizer's default L2 row normalization, the dot product of two
# rows is their cosine similarity.
#
# The product uses the sparse `@` operator rather than np.dot: NumPy's dot is
# not aware of SciPy sparse containers, so the sparse type's own matrix
# product is the supported way to multiply them.
#
# NOTE(review): .toarray() materializes the full dense result. Going by the
# shapes printed above (1228 training claims x 1208827 evidences), the training
# matrix alone needs roughly 12 GB if the values are float64 -- confirm that
# this fits in memory, or keep the products sparse and take a per-row top-k.
_evidence_tfidf_t = evidence_tfidf.transpose()
train_cos_sims = (train_tfidf @ _evidence_tfidf_t).toarray()
dev_cos_sims = (dev_tfidf @ _evidence_tfidf_t).toarray()
test_cos_sims = (test_tfidf @ _evidence_tfidf_t).toarray()

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*