### Import Data

In [17]:
import json
import pandas as pd

#from google.colab import drive
#drive.mount("/content/drive")

def read_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    data = [json.loads(line) for line in lines]
    return pd.DataFrame(data)

#upload data to the folder called "NLP data"
#evidence = read_json("/content/drive/My Drive/NLP data/evidence.json")
#dev_claims = read_json("/content/drive/My Drive/NLP data/dev-claims.json")
#dev_claims_baseline = read_json("/content/drive/My Drive/NLP data/dev-claims-baseline.json")
#test_claims_unlabelled = read_json("/content/drive/My Drive/NLP data/test-claims-unlabelled.json")
#train_claims = read_json("/content/drive/My Drive/NLP data/train-claims.json")

evidence_df = read_json("evidence.json")
dev_claims_df = read_json("dev-claims.json")
dev_claims_baseline_df = read_json("dev-claims-baseline.json")
test_claims_unlabelled_df = read_json("test-claims-unlabelled.json")
train_claims_df = read_json("train-claims.json")

### Data Structure Preprocessing (1-1)

In [18]:
train_claims_data = []
for claim_id, claim_info in train_claims_df.items():
    for evidence in claim_info[0]['evidences']:
        update_info = {
            "claim_ID": claim_id,
            "claim_text": claim_info[0]['claim_text'],
            "claim_label": claim_info[0]['claim_label'],
            "evidences": evidence
        }
        train_claims_data.append(update_info)

In [19]:
print(train_claims_data[:2])

[{'claim_ID': 'claim-1937', 'claim_text': 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'claim_label': 'DISPUTED', 'evidences': 'evidence-442946'}, {'claim_ID': 'claim-1937', 'claim_text': 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'claim_label': 'DISPUTED', 'evidences': 'evidence-1194317'}]


In [20]:
dev_claims_data = []
for claim_id, claim_info in dev_claims_df.items():
    update_info = {
        "claim_ID": claim_id,
        "claim_text": claim_info[0]['claim_text'],
        "claim_label": claim_info[0]['claim_label'],
        "evidences": claim_info[0]['evidences']
    }
    dev_claims_data.append(update_info)

In [21]:
evidence_data = []

for col in evidence_df.columns:
    evidence_text = evidence_df[col][0]
    evidence_id = col
    evidence_data.append({
        "evidence_ID": evidence_id,
        "evidence_text": evidence_text
    })

### Data Preprocessing

In [22]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) 
    words = word_tokenize(text)
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(filtered_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yusi2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yusi2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yusi2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### TF-IDF

In [23]:
#tfidf vectorize_without ngram
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

claims_texts = [preprocess_text(claim['claim_text']) for claim in train_claims_data]
evidence_texts = [preprocess_text(evidence['evidence_text']) for evidence in evidence_data]
all_texts = claims_texts + evidence_texts

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)

In [24]:
dev_claims_texts = [preprocess_text(claim['claim_text']) for claim in dev_claims_data]
dev_claim_tfidf = vectorizer.transform(dev_claims_texts)

In [25]:
evidence_texts = [preprocess_text(evidence['evidence_text']) for evidence in evidence_data]
evidence_tfidf = vectorizer.transform(evidence_texts)

### Jacob similarity

In [26]:
def jaccard_similarity(set1, set2):
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

In [27]:

def get_word_set_from_tfidf(tfidf_matrix, vectorizer):
    feature_names = vectorizer.get_feature_names_out()
    word_sets = []
    for i in range(tfidf_matrix.shape[0]):
        words = [feature_names[j] for j in tfidf_matrix[i].indices]
        word_sets.append(set(words))
    return word_sets

# 轉換 dev_claims 和 evidence_texts 成詞集
dev_claim_word_sets = get_word_set_from_tfidf(dev_claim_tfidf, vectorizer)
evidence_word_sets = get_word_set_from_tfidf(evidence_tfidf, vectorizer)


'\ndef get_word_set_from_tfidf(tfidf_matrix, vectorizer):\n    feature_names = vectorizer.get_feature_names_out()\n    word_sets = []\n    for i in range(tfidf_matrix.shape[0]):\n        words = [feature_names[j] for j in tfidf_matrix[i].indices]\n        word_sets.append(set(words))\n    return word_sets\n\n# 轉換 dev_claims 和 evidence_texts 成詞集\ndev_claim_word_sets = get_word_set_from_tfidf(dev_claim_tfidf, vectorizer)\nevidence_word_sets = get_word_set_from_tfidf(evidence_tfidf, vectorizer)\n'

In [28]:

import numpy as np

# 設定批處理大小
evidence_batch_size = 1000  # 根據你的內存設定

# 初始化存儲結構
dev_evidences_top5 = [[] for _ in range(len(dev_claims_data))]

# 分批處理證據集
for start_idx in range(0, len(evidence_word_sets), evidence_batch_size):
    end_idx = min(start_idx + evidence_batch_size, len(evidence_word_sets))
    evidence_batch = evidence_word_sets[start_idx:end_idx]

    # 計算每個 dev_claim 與當前批次中的每個 evidence 的相似度
    for i, claim_set in enumerate(dev_claim_word_sets):
        for j, evidence_set in enumerate(evidence_batch, start=start_idx):
            similarity = jaccard_similarity(claim_set, evidence_set)

            # 維護一個長度為5的最高相似度列表
            if len(dev_evidences_top5[i]) < 1000:
                dev_evidences_top5[i].append((similarity, evidence_data[j]['evidence_ID']))
                dev_evidences_top5[i].sort(reverse=True, key=lambda x: x[0])
            else:
                if similarity > dev_evidences_top5[i][-1][0]:
                    dev_evidences_top5[i][-1] = (similarity, evidence_data[j]['evidence_ID'])
                    dev_evidences_top5[i].sort(reverse=True, key=lambda x: x[0])

# 格式化最終結果
dev_evidences_pred_jacob = [{
    'dev_claim_ID': dev_claims_data[i]['claim_ID'],
    'evidence_IDs': [e[1] for e in top5],
    'evidence_sims': [e[0] for e in top5]
} for i, top5 in enumerate(dev_evidences_top5)]


"\nimport numpy as np\n\n# 設定批處理大小\nevidence_batch_size = 1000  # 根據你的內存設定\n\n# 初始化存儲結構\ndev_evidences_top5 = [[] for _ in range(len(dev_claims_data))]\n\n# 分批處理證據集\nfor start_idx in range(0, len(evidence_word_sets), evidence_batch_size):\n    end_idx = min(start_idx + evidence_batch_size, len(evidence_word_sets))\n    evidence_batch = evidence_word_sets[start_idx:end_idx]\n\n    # 計算每個 dev_claim 與當前批次中的每個 evidence 的相似度\n    for i, claim_set in enumerate(dev_claim_word_sets):\n        for j, evidence_set in enumerate(evidence_batch, start=start_idx):\n            similarity = jaccard_similarity(claim_set, evidence_set)\n\n            # 維護一個長度為5的最高相似度列表\n            if len(dev_evidences_top5[i]) < 1000:\n                dev_evidences_top5[i].append((similarity, evidence_data[j]['evidence_ID']))\n                dev_evidences_top5[i].sort(reverse=True, key=lambda x: x[0])\n            else:\n                if similarity > dev_evidences_top5[i][-1][0]:\n                    dev_evidences_

### Output

In [29]:
output = pd.DataFrame(dev_evidences_pred_jacob)
output = output.drop_duplicates(subset=['dev_claim_ID'])
output.to_csv('dev_evidences_pred_jacob_1000.csv', index=False)


"\noutput = pd.DataFrame(dev_evidences_pred_jacob)\noutput = output.drop_duplicates(subset=['dev_claim_ID'])\noutput.to_csv('dev_evidences_pred_jacob_1000.csv', index=False)\n"

### Ouput read

In [30]:
'''
from ast import literal_eval
data_frame = pd.read_csv('dev_evidences_pred_jacob_1000.csv')

dev_evidences_pred_jacob = []

for index, row in data_frame.iterrows():
    evidence_ids = literal_eval(row['evidence_IDs'])
    evidence_sims = literal_eval(row['evidence_sims'])
    

    entry = {
        'dev_claim_ID': row['dev_claim_ID'],
        'evidence_IDs': evidence_ids,
        'evidence_sims': evidence_sims
    }
    dev_evidences_pred_jacob.append(entry)


print(dev_evidences_pred_jacob[:5])
'''

[{'dev_claim_ID': 'claim-752', 'evidence_IDs': ['evidence-572512', 'evidence-511300', 'evidence-185267', 'evidence-240255', 'evidence-252686', 'evidence-335140', 'evidence-394947', 'evidence-580844', 'evidence-589336', 'evidence-702160', 'evidence-717829', 'evidence-719961', 'evidence-786054', 'evidence-793855', 'evidence-838212', 'evidence-879910', 'evidence-966080', 'evidence-1035636', 'evidence-1130893', 'evidence-1178915', 'evidence-67732', 'evidence-514', 'evidence-50904', 'evidence-61626', 'evidence-75409', 'evidence-134083', 'evidence-255095', 'evidence-291834', 'evidence-324518', 'evidence-338767', 'evidence-413675', 'evidence-509345', 'evidence-516313', 'evidence-531624', 'evidence-679645', 'evidence-684667', 'evidence-685383', 'evidence-713845', 'evidence-729032', 'evidence-753206', 'evidence-805925', 'evidence-819633', 'evidence-908115', 'evidence-935489', 'evidence-941027', 'evidence-1010949', 'evidence-1020585', 'evidence-1033967', 'evidence-1042042', 'evidence-1056772', '

### Evaluation

In [31]:
'''
dev_claims_df = pd.DataFrame(dev_claims_data)

correct_evidences = dev_claims_df.groupby('claim_ID')['evidences'].apply(list).to_dict()

def calculate_accuracy(predictions, correct_evidences):
#非真實accuracy[(TP+TN) / (TP+TN+FP+FN)]
#說明每個claim對於其預測的前五個證據中至少預測到一個正確證據的比例
    top_5_accuracy_count = 0
    total_claims = len(predictions)
    
    for pred in predictions:
        claim_id = pred['dev_claim_ID']
        predicted_evidences = pred['evidence_IDs']
        correct_evidences_list = correct_evidences.get(claim_id, [])
        
        set_predicted_top5 = set(predicted_evidences[:1000])
        set_correct = set(correct_evidences_list)
        if set_predicted_top5 & set_correct: #至少有一個正確的證據
            top_5_accuracy_count += 1

    top_5_accuracy = top_5_accuracy_count / total_claims if total_claims > 0 else 0

    return  top_5_accuracy

# 計算至少預測到一個正確證據的情況比例
acc_jacob = calculate_accuracy(dev_evidences_pred_jacob, correct_evidences)
print("Jacob Accuracy:", acc_jacob)
'''

'\ndev_claims_df = pd.DataFrame(dev_claims_data)\n\ncorrect_evidences = dev_claims_df.groupby(\'claim_ID\')[\'evidences\'].apply(list).to_dict()\n\ndef calculate_accuracy(predictions, correct_evidences):\n#非真實accuracy[(TP+TN) / (TP+TN+FP+FN)]\n#說明每個claim對於其預測的前五個證據中至少預測到一個正確證據的比例\n    top_5_accuracy_count = 0\n    total_claims = len(predictions)\n    \n    for pred in predictions:\n        claim_id = pred[\'dev_claim_ID\']\n        predicted_evidences = pred[\'evidence_IDs\']\n        correct_evidences_list = correct_evidences.get(claim_id, [])\n        \n        set_predicted_top5 = set(predicted_evidences[:1000])\n        set_correct = set(correct_evidences_list)\n        if set_predicted_top5 & set_correct: #至少有一個正確的證據\n            top_5_accuracy_count += 1\n\n    top_5_accuracy = top_5_accuracy_count / total_claims if total_claims > 0 else 0\n\n    return  top_5_accuracy\n\n# 計算至少預測到一個正確證據的情況比例\nacc_jacob = calculate_accuracy(dev_evidences_pred_jacob, correct_evidences)\nprint("Jac

In [32]:
'''
def evaluate_metrics(predictions, correct_evidences):
    TP, FP, TN, FN = 0, 0, 0, 0
    
#TP: 預測到真證據
#FP: 預測到假證據
#TN: 沒預測到的假證據
#FN: 沒預測到的真證據

    for pred in predictions:
        claim_id = pred['dev_claim_ID']
        predicted_evidences = pred['evidence_IDs']
        correct_evidences_list = correct_evidences.get(claim_id, [])
        
        set_predicted_top5 = set(predicted_evidences[:1000])
        set_correct = set(correct_evidences_list)
        
        TP += len(set_predicted_top5 & set_correct)
        FP += len(set_predicted_top5 - set_correct)
        FN += len(set_correct - set_predicted_top5)
        # 太多證據降低運算速度，先不做TN
        
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score

precision_jacob, recall_jacob, f1_jacob = evaluate_metrics(dev_evidences_pred_jacob, correct_evidences)
print("Jacob Precision:", precision_jacob) #在模型標記為相關的證據中，有多少是真正相關的
print("Jacob Recall:", recall_jacob) #在所有實際相關的證據中，模型找到了多少
print("Jacob F1 Score:", f1_jacob)
'''

'\ndef evaluate_metrics(predictions, correct_evidences):\n    TP, FP, TN, FN = 0, 0, 0, 0\n    \n#TP: 預測到真證據\n#FP: 預測到假證據\n#TN: 沒預測到的假證據\n#FN: 沒預測到的真證據\n\n    for pred in predictions:\n        claim_id = pred[\'dev_claim_ID\']\n        predicted_evidences = pred[\'evidence_IDs\']\n        correct_evidences_list = correct_evidences.get(claim_id, [])\n        \n        set_predicted_top5 = set(predicted_evidences[:1000])\n        set_correct = set(correct_evidences_list)\n        \n        TP += len(set_predicted_top5 & set_correct)\n        FP += len(set_predicted_top5 - set_correct)\n        FN += len(set_correct - set_predicted_top5)\n        # 太多證據降低運算速度，先不做TN\n        \n    precision = TP / (TP + FP) if (TP + FP) > 0 else 0\n    recall = TP / (TP + FN) if (TP + FN) > 0 else 0\n    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0\n    \n    return precision, recall, f1_score\n\nprecision_jacob, recall_jacob, f1_jacob = evaluate_metrics(dev

In [33]:
import numpy as np
# selection: {claim_id: [evidence_id]}
# target_file: "data/dev-claims.json" or "data/train-claims.json" or "data/claims.json"
# return: recall, precision, F1

def eval_selection(selection, target_file):
    with open(target_file, 'r') as file:
        claims = json.load(file)
    selection_total = 0
    target_total = 0
    correct = 0
    for claim_id, evidence_ids in selection.items():
        claim = claims[str(claim_id)]
        for evidence_id in evidence_ids:
            if f"{evidence_id}" in claim["evidences"]:
                correct += 1
        selection_total += len(evidence_ids)
        target_total += len(claim["evidences"])
    recall = correct / target_total
    precision = correct / selection_total
    F1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else np.nan
    return recall, precision, F1



In [34]:
selection = {}

for item in dev_evidences_pred_jacob:
    claim_id = item['dev_claim_ID']
    evidence_ids = item['evidence_IDs']

    selection[claim_id] = evidence_ids

print(selection)

{'claim-752': ['evidence-572512', 'evidence-511300', 'evidence-185267', 'evidence-240255', 'evidence-252686', 'evidence-335140', 'evidence-394947', 'evidence-580844', 'evidence-589336', 'evidence-702160', 'evidence-717829', 'evidence-719961', 'evidence-786054', 'evidence-793855', 'evidence-838212', 'evidence-879910', 'evidence-966080', 'evidence-1035636', 'evidence-1130893', 'evidence-1178915', 'evidence-67732', 'evidence-514', 'evidence-50904', 'evidence-61626', 'evidence-75409', 'evidence-134083', 'evidence-255095', 'evidence-291834', 'evidence-324518', 'evidence-338767', 'evidence-413675', 'evidence-509345', 'evidence-516313', 'evidence-531624', 'evidence-679645', 'evidence-684667', 'evidence-685383', 'evidence-713845', 'evidence-729032', 'evidence-753206', 'evidence-805925', 'evidence-819633', 'evidence-908115', 'evidence-935489', 'evidence-941027', 'evidence-1010949', 'evidence-1020585', 'evidence-1033967', 'evidence-1042042', 'evidence-1056772', 'evidence-1115155', 'evidence-1140

In [35]:
print(eval_selection(selection,"dev-claims.json"))
#50: (0.28105906313645623, 0.017922077922077922, 0.03369551947259187)

(0.5926680244399185, 0.0018896103896103896, 0.003767209740373225)


In [36]:
import copy


dev_evidences_pred_jacob_test = copy.deepcopy(dev_evidences_pred_jacob)

for claim in dev_evidences_pred_jacob_test:

    original_sims = claim['evidence_sims']
    original_ids = claim['evidence_IDs']

    filtered_evidence = [(sim, ev_id) for sim, ev_id in zip(original_sims, original_ids) if sim >= 0.05]
    

    if not filtered_evidence:
        max_sim = max(original_sims) 
        max_index = original_sims.index(max_sim) 
        filtered_evidence = [(original_sims[max_index], original_ids[max_index])]
    
    claim['evidence_IDs'] = [ev_id for _, ev_id in filtered_evidence]
    claim['evidence_sims'] = [sim for sim, _ in filtered_evidence]

In [37]:
selection2 = {}

for item in dev_evidences_pred_jacob_test:
    claim_id = item['dev_claim_ID']
    evidence_ids = item['evidence_IDs'][:5]

    selection2[claim_id] = evidence_ids

print(selection2)

{'claim-752': ['evidence-572512', 'evidence-511300', 'evidence-185267', 'evidence-240255', 'evidence-252686'], 'claim-375': ['evidence-864147', 'evidence-631053', 'evidence-778407', 'evidence-944672', 'evidence-1207394'], 'claim-1266': ['evidence-694262', 'evidence-402339', 'evidence-1207903', 'evidence-16482', 'evidence-38305'], 'claim-871': ['evidence-366901', 'evidence-617077', 'evidence-1017572', 'evidence-363992', 'evidence-631019'], 'claim-2164': ['evidence-52981', 'evidence-1134268', 'evidence-399454', 'evidence-885328', 'evidence-334226'], 'claim-1607': ['evidence-668884', 'evidence-397504', 'evidence-411975', 'evidence-955328', 'evidence-1079566'], 'claim-761': ['evidence-664808', 'evidence-620340', 'evidence-919', 'evidence-88652', 'evidence-600976'], 'claim-1718': ['evidence-545018', 'evidence-1104838', 'evidence-45045', 'evidence-85719', 'evidence-381517'], 'claim-1273': ['evidence-107843', 'evidence-789848', 'evidence-1154653', 'evidence-249967', 'evidence-946518'], 'claim

In [38]:
print(eval_selection(selection2,"dev-claims.json"))

(0.11812627291242363, 0.07532467532467532, 0.09199048374306106)


## Train Label: embedding + BiLSTM

In [39]:
def encode_labels(labels):
    mapping = {
        'SUPPORTS': [1, 0],
        'REFUTES': [0, 1],
        #'DISPUTED': [1, 1, 0],
        #'NOT_ENOUGH_INFO': [0, 0, 1]
    }
    return np.array([mapping[label] for label in labels])

In [40]:
train_claims_texts = [preprocess_text(claim['claim_text']) for claim in train_claims_data]
train_claim_tfidf = vectorizer.transform(train_claims_texts)

In [41]:
evidence_map = {item['evidence_ID']: item['evidence_text'] for item in evidence_data}

train_evi_texts = []
for claim in train_claims_data:
    evidence_id = claim['evidences'] 
    evidence_text = evidence_map.get(evidence_id, "No evidence text found") 
    train_evi_texts.append(preprocess_text(evidence_text)) 


train_evi_tfidf = vectorizer.transform(train_evi_texts)

In [42]:
train_labels = [claim['claim_label'] for claim in train_claims_data]

In [43]:
print(train_claims_texts)
print(train_evi_texts)
print(train_labels)

['DISPUTED', 'DISPUTED', 'DISPUTED', 'REFUTES', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'SUPPORTS', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'REFUTES', 'REFUTES', 'REFUTES', 'REFUTES', 'REFUTES', 'SUPPORTS', 'SUPPORTS', 'REFUTES', 'REFUTES', 'REFUTES', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'NOT_ENOUGH_INFO', 'SUPPORTS', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'DISPUTED', 'SUPPORTS', 'SUPPORTS', 'REFUTE

In [44]:
print(len(train_evi_texts))

4122


In [45]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from gensim.models import Word2Vec


# 假設 train_claims_data 和 evidence_data 已經是處理好的列表
max_len = 30
vocab_size = 100000
embedding_dim = 2000

model = Word2Vec(evidence_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(evidence_texts)

for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        try:
            embedding_vector = model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue

embedding_layer = Embedding(vocab_size, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_len, trainable=False, mask_zero=True)



In [46]:

# filter "DISPUTED" & "NOT_ENOUGH_INFO"
filtered_indices = [i for i, label in enumerate(train_labels) if ((label != 'DISPUTED') and (label != 'NOT_ENOUGH_INFO'))]
filtered_train_claims_texts = [train_claims_texts[i] for i in filtered_indices]
filtered_train_evi_texts = [train_evi_texts[i] for i in filtered_indices]
filtered_train_labels = [train_labels[i] for i in filtered_indices]

# tokenizer
#tokenizer = Tokenizer(num_words=vocab_size)
#tokenizer.fit_on_texts(evidence_texts)
claims_sequences = tokenizer.texts_to_sequences(filtered_train_claims_texts)
evidence_sequences = tokenizer.texts_to_sequences(filtered_train_evi_texts)

# Padding
padded_claims = pad_sequences(claims_sequences, maxlen=max_len, padding='post', truncating='post')
padded_evidences = pad_sequences(evidence_sequences, maxlen=max_len, padding='post', truncating='post')

# 輸入
claims_input = Input(shape=(max_len,), dtype='int32')
evidence_input = Input(shape=(max_len,), dtype='int32')

# 嵌入層
#embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_len, mask_zero=True)
claims_embedded = embedding_layer(claims_input)
evidence_embedded = embedding_layer(evidence_input)

# BiLSTM 層
claims_lstm = Bidirectional(LSTM(1000))(claims_embedded)
evidence_lstm = Bidirectional(LSTM(1000))(evidence_embedded)
claims_lstm = BatchNormalization()(claims_lstm)
evidence_lstm = BatchNormalization()(evidence_lstm)

# Attension
#attention_out = Attention()([claims_lstm, evidence_lstm])
#attention_out = Dropout(0.1)(attention_out)

# 合並兩個 LSTM 輸出
merged = tf.keras.layers.concatenate([claims_lstm, evidence_lstm], axis=-1)
merged = Dropout(0.1)(merged)
#merged = tf.keras.layers.concatenate([attention_out], axis=-1)
dense = Dense(1000, activation='relu')(merged)

# 分類層
predictions = Dense(2, activation='softmax')(dense)

# 建立模型
model = Model(inputs=[claims_input, evidence_input], outputs=predictions)

# 編譯模型
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# 模型摘要
model.summary()

# one-hot 標籤
encoded_train_labels = encode_labels(filtered_train_labels) 

from sklearn.utils.class_weight import compute_class_weight

# 調整balance
class_weights = compute_class_weight(class_weight='balanced',classes=np.unique(filtered_train_labels),y=filtered_train_labels)
class_weights = dict(enumerate(class_weights))

# early stop
callbacks = [
    ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min'),
    EarlyStopping(monitor='val_loss', patience=2, min_delta=0.01, verbose=1)
]

# train
model.fit([padded_claims, padded_evidences], encoded_train_labels, batch_size=32, epochs=10, class_weight=class_weights, callbacks=callbacks)


Epoch 1/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1s/step - accuracy: 0.7196 - loss: 0.8604
Epoch 2/20


  self._save_model(epoch=epoch, batch=None, logs=logs)
  current = self.get_monitor_value(logs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 1s/step - accuracy: 0.7617 - loss: 0.5958
Epoch 3/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.7515 - loss: 0.5456
Epoch 4/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.7454 - loss: 0.5462
Epoch 5/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.7527 - loss: 0.5413
Epoch 6/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1s/step - accuracy: 0.7558 - loss: 0.5083
Epoch 7/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1s/step - accuracy: 0.7616 - loss: 0.5145
Epoch 8/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1s/step - accuracy: 0.7650 - loss: 0.4988
Epoch 9/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1s/step - accuracy: 0.7369 - loss: 0.5505
Epoch 10/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x2583dee7640>

In [47]:
print("Class weights: ", class_weights)

Class weights:  {0: 1.9693654266958425, 1: 0.6701414743112435}


In [48]:
print(claims_sequences[:1])
print(padded_claims[:1])
print(encoded_train_labels[:1])

[[968, 9010, 7236, 81, 80, 354, 635, 6939, 1131, 20, 7409, 716]]
[[ 968 9010 7236   81   80  354  635 6939 1131   20 7409  716    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
[[0 1]]


In [49]:
#don't need to redo everytime (id to text)
dev_claim_map = {item['claim_ID']: item['claim_text'] for item in dev_claims_data}
dev_claims_texts = []  
dev_evi_texts = []  


for claim in dev_evidences_pred_jacob:
    claim_id = claim['dev_claim_ID']
    evidence_ids = claim['evidence_IDs'] [:5]

    claim_text = dev_claim_map.get(claim_id, "No claim text found")

    for evidence_id in evidence_ids:
        evidence_text = evidence_map.get(evidence_id, "No evidence text found")

        processed_claim_text = preprocess_text(claim_text)
        processed_evidence_text = preprocess_text(evidence_text)

        dev_claims_texts.append(processed_claim_text)
        dev_evi_texts.append(processed_evidence_text)

In [50]:
print(len(dev_claims_texts))
print(len(dev_evi_texts))
print(len(dev_claims_data))

770
770
154


In [51]:
# predict
dev_claims_sequences = tokenizer.texts_to_sequences(dev_claims_texts)
dev_claims_padded = pad_sequences(dev_claims_sequences, maxlen=max_len)
dev_evidence_sequences = tokenizer.texts_to_sequences(dev_evi_texts)
dev_evidence_padded = pad_sequences(dev_evidence_sequences, maxlen=max_len)

predictions = model.predict([dev_claims_padded, dev_evidence_padded])

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 231ms/step


In [52]:
# 轉換預測結果

def decode_labels(predictions):
    reverse_mapping = {
        (1, 0): 'SUPPORTS',
        (0, 1): 'REFUTES',
        (1, 1): 'DISPUTED',
        #(0, 0, 1): 'NOT_ENOUGH_INFO'
    }

    decoded_labels = []
    for pred in predictions:
        # 概率向量轉換
        predicted_indices = (pred > 0.5).astype(int)
        pred_tuple = tuple(predicted_indices)
        
        # labels = ['SUPPORTS', 'REFUTES', 'DISPUTED']
        if pred_tuple in reverse_mapping:
            decoded_labels.append(reverse_mapping[pred_tuple])
        else:
            # 如果沒有匹配，則選擇最高概率的標籤
            # labels = ['SUPPORTS', 'REFUTES', 'DISPUTED', 'NOT_ENOUGH_INFO']
            # 最高概率太低的情況下標記等等刪除evi (調整evi數量)
            max_prob = max(pred)
            if max_prob < 0.3:
                decoded_labels.append('DELETE EVI') 
            else:
                decoded_labels.append('NOT_ENOUGH_INFO') 

    return decoded_labels

# 將預測結果轉換為文字
predicted_labels = decode_labels(predictions)

In [53]:
print(predictions)
output = pd.DataFrame(predictions)
output.to_csv('predictions.csv', index=False)

[[0.7750407  0.2249593 ]
 [0.81213665 0.1878634 ]
 [0.7982975  0.20170248]
 ...
 [0.93392724 0.0660728 ]
 [0.93532854 0.06467142]
 [0.93392724 0.0660728 ]]


In [54]:
# 假設每個 claim 有 5 個 evidence
num_evidence_per_claim = 5
claim_distribution = []
final_labels_for_claims = [] 

# 遍歷每個 claim
for i in range(0, len(predicted_labels), num_evidence_per_claim):
    # 獲取當前 claim 的所有預測結果
    claim_predictions = predicted_labels[i:i + num_evidence_per_claim]
    
    # 計算每種類別的數量
    label_counts = {
        'SUPPORTS': claim_predictions.count('SUPPORTS'),
        'REFUTES': claim_predictions.count('REFUTES'),
        'DISPUTED': claim_predictions.count('DISPUTED'),
        'NOT_ENOUGH_INFO': claim_predictions.count('NOT_ENOUGH_INFO'),
        'DELETE EVI': claim_predictions.count('DELETE EVI')
    }
    
    # 加到結果列表中
    claim_distribution.append({
        'claim_index': i // num_evidence_per_claim,
        'distribution': label_counts
    })

    # 確定每個 claim 的最終標籤為占比最高的標籤
    final_label = max(label_counts, key=label_counts.get)
    final_labels_for_claims.append(final_label)

# 每個 claim 的結果分布及最終標籤
for index, distribution in enumerate(claim_distribution):
    print(f"Claim {distribution['claim_index']} Distribution: {distribution['distribution']}")
    #print(f"Distribution: {distribution['distribution']}")
    print(f"Final Label : {final_labels_for_claims[index]}")

Claim 0 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 0: SUPPORTS
Claim 1 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 1: SUPPORTS
Claim 2 Distribution: {'SUPPORTS': 0, 'REFUTES': 5, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 2: REFUTES
Claim 3 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 3: SUPPORTS
Claim 4 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 4: SUPPORTS
Claim 5 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 5: SUPPORTS
Claim 6 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 6: SUPPORTS
Claim 7 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0}
Final Label for Claim 7: SUPPORTS
Claim 8 Distribut

In [55]:
# 計算準確率
from sklearn.metrics import accuracy_score

y_true = [claim['claim_label'] for claim in dev_claims_data]
y_pred = final_labels_for_claims

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.42


### Check model predict label (用real evidence檢查model predict label的正確率)

In [56]:
dev_claims_texts = []
dev_evi_texts = []
for item in dev_claims_data:
    claim_id = item['claim_ID']
    claim_text = preprocess_text(item['claim_text'])
    evidences = item['evidences']
    for evidence_id in evidences:
        evidence_text = evidence_map.get(evidence_id, "No evidence text found")
        
        processed_evidence_text = preprocess_text(evidence_text)
        
        dev_claims_texts.append(claim_text)
        dev_evi_texts.append(processed_evidence_text)

In [57]:
print(dev_claims_data[1]['evidences'])
print(len(dev_evi_texts))

['evidence-996421', 'evidence-1080858', 'evidence-208053', 'evidence-699212', 'evidence-832334']
491


In [58]:

dev_claims_sequences = tokenizer.texts_to_sequences(dev_claims_texts)
dev_claims_padded = pad_sequences(dev_claims_sequences, maxlen=max_len)
dev_evidence_sequences = tokenizer.texts_to_sequences(dev_evi_texts)
dev_evidence_padded = pad_sequences(dev_evidence_sequences, maxlen=max_len)


predictions = model.predict([dev_claims_padded, dev_evidence_padded])

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 219ms/step


In [59]:
predicted_labels = decode_labels(predictions)

label_index = 0  
final_labels_for_claims = []  
claim_distribution = [] 

for claim_data in dev_claims_data:
    evidences = claim_data['evidences']
    num_evidences = len(evidences) 


    claim_predictions = predicted_labels[label_index:label_index + num_evidences]
    label_index += num_evidences


    label_counts = {
        'SUPPORTS': claim_predictions.count('SUPPORTS'),
        'REFUTES': claim_predictions.count('REFUTES'),
        'DISPUTED': claim_predictions.count('DISPUTED'),
        'NOT_ENOUGH_INFO': claim_predictions.count('NOT_ENOUGH_INFO'),
        'DELETE EVI': claim_predictions.count('DELETE EVI')
    }
    

    claim_distribution.append({
        'claim_index': claim_data['claim_ID'],
        'distribution': label_counts
    })


    final_label = max(label_counts, key=label_counts.get) if max(label_counts.values()) > 0 else 'NOT_ENOUGH_INFO'
    final_labels_for_claims.append(final_label)


for index, distribution in enumerate(claim_distribution):
    print(f"Claim {distribution['claim_index']} Distribution: {distribution['distribution']}")
    print(f"Final Label for Claim {distribution['claim_index']}: {final_labels_for_claims[index]}")

Claim claim-752 Distribution: {'SUPPORTS': 2, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0, 'DELETE EVI': 0}
Final Label for Claim claim-752: SUPPORTS
Claim claim-375 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0, 'DELETE EVI': 0}
Final Label for Claim claim-375: SUPPORTS
Claim claim-1266 Distribution: {'SUPPORTS': 0, 'REFUTES': 2, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0, 'DELETE EVI': 0}
Final Label for Claim claim-1266: REFUTES
Claim claim-871 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0, 'DELETE EVI': 0}
Final Label for Claim claim-871: SUPPORTS
Claim claim-2164 Distribution: {'SUPPORTS': 4, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0, 'DELETE EVI': 0}
Final Label for Claim claim-2164: SUPPORTS
Claim claim-1607 Distribution: {'SUPPORTS': 5, 'REFUTES': 0, 'DISPUTED': 0, 'NOT_ENOUGH_INFO': 0, 'DELETE EVI': 0}
Final Label for Claim claim-1607: SUPPORTS
Claim claim-761 Distribution: {'SUPPORTS': 5, 'REFUTES': 0,

In [60]:
from sklearn.metrics import accuracy_score

y_true = [claim['claim_label'] for claim in dev_claims_data]
y_pred = final_labels_for_claims

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.42


In [61]:
print(predictions)

[[7.51341999e-01 2.48658016e-01]
 [7.75040686e-01 2.24959299e-01]
 [9.04359758e-01 9.56401899e-02]
 [9.10218477e-01 8.97815078e-02]
 [9.04359758e-01 9.56401899e-02]
 [9.03332233e-01 9.66677889e-02]
 [9.03755188e-01 9.62448120e-02]
 [4.99175489e-02 9.50082481e-01]
 [7.16961473e-02 9.28303897e-01]
 [9.38978672e-01 6.10213280e-02]
 [9.33685422e-01 6.63146153e-02]
 [9.12092030e-01 8.79079551e-02]
 [9.44512010e-01 5.54879420e-02]
 [9.33685422e-01 6.63146153e-02]
 [8.14061701e-01 1.85938269e-01]
 [8.11778724e-01 1.88221261e-01]
 [8.29571366e-01 1.70428693e-01]
 [8.25455308e-01 1.74544677e-01]
 [7.57772624e-01 2.42227361e-01]
 [7.46592104e-01 2.53407896e-01]
 [7.51909316e-01 2.48090670e-01]
 [7.51341999e-01 2.48658016e-01]
 [7.51341999e-01 2.48658016e-01]
 [9.21164572e-01 7.88354129e-02]
 [8.92001033e-01 1.07998952e-01]
 [9.18906987e-01 8.10930580e-02]
 [9.18906987e-01 8.10930580e-02]
 [8.92001033e-01 1.07998952e-01]
 [8.88055146e-01 1.11944854e-01]
 [8.94280791e-01 1.05719186e-01]
 [8.806579