In [1]:
import heapq
import json
import re
from collections import defaultdict

import nltk
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi


In [2]:
# Load the training claims (a mapping of claim id -> claim details).
claim_file_path = "data/train-claims.json"
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(claim_file_path, "r", encoding="utf-8") as file:
    claims_data = json.load(file)

# Load the evidence corpus (a mapping of evidence id -> evidence text).
# The evidence text contains non-ASCII characters, hence the explicit encoding.
evidence_file_path = "data/evidence.json"
with open(evidence_file_path, "r", encoding="utf-8") as file:
    evidence_data = json.load(file)


In [3]:
# Flatten the claims mapping into one record per claim, with the id as a column
# next to the claim's own fields.
claims = [{'claim_id': cid, **fields} for cid, fields in claims_data.items()]
claims_df = pd.DataFrame(claims)

# Do the same for the evidence mapping: one (evidence_id, text) record per entry.
evidences = [{'evidence_id': eid, 'text': body} for eid, body in evidence_data.items()]
evidence_df = pd.DataFrame(evidences)



In [13]:
# Spot-check a single evidence record by id; prints the first matching row as
# an array of its column values.
print(evidence_df[evidence_df['evidence_id'] ==  'evidence-545412'].values[0])

['evidence-545412'
 "The Bureau (original title : Le Bureau des légendes) is a French political thriller television series created by Éric Rochant and produced by Canal +, which revolves around the lives of agents of the DGSE (General Directorate of External Security), France 's principal external security service."]


In [22]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Map each evidence id to the list of entity strings spaCy finds in its text.
ner_map = defaultdict(list)

# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of invoking nlp() once per row inside DataFrame.iterrows().
# It yields one Doc per input text, in input order, so zipping with the ids is safe.
evidence_ids = evidence_df['evidence_id'].tolist()
evidence_docs = nlp.pipe(evidence_df['text'].tolist())
for evidence_id, doc in zip(evidence_ids, evidence_docs):
    ner_map[evidence_id] = [ent.text for ent in doc.ents]  # entity surface strings

# Attach the entity lists to the DataFrame, looked up by evidence id.
evidence_df['entities'] = evidence_df['evidence_id'].map(ner_map)


In [23]:
def filter_text(text, entities):
    """Keep word-like tokens and tokens that are themselves a known entity.

    The text is split with ``word_tokenize``. A token survives when the regex
    ``\\b[a-zA-Z0-9]+\\b`` matches at the start of the token, or when the token
    string is a member of ``entities``.

    Args:
        text: Text to filter.
        entities: Iterable of entity strings that must be kept when they appear
            as a token.

    Returns:
        The surviving tokens joined by single spaces.
    """
    word_like = re.compile(r'\b[a-zA-Z0-9]+\b')
    candidates = word_tokenize(text)
    known_entities = set(entities)  # set gives constant-time membership checks
    kept = []
    for tok in candidates:
        if word_like.match(tok) or tok in known_entities:
            kept.append(tok)
    return " ".join(kept)

# Filter the claim and evidence texts, overwriting the original columns.
# A claim row without an 'entities' field is filtered with an empty entity list.
claims_df['claim_text'] = claims_df.apply(
    lambda rec: filter_text(rec['claim_text'], rec.get('entities', [])),
    axis=1,
)
evidence_df['text'] = evidence_df.apply(
    lambda rec: filter_text(rec['text'], rec['entities']),
    axis=1,
)


In [24]:
def tokenize_with_entities(text, entities, entity_weight=3):
    """Tokenise ``text`` with spaCy and append each entity ``entity_weight`` times.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.
        entities: Iterable of entity strings to append to the token list.
        entity_weight: Number of copies of each lower-cased entity string to
            append. Defaults to 3, the value that used to be hard-coded.

    Returns:
        A list of strings: the lower-cased lemma of every token that is neither
        a stop word nor punctuation, followed by the repeated entity strings.
    """
    doc = nlp(text)
    words = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    # Each entity is appended as one whole string (it is not split into words),
    # with its copies placed back to back.
    for entity in entities:
        words.extend([entity.lower()] * entity_weight)
    return words

# Tokenise every evidence row (its text together with its entities), store the
# token lists in a new column, then build the BM25 index over them.
evidence_df['tokens'] = evidence_df.apply(
    lambda rec: tokenize_with_entities(rec['text'], rec['entities']),
    axis=1,
)
bm25 = BM25Okapi(evidence_df['tokens'].tolist())


In [27]:
def filter_text(text, entities=()):
    """Keep word-like tokens, plus any token that is itself a known entity.

    This definition replaces the earlier two-argument ``filter_text``. The
    optional ``entities`` parameter keeps it call-compatible with that version,
    so both ``filter_text(text)`` and ``filter_text(text, entities)`` work.

    Args:
        text: Text to filter.
        entities: Optional iterable of entity strings to keep when they appear
            as a token. Defaults to no entities.

    Returns:
        The surviving tokens joined by single spaces. A token survives when the
        regex matches at the start of the token or the token is in ``entities``.
    """
    pattern = re.compile(r'\b[a-zA-Z0-9]+\b')  # run of ASCII letters/digits
    tokens = word_tokenize(text)
    entities = set(entities)  # set gives constant-time membership checks
    filtered_tokens = [token for token in tokens if pattern.match(token) or token in entities]
    return " ".join(filtered_tokens)
# NOTE: filter_text is intentionally not applied to dev_claims_df in this cell.
# dev_claims_df is first created by the cell that loads data/dev-claims.json, so
# referencing it here raises NameError on a fresh kernel (Restart & Run All).
# That loading cell applies filter_text to the 'claim_text' column itself.


In [28]:
# Load the development claims (a mapping of claim id -> claim details).
dev_claim_file_path = "data/dev-claims.json"
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(dev_claim_file_path, "r", encoding="utf-8") as file:
    dev_claims_data = json.load(file)

# One record per claim, with the id as a column next to the claim's own fields.
dev_claims = []
for claim_id, details in dev_claims_data.items():
    dev_claims.append({'claim_id': claim_id, **details})
dev_claims_df = pd.DataFrame(dev_claims)

# Apply the text filter to the claim text column.
dev_claims_df['claim_text'] = dev_claims_df['claim_text'].apply(filter_text)


In [29]:
def get_top_n_evidence(claim_text, entities, top_n=20):
    """Return the ids of the ``top_n`` evidence rows with the highest BM25 score.

    Args:
        claim_text: Claim text used as the query.
        entities: Entity strings passed to ``tokenize_with_entities`` together
            with the claim text.
        top_n: Maximum number of evidence ids to return.

    Returns:
        A list of ``evidence_id`` values ordered from highest to lowest score.
        Rows with equal scores keep their relative order in ``evidence_df``.
    """
    # Build the query tokens from the claim text and its entities.
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    # nlargest tracks only top_n candidates instead of sorting every score; it is
    # documented as equivalent to sorted(..., key=key, reverse=True)[:top_n].
    top_indexes = heapq.nlargest(top_n, range(len(scores)), key=lambda i: scores[i])
    # One positional lookup on the id column instead of building a row per index.
    return evidence_df['evidence_id'].iloc[top_indexes].tolist()

# Retrieve evidence for every dev claim and count how many gold evidences are
# among the retrieved ids.
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    # Rows without an 'entities' field are queried with an empty entity list.
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)

# The printed value is hits / total gold evidences, i.e. the share of gold
# evidences that were retrieved. Report 0.0 instead of raising
# ZeroDivisionError when there are no gold evidences at all.
accuracy = correct_hits / total_evidences if total_evidences else 0.0
print(f"Accuracy: {accuracy}")


Accuracy: 0.2973523421588595


In [30]:
def get_top_n_evidence(claim_text, entities, top_n=40):
    """Return the ids of the ``top_n`` evidence rows with the highest BM25 score.

    Args:
        claim_text: Claim text used as the query.
        entities: Entity strings passed to ``tokenize_with_entities`` together
            with the claim text.
        top_n: Maximum number of evidence ids to return.

    Returns:
        A list of ``evidence_id`` values ordered from highest to lowest score.
        Rows with equal scores keep their relative order in ``evidence_df``.
    """
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    # nlargest tracks only top_n candidates instead of sorting every score; it is
    # documented as equivalent to sorted(..., key=key, reverse=True)[:top_n].
    top_indexes = heapq.nlargest(top_n, range(len(scores)), key=lambda i: scores[i])
    # One positional lookup on the id column instead of building a row per index.
    return evidence_df['evidence_id'].iloc[top_indexes].tolist()
# Retrieve evidence for every dev claim, keep the predictions per claim id, and
# count how many gold evidences are among the retrieved ids.
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    # Rows without an 'entities' field are queried with an empty entity list.
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

# The printed value is hits / total gold evidences. Report 0.0 instead of
# raising ZeroDivisionError when there are no gold evidences at all.
accuracy = correct_hits / total_evidences if total_evidences else 0.0
print(f"Accuracy: {accuracy}")

# Save the predictions to a JSON file. json is already imported in the
# notebook's first cell, so it is not re-imported here.
with open('data/predicted_evidences_dev.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.3890020366598778


In [31]:
def get_top_n_evidence(claim_text, entities, top_n=60):
    """Return the ids of the ``top_n`` evidence rows with the highest BM25 score.

    Args:
        claim_text: Claim text used as the query.
        entities: Entity strings passed to ``tokenize_with_entities`` together
            with the claim text.
        top_n: Maximum number of evidence ids to return.

    Returns:
        A list of ``evidence_id`` values ordered from highest to lowest score.
        Rows with equal scores keep their relative order in ``evidence_df``.
    """
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    # nlargest tracks only top_n candidates instead of sorting every score; it is
    # documented as equivalent to sorted(..., key=key, reverse=True)[:top_n].
    top_indexes = heapq.nlargest(top_n, range(len(scores)), key=lambda i: scores[i])
    # One positional lookup on the id column instead of building a row per index.
    return evidence_df['evidence_id'].iloc[top_indexes].tolist()
# Retrieve evidence for every dev claim, keep the predictions per claim id, and
# count how many gold evidences are among the retrieved ids.
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    # Rows without an 'entities' field are queried with an empty entity list.
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

# The printed value is hits / total gold evidences. Report 0.0 instead of
# raising ZeroDivisionError when there are no gold evidences at all.
accuracy = correct_hits / total_evidences if total_evidences else 0.0
print(f"Accuracy: {accuracy}")

# Save the predictions to a JSON file. json is already imported in the
# notebook's first cell, so it is not re-imported here.
with open('data/predicted_evidences_dev_60.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.42973523421588594


In [32]:
def get_top_n_evidence(claim_text, entities, top_n=80):
    """Return the ids of the ``top_n`` evidence rows with the highest BM25 score.

    Args:
        claim_text: Claim text used as the query.
        entities: Entity strings passed to ``tokenize_with_entities`` together
            with the claim text.
        top_n: Maximum number of evidence ids to return.

    Returns:
        A list of ``evidence_id`` values ordered from highest to lowest score.
        Rows with equal scores keep their relative order in ``evidence_df``.
    """
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    # nlargest tracks only top_n candidates instead of sorting every score; it is
    # documented as equivalent to sorted(..., key=key, reverse=True)[:top_n].
    top_indexes = heapq.nlargest(top_n, range(len(scores)), key=lambda i: scores[i])
    # One positional lookup on the id column instead of building a row per index.
    return evidence_df['evidence_id'].iloc[top_indexes].tolist()
# Retrieve evidence for every dev claim, keep the predictions per claim id, and
# count how many gold evidences are among the retrieved ids.
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    # Rows without an 'entities' field are queried with an empty entity list.
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

# The printed value is hits / total gold evidences. Report 0.0 instead of
# raising ZeroDivisionError when there are no gold evidences at all.
accuracy = correct_hits / total_evidences if total_evidences else 0.0
print(f"Accuracy: {accuracy}")

# Save the predictions to a JSON file. json is already imported in the
# notebook's first cell, so it is not re-imported here.
with open('data/predicted_evidences_dev_80.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.4623217922606925


In [33]:
def get_top_n_evidence(claim_text, entities, top_n=100):
    """Return the ids of the ``top_n`` evidence rows with the highest BM25 score.

    Args:
        claim_text: Claim text used as the query.
        entities: Entity strings passed to ``tokenize_with_entities`` together
            with the claim text.
        top_n: Maximum number of evidence ids to return.

    Returns:
        A list of ``evidence_id`` values ordered from highest to lowest score.
        Rows with equal scores keep their relative order in ``evidence_df``.
    """
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    # nlargest tracks only top_n candidates instead of sorting every score; it is
    # documented as equivalent to sorted(..., key=key, reverse=True)[:top_n].
    top_indexes = heapq.nlargest(top_n, range(len(scores)), key=lambda i: scores[i])
    # One positional lookup on the id column instead of building a row per index.
    return evidence_df['evidence_id'].iloc[top_indexes].tolist()
# Retrieve evidence for every dev claim, keep the predictions per claim id, and
# count how many gold evidences are among the retrieved ids.
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    # Rows without an 'entities' field are queried with an empty entity list.
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

# The printed value is hits / total gold evidences. Report 0.0 instead of
# raising ZeroDivisionError when there are no gold evidences at all.
accuracy = correct_hits / total_evidences if total_evidences else 0.0
print(f"Accuracy: {accuracy}")

# Save the predictions to a JSON file. json is already imported in the
# notebook's first cell, so it is not re-imported here.
with open('data/predicted_evidences_dev_100.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.4908350305498982


In [34]:
def get_top_n_evidence(claim_text, entities, top_n=150):
    """Return the ids of the ``top_n`` evidence rows with the highest BM25 score.

    Args:
        claim_text: Claim text used as the query.
        entities: Entity strings passed to ``tokenize_with_entities`` together
            with the claim text.
        top_n: Maximum number of evidence ids to return.

    Returns:
        A list of ``evidence_id`` values ordered from highest to lowest score.
        Rows with equal scores keep their relative order in ``evidence_df``.
    """
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    # nlargest tracks only top_n candidates instead of sorting every score; it is
    # documented as equivalent to sorted(..., key=key, reverse=True)[:top_n].
    top_indexes = heapq.nlargest(top_n, range(len(scores)), key=lambda i: scores[i])
    # One positional lookup on the id column instead of building a row per index.
    return evidence_df['evidence_id'].iloc[top_indexes].tolist()
# Retrieve evidence for every dev claim, keep the predictions per claim id, and
# count how many gold evidences are among the retrieved ids.
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    # Rows without an 'entities' field are queried with an empty entity list.
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

# The printed value is hits / total gold evidences. Report 0.0 instead of
# raising ZeroDivisionError when there are no gold evidences at all.
accuracy = correct_hits / total_evidences if total_evidences else 0.0
print(f"Accuracy: {accuracy}")

# Save the predictions to a JSON file. json is already imported in the
# notebook's first cell, so it is not re-imported here.
with open('data/predicted_evidences_dev_150.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.5193482688391039
