In [1]:
from elasticsearch import Elasticsearch
import configparser
from pathlib import Path
import json
from tqdm import tqdm

In [2]:
config = configparser.ConfigParser()
config.read('credential.ini')

['credential.ini']

In [3]:
es = Elasticsearch(
    cloud_id=config['ELASTIC']['cloud_id'],
    basic_auth=(config['ELASTIC']['user'], config['ELASTIC']['password'])
)

### Ingest data with python on Elasticsearch Service

In [4]:
# from time import sleep
# for page_id in range(1, 25):
#     wiki_path = Path(f'../data/wiki-pages/wiki-{str(page_id).zfill(3)}.jsonl')
#     print(f'Ingesting wiki-{str(page_id).zfill(3)}...')
#     page_sum = sum([1 for i in open(wiki_path, 'r')])
#     with open(wiki_path, 'r') as f:
#         for doc_id, line in tqdm(enumerate(f), total=page_sum):
#             es.index(
#                 index='wiki-page',
#                 id=f'{str(page_id).zfill(3)}-{str(doc_id+1).zfill(5)}',
#                 document=json.loads(line)
#             )

In [5]:
es.indices.refresh(index='wiki-page')

ObjectApiResponse({'_shards': {'total': 2, 'successful': 2, 'failed': 0}})

In [6]:

text = '中國人徐翔曾因爲涉嫌操縱證券市場及內幕交易犯罪，被公安機關依法批准逮捕。'

result = es.search(
    index='wiki-page',
    query={
        'match':{
            'id': text
        }
    }
)


hits = result['hits']['hits']

for hit in hits:
    document_id = hit["_id"]
    score = hit["_score"]
    page_name = hit['_source']['id']
    print(f"Document ID: {document_id}, Name: {page_name}, Score: {score}")

Document ID: 014-26276, Name: 逮捕證, Score: 33.169304
Document ID: 004-17505, Name: 證券交易法, Score: 31.88828
Document ID: 001-38486, Name: 犯罪嫌疑人, Score: 31.809181
Document ID: 012-33272, Name: 證券交易法_(中華民國), Score: 29.654058
Document ID: 018-17290, Name: 中國證券交易所, Score: 29.618668
Document ID: 007-42209, Name: 犯罪心理：嫌犯動機, Score: 29.317818
Document ID: 007-36582, Name: 公安部證券犯罪偵查局, Score: 28.544613
Document ID: 004-15588, Name: 河內證券交易所, Score: 27.575668
Document ID: 003-08179, Name: 逮捕, Score: 27.389196
Document ID: 020-48523, Name: 公民逮捕, Score: 27.312943


In [7]:
index_name = 'wiki-page'
query = {
    'query': {
        'match_all': {}
    }
}

response = es.count(index=index_name, body=query)

total_count = response['count']
print(total_count)

1187751


  response = es.count(index=index_name, body=query)


### 找 wiki page 位置

In [8]:
index_name = "wiki-page"

page_name = "臺南市安平水產專修學校"

# 構建查詢
query = {
    "query": {
        "match": {
            "id": page_name
        }
    },
    'size': 1  # default size: 10
}

# 執行查詢
response = es.search(index=index_name, body=query)

# 解析結果
hits = response["hits"]["hits"]
related_pages = [(hit["_id"], hit['_source']['id'], hit["_score"]) for hit in hits]

# 打印相關頁面信息
for page_id, name, score in related_pages:
    print(f"Page ID: {page_id}, Name: {name}, Score: {score}")

print(hits[0])

  response = es.search(index=index_name, body=query)


Page ID: 022-11284, Name: 臺南市立安平水產專修學校, Score: 35.76047
{'_index': 'wiki-page', '_id': '022-11284', '_score': 35.76047, '_source': {'id': '臺南市立安平水產專修學校', 'text': '臺南市立安平水產專修學校爲臺灣日治時期位於臺南市安平的水產類實業補習學校 ， 設立於1930年 ， 在1939即停辦 。', 'lines': '0\t臺南市立安平水產專修學校爲臺灣日治時期位於臺南市安平的水產類實業補習學校 ， 設立於1930年 ， 在1939即停辦 。\t\n1\t'}}


### 全文檢索

In [16]:
index_name = "wiki-page"

claim = "木衛三十九被以希臘神話中的美惠五女神之一的名稱命名。"

# 構建查詢
query = {
    "query": {
        "match": {
            "text": claim
        }
    },
    'size': 50,  # default size: 10
    "track_total_hits": True
}

# 執行查詢
response = es.search(index=index_name, body=query)

# 解析結果
hits = response["hits"]["hits"]
related_pages = [(hit["_id"], hit['_source']['id'], hit["_score"]) for hit in hits]

# 打印相關頁面信息
for page_id, name, score in related_pages:
    print(f"Page ID: {page_id}, Name: {name}, Score: {score}")

  response = es.search(index=index_name, body=query)


Page ID: 001-09687, Name: 海衛七, Score: 56.93269
Page ID: 001-09619, Name: 木衛十六, Score: 55.367805
Page ID: 001-09635, Name: 木衛三十七, Score: 53.80496
Page ID: 014-41174, Name: 潘撞擊坑, Score: 53.483047
Page ID: 015-02509, Name: 帕西忒亞, Score: 53.32935
Page ID: 006-48182, Name: 帕耳開, Score: 52.88429
Page ID: 001-09609, Name: 木衛十, Score: 52.364555
Page ID: 006-48810, Name: 土衛五十三, Score: 52.347473
Page ID: 001-09633, Name: 木衛三十三, Score: 51.89009
Page ID: 015-00153, Name: 麥萊亞戈, Score: 51.65578
Page ID: 015-02919, Name: 赫革摩涅, Score: 51.603306
Page ID: 001-09616, Name: 木衛十七, Score: 51.416946
Page ID: 002-36245, Name: 鬩衛一, Score: 50.917206
Page ID: 021-28334, Name: Arche_(消歧義), Score: 50.894054
Page ID: 001-09693, Name: 海衛十三, Score: 50.790874
Page ID: 014-40741, Name: 卡里斯_(希臘神話), Score: 50.709526
Page ID: 001-09638, Name: 木衛三十四, Score: 50.536514
Page ID: 017-46521, Name: 艾菲蜜, Score: 50.460243
Page ID: 001-09634, Name: 木衛三十五, Score: 50.383636
Page ID: 015-02907, Name: 卡勒_(希臘神話), Score: 50.28047
Page ID: 

## score-base search

In [17]:
index_name = "wiki-page"

claim = "天衛三軌道在天王星內部的磁層，以《 仲夏夜之夢 》作者緹坦妮雅命名。"

# 構建查詢
query = {
    "query": {
        "match": {
            "text": claim
        }
    },
    'size': 50,  # default size: 10
    "track_total_hits": True
}

# 執行查詢
response = es.search(index=index_name, body=query)

# 解析結果
hits = response["hits"]["hits"]
related_pages = [(hit["_id"], hit['_source']['id'], hit["_score"]) for hit in hits]
highest_score = related_pages[0][2]
SCORE_BOUND = 5
# 打印相關頁面信息
for page_id, name, score in related_pages:
    if highest_score - score <= SCORE_BOUND:
        print(f"Page ID: {page_id}, Name: {name}, Score: {score}")
    else:
        break


  response = es.search(index=index_name, body=query)


Page ID: 020-45171, Name: 緹坦妮雅, Score: 65.25459
Page ID: 001-09662, Name: 天衛三, Score: 63.515965


In [18]:
def get_wiki_page(claim, size=5):
    index_name = "wiki-page"
    claim = claim

    # 構建查詢
    query = {
        "query": {
            "match": {
                "text": claim
            }
        },
        'size': size,  # default size: 10
        "track_total_hits": True
    }

    # 執行查詢
    response = es.search(index=index_name, body=query)

    hits = response["hits"]["hits"]
    
    return [hit['_source']['id'] for hit in hits]

In [20]:
def get_wiki_page_by_score(claim, bound):
    index_name = "wiki-page"
    claim = claim

    # 構建查詢
    query = {
        "query": {
            "match": {
                "text": claim
            }
        },
        'size': 50,  # default size: 10
        "track_total_hits": True
    }

    # 執行查詢
    result = []
    response = es.search(index=index_name, body=query)

    hits = response["hits"]["hits"]
    highest_score = hits[0]['_score']
    for hit in hits:
        if highest_score - hit['_score'] <= bound:
            result.append(hit['_source']['id'])
    
    return result

In [21]:
claim = '位於西南亞的阿曼蘇丹國南部和東部臨太平洋。'
get_wiki_page(claim, size=5)

  response = es.search(index=index_name, body=query)


['中國海', '阿曼', '厄立特里亞地理', '西里伯斯海', '緬甸地理']

In [22]:
claim = '位於西南亞的阿曼蘇丹國南部和東部臨太平洋。'
get_wiki_page_by_score(claim, bound=4.5)

  response = es.search(index=index_name, body=query)


['中國海', '阿曼', '厄立特里亞地理', '西里伯斯海', '緬甸地理', '阿納蘭吉魯富大區']

In [23]:
train_path = Path('../data/public_test.jsonl')
results = []
SIZE = 10
total_instance = sum([1 for i in open(train_path, 'r')])
with open(train_path, 'r') as f:
    for i, line in tqdm(enumerate(f), total=total_instance):
        results.append(get_wiki_page(claim=json.loads(line)['claim'], size=SIZE))

save_path = Path(f'../cache/es_test_token_{SIZE}.txt')
with open(save_path, 'w') as f:
    for re in results:
        f.write(' '.join(re))
        f.write('\n')

  response = es.search(index=index_name, body=query)
100%|██████████| 989/989 [03:17<00:00,  5.00it/s]


In [None]:
# train_path = Path('../data/public_train.jsonl')
# results = []
# SCORE_BOUND = 5
# MIN_RTV = 1
# with open(train_path, 'r') as f:
#     for i, line in enumerate(f):
#         result = get_wiki_page_by_score(claim=json.loads(line)['claim'], bound=SCORE_BOUND)
#         sum = len(result)
#         if MIN_RTV == 1:
#             continue
#         if sum < MIN_RTV:
#             result += get_wiki_page(claim=json.loads(line)['claim'], size=MIN_RTV)[sum:]
#         results.append(result)
# save_path = Path(f'../cache/es_train_bound_{SCORE_BOUND}_minrtv_{MIN_RTV}.txt')
# with open(save_path, 'w') as f:
#     for re in results:
#         f.write(' '.join(re))
#         f.write('\n')

In [None]:
import wikipedia
wikipedia.set_lang("zh")

result = wikipedia.search('民進黨')
print(result)