In [None]:
!pip install elasticsearch

In [1]:
from elasticsearch import Elasticsearch
import configparser
from pathlib import Path
import json
from tqdm import tqdm

ModuleNotFoundError: No module named 'elasticsearch'

In [None]:
# Load the Elasticsearch credentials from 'credential.ini' (path is relative to
# the current working directory).
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty list shown as this cell's output means the file
# was not found and the config['ELASTIC'] lookups further down will fail.
config = configparser.ConfigParser()
config.read('credential.ini')

In [None]:
# Build the client from the [ELASTIC] section of the loaded credentials:
# the deployment's cloud id plus a (user, password) pair for basic auth.
elastic_cfg = config['ELASTIC']
es = Elasticsearch(
    cloud_id=elastic_cfg['cloud_id'],
    basic_auth=(elastic_cfg['user'], elastic_cfg['password']),
)

### Ingest data with Python on Elasticsearch Service

In [None]:
# One-off ingestion of the wiki dump, left commented out (presumably so that
# re-running the notebook does not re-index every page -- TODO confirm).
# When enabled it walks ../data/wiki-pages/wiki-001.jsonl to wiki-024.jsonl and
# indexes every line into 'wiki-page' under the id '<file no.>-<line no.>',
# zero-padded to 3 and 5 digits respectively.
# from time import sleep
# for page_id in range(1, 25):
#     wiki_path = Path(f'../data/wiki-pages/wiki-{str(page_id).zfill(3)}.jsonl')
#     print(f'Ingesting wiki-{str(page_id).zfill(3)}...')
#     page_sum = sum([1 for i in open(wiki_path, 'r')])
#     with open(wiki_path, 'r') as f:
#         for doc_id, line in tqdm(enumerate(f), total=page_sum):
#             es.index(
#                 index='wiki-page',
#                 id=f'{str(page_id).zfill(3)}-{str(doc_id+1).zfill(5)}',
#                 document=json.loads(line)
#             )

In [None]:
# Refresh the 'wiki-page' index so that documents indexed shortly before this
# point are visible to the searches that follow (see the Elasticsearch refresh API).
es.indices.refresh(index='wiki-page')

In [None]:

# Exploratory query: match a whole claim sentence against the 'id' field
# (the field this notebook prints as the page name) and list the returned hits.
text = '中國人徐翔曾因爲涉嫌操縱證券市場及內幕交易犯罪，被公安機關依法批准逮捕。'

id_match = {'match': {'id': text}}
result = es.search(index='wiki-page', query=id_match)

hits = result['hits']['hits']

# One line per hit: Elasticsearch document id, page name, relevance score.
for hit in hits:
    document_id, score, page_name = hit["_id"], hit["_score"], hit['_source']['id']
    print(f"Document ID: {document_id}, Name: {page_name}, Score: {score}")

In [None]:
# Count every document in the index by sending a match_all query to the count API.
index_name = 'wiki-page'
query = {'query': {'match_all': {}}}

response = es.count(index=index_name, body=query)
total_count = response['count']
print(total_count)

### 找 wiki page 位置

In [None]:
# Look up where a wiki page lives in the index: match the page name against the
# 'id' field and keep only the best hit.
index_name = "wiki-page"

page_name = "臺南市安平水產專修學校"

# Build the query.
query = {
    "query": {
        "match": {
            "id": page_name
        }
    },
    'size': 1  # default size: 10
}

# Run the query.
response = es.search(index=index_name, body=query)

# Reduce each hit to a (document id, page name, score) tuple.
hits = response["hits"]["hits"]
related_pages = [(hit["_id"], hit['_source']['id'], hit["_score"]) for hit in hits]

# Print the matching pages.
for page_id, name, score in related_pages:
    print(f"Page ID: {page_id}, Name: {name}, Score: {score}")

# Show the raw top hit. Guard against an empty result so that a page missing
# from the index produces a message instead of an IndexError.
if hits:
    print(hits[0])
else:
    print(f"No page found for {page_name!r}")

### 全文檢索

In [None]:
# Full-text retrieval: match the claim against the 'text' field and list up to
# 50 returned pages.
index_name = "wiki-page"
claim = "木衛三十九被以希臘神話中的美惠五女神之一的名稱命名。"

# Request body; 'size' raises the number of returned hits from the default of 10.
query = {
    "query": {"match": {"text": claim}},
    'size': 50,
    "track_total_hits": True,
}

# Run the query.
response = es.search(index=index_name, body=query)

# Reduce each hit to a (document id, page name, score) tuple.
hits = response["hits"]["hits"]
related_pages = []
for hit in hits:
    related_pages.append((hit["_id"], hit['_source']['id'], hit["_score"]))

# Print one line per retrieved page.
for page_id, name, score in related_pages:
    print(f"Page ID: {page_id}, Name: {name}, Score: {score}")

## Score-based search

In [None]:
# Score-based retrieval: fetch up to 50 pages matching the claim, then keep only
# those whose score lies within SCORE_BOUND of the best hit.
index_name = "wiki-page"

claim = "天衛三軌道在天王星內部的磁層，以《 仲夏夜之夢 》作者緹坦妮雅命名。"

# Build the query.
query = {
    "query": {
        "match": {
            "text": claim
        }
    },
    'size': 50,  # default size: 10
    "track_total_hits": True
}

# Run the query.
response = es.search(index=index_name, body=query)

# Reduce each hit to a (document id, page name, score) tuple.
hits = response["hits"]["hits"]
related_pages = [(hit["_id"], hit['_source']['id'], hit["_score"]) for hit in hits]

SCORE_BOUND = 5

# The first hit is taken as the best one and the loop stops at the first page
# outside the bound; both rely on the hits arriving in descending score order
# (Elasticsearch's default when the request specifies no sort).
# An empty result leaves highest_score as None and the loop simply does not run,
# instead of failing with an IndexError on related_pages[0].
highest_score = related_pages[0][2] if related_pages else None

# Print the pages within the score bound.
for page_id, name, score in related_pages:
    if highest_score - score <= SCORE_BOUND:
        print(f"Page ID: {page_id}, Name: {name}, Score: {score}")
    else:
        break


In [None]:
def get_wiki_page(claim, size=5, index_name="wiki-page"):
    """Return the ids of the wiki pages whose text best matches ``claim``.

    Sends a ``match`` query on the ``text`` field through the notebook-level
    ``es`` client.

    Args:
        claim: Claim sentence to search for.
        size: Maximum number of hits to request from Elasticsearch.
        index_name: Index to search; defaults to the notebook's ``wiki-page``.

    Returns:
        A list holding ``hit['_source']['id']`` for every returned hit, in the
        order Elasticsearch returned them. Empty when nothing matches.
    """
    # Build the request body.
    query = {
        "query": {
            "match": {
                "text": claim
            }
        },
        "size": size,  # default size: 10
        "track_total_hits": True
    }

    # Run the query.
    response = es.search(index=index_name, body=query)

    return [hit["_source"]["id"] for hit in response["hits"]["hits"]]

In [None]:
def get_wiki_page_by_score(claim, bound, size=50, index_name="wiki-page"):
    """Return the ids of pages scoring within ``bound`` of the best hit.

    Sends a ``match`` query on the ``text`` field through the notebook-level
    ``es`` client, then keeps every hit whose ``_score`` is at most ``bound``
    below the score of the first returned hit.

    Args:
        claim: Claim sentence to search for.
        bound: Largest allowed gap between the first hit's score and a kept
            hit's score.
        size: Maximum number of hits to request from Elasticsearch.
        index_name: Index to search; defaults to the notebook's ``wiki-page``.

    Returns:
        A list holding ``hit['_source']['id']`` for every kept hit, in the
        order Elasticsearch returned them. Empty when nothing matches.
    """
    # Build the request body.
    query = {
        "query": {
            "match": {
                "text": claim
            }
        },
        "size": size,  # default size: 10
        "track_total_hits": True
    }

    # Run the query.
    response = es.search(index=index_name, body=query)
    hits = response["hits"]["hits"]

    # No match at all: there is no top score to compare against, so return
    # early instead of failing on hits[0].
    if not hits:
        return []

    highest_score = hits[0]["_score"]
    return [
        hit["_source"]["id"]
        for hit in hits
        if highest_score - hit["_score"] <= bound
    ]

In [None]:
# Spot-check get_wiki_page on one sample claim, asking for at most five pages;
# the returned list of page ids is displayed as the cell output.
claim = '位於西南亞的阿曼蘇丹國南部和東部臨太平洋。'
get_wiki_page(claim, size=5)

In [None]:
# Spot-check get_wiki_page_by_score on one sample claim: keep only the pages
# whose score is within 4.5 of the top hit.
claim = '位於西南亞的阿曼蘇丹國南部和東部臨太平洋。'
get_wiki_page_by_score(claim, bound=4.5)

In [None]:
# Retrieve SIZE candidate pages for every claim in the public test file and
# cache them as one line of space-separated page ids per claim.
# NOTE(review): a page id containing a space would be ambiguous in this format;
# confirm ids never contain spaces.
train_path = Path('../data/public_test.jsonl')  # despite the name, this is the test file
SIZE = 10

# Read the file once, inside a context manager so the handle is closed, and
# decode it explicitly as UTF-8 so the claims do not depend on the platform
# default encoding. Blank lines are skipped.
with open(train_path, 'r', encoding='utf-8') as f:
    claims = [json.loads(line)['claim'] for line in f if line.strip()]
total_instance = len(claims)

results = []
for claim_text in tqdm(claims, total=total_instance):
    results.append(get_wiki_page(claim=claim_text, size=SIZE))

save_path = Path(f'../cache/es_test_token_{SIZE}.txt')
# Create the cache directory on first use so the write below cannot fail on it.
save_path.parent.mkdir(parents=True, exist_ok=True)
with open(save_path, 'w', encoding='utf-8') as f:
    for page_ids in results:
        f.write(' '.join(page_ids))
        f.write('\n')

In [None]:
# Disabled variant for the training file: score-bounded retrieval, topped up
# through get_wiki_page when fewer than MIN_RTV pages come back.
# NOTE(review): before re-enabling, fix two things visible below:
#   1. `sum = len(result)` rebinds the built-in `sum` for the rest of the session.
#   2. `if MIN_RTV == 1: continue` runs before `results.append(result)`, so with
#      MIN_RTV = 1 no result is ever collected and the cache file ends up empty.
# train_path = Path('../data/public_train.jsonl')
# results = []
# SCORE_BOUND = 5
# MIN_RTV = 1
# with open(train_path, 'r') as f:
#     for i, line in enumerate(f):
#         result = get_wiki_page_by_score(claim=json.loads(line)['claim'], bound=SCORE_BOUND)
#         sum = len(result)
#         if MIN_RTV == 1:
#             continue
#         if sum < MIN_RTV:
#             result += get_wiki_page(claim=json.loads(line)['claim'], size=MIN_RTV)[sum:]
#         results.append(result)
# save_path = Path(f'../cache/es_train_bound_{SCORE_BOUND}_minrtv_{MIN_RTV}.txt')
# with open(save_path, 'w') as f:
#     for re in results:
#         f.write(' '.join(re))
#         f.write('\n')

In [None]:
# Try the search offered by the `wikipedia` package, with its language set to
# "zh", on a single query string and print whatever it returns.
import wikipedia
wikipedia.set_lang("zh")

result = wikipedia.search('民進黨')
print(result)