In [16]:
from elasticsearch import Elasticsearch
import configparser
from pathlib import Path
import json
from tqdm import tqdm

## Connect to Elasticsearch
### To do so, create a deployment on the [Elastic Cloud website](https://cloud.elastic.co/deployments) and then fill in its connection details in [credential.ini](credential.ini).

In [17]:
# Interpolation is disabled so that a literal '%' inside a credential value
# (e.g. a password) is returned verbatim instead of raising an
# interpolation error when the value is looked up.
config = configparser.ConfigParser(interpolation=None)
# read() returns the list of files it managed to parse; a missing file is
# skipped silently, so an empty list here means credential.ini was not found.
config.read('credential.ini', encoding='utf-8')

['credential.ini']

In [None]:
# Create the client from the [ELASTIC] section of the parsed credentials:
# the deployment's cloud id plus a (user, password) pair for basic auth.
elastic_cfg = config['ELASTIC']
es = Elasticsearch(
    cloud_id=elastic_cfg['cloud_id'],
    basic_auth=(elastic_cfg['user'], elastic_cfg['password']),
)

Alternatively, you can run a local server; see the [installation guide](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) for how to install one.

### Ingest data with Python on Elasticsearch Service
#### It may take 8-10 hours to ingest all wiki pages into the Elasticsearch database

In [None]:
from time import sleep  # not used in this cell; kept in case another cell relies on it

# Each wiki-NNN.jsonl file holds one JSON document per line. Every document is
# indexed individually under the id "<file number>-<1-based line number>".
for page_id in range(1, 25):
    page_label = str(page_id).zfill(3)
    wiki_path = Path(f'../data/wiki-pages/wiki-{page_label}.jsonl')
    print(f'Ingesting wiki-{page_label}...')

    # Count the lines first so tqdm can display a total. The file is opened in
    # a `with` block so the handle is closed, and read as UTF-8 explicitly so
    # the result does not depend on the platform's default encoding.
    with open(wiki_path, 'r', encoding='utf-8') as f:
        page_sum = sum(1 for _ in f)

    with open(wiki_path, 'r', encoding='utf-8') as f:
        for doc_id, line in tqdm(enumerate(f, start=1), total=page_sum):
            # Skip blank lines (e.g. a trailing newline) instead of crashing in
            # json.loads; enumerate still advances, so document ids keep
            # matching line numbers.
            if not line.strip():
                continue
            es.index(
                index='wiki-page',
                id=f'{page_label}-{str(doc_id).zfill(5)}',
                document=json.loads(line),
            )

In [None]:
# Refresh the 'wiki-page' index so that documents indexed so far become
# visible to subsequent count/search requests.
es.indices.refresh(index='wiki-page')

### check total number of wiki pages

In [None]:
# Count every document in the index: match_all places no restriction on
# which documents are counted.
index_name = 'wiki-page'
query = {'query': {'match_all': {}}}

response = es.count(index=index_name, body=query)
total_count = response['count']
print(total_count)

### 找 wiki page 位置

In [None]:
index_name = "wiki-page"

page_name = "臺南市安平水產專修學校"

# Build the query: a full-text match of the page name against the
# document's "id" field.
query = {
    "query": {
        "match": {
            "id": page_name
        }
    },
    'size': 1  # only the best-scoring hit is wanted (default size: 10)
}

# Run the query
response = es.search(index=index_name, body=query)

# Parse the result into (Elasticsearch document id, page id field, score) tuples
hits = response["hits"]["hits"]
related_pages = [(hit["_id"], hit['_source']['id'], hit["_score"]) for hit in hits]

# Print the matching page information
for page_id, name, score in related_pages:
    print(f"Page ID: {page_id}, Name: {name}, Score: {score}")

# Show the raw top hit. Guarded so that a page name with no match is reported
# instead of raising IndexError on an empty hit list.
if hits:
    print(hits[0])
else:
    print(f"No wiki page found for: {page_name}")

### 全文檢索
### You can try any claim and see the top related wiki pages (the cell below requests up to 50).

In [None]:
index_name = "wiki-page"

# Ask interactively for the claim to search for
claim = input("claim:")

# Build the query: full-text match of the claim against the "text" field,
# requesting 50 hits (default size: 10) and an exact total hit count.
query = {
    "query": {"match": {"text": claim}},
    "size": 50,
    "track_total_hits": True,
}

# Run the query
response = es.search(index=index_name, body=query)

# Parse the result into (Elasticsearch document id, page id field, score) tuples
hits = response["hits"]["hits"]
related_pages = [
    (hit["_id"], hit["_source"]["id"], hit["_score"])
    for hit in hits
]

# Print the related page information
for page_id, name, score in related_pages:
    print(f"Page ID: {page_id}, Name: {name}, Score: {score}")

In [None]:
def get_wiki_page(claim, size=5, index_name="wiki-page"):
    """Return the `id` fields of the wiki pages whose text best matches a claim.

    Uses the module-level Elasticsearch client `es`.

    Args:
        claim: Text matched against the `text` field of the indexed documents.
        size: Maximum number of hits to request (default 5).
        index_name: Name of the index to search (default "wiki-page").

    Returns:
        A list holding the `_source.id` value of every hit, in the order the
        search returned them.
    """
    # Build the query
    query = {
        "query": {
            "match": {
                "text": claim
            }
        },
        "size": size,
        "track_total_hits": True,
    }

    # Run the query
    response = es.search(index=index_name, body=query)

    return [hit["_source"]["id"] for hit in response["hits"]["hits"]]

### Save the top 10 most similar wiki pages for each claim

In [None]:
modes = ['train', 'test']
SIZE = 10  # number of wiki pages retrieved per claim

# Directory that receives the result files. It is created up front so the
# writes below cannot fail on a missing folder. (The original cell called
# `dir.mkdir(...)`, which is an attribute lookup on the builtin `dir`
# function and fails on a fresh kernel.)
cache_dir = Path('../cache')
cache_dir.mkdir(parents=True, exist_ok=True)

for mode in modes:
    path = Path(f'../data/all_{mode}.jsonl')

    # Count the lines first so tqdm can display a total; `with` closes the handle.
    with open(path, 'r', encoding='utf-8') as f:
        total_instance = sum(1 for _ in f)

    # One list of page ids per claim, in file order.
    results = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=total_instance):
            results.append(get_wiki_page(claim=json.loads(line)['claim'], size=SIZE))

    # One output line per claim: the retrieved page ids separated by spaces.
    # NOTE(review): an id that itself contains a space cannot be split back
    # unambiguously from this format - confirm ids never contain spaces.
    save_path = cache_dir / f'all_es_{mode}_token_{SIZE}.txt'
    with open(save_path, 'w', encoding='utf-8') as f:
        for page_ids in results:
            f.write(' '.join(page_ids))
            f.write('\n')