In [None]:
import os
import sys
import gokart
from elasticsearch import Elasticsearch

from qrelllm.eval import ndcg_compare_report
from qrelllm.queries import LoadQueries
from qrelllm.llm.vertex import TestCollection
from qrelllm.llm.openai import RelDecision
from qrelllm.eval import CohenKappa
from qrelllm.format import format_test_collection
from qrelllm.es.es import ping, index, run_with_kuromoji, run_with_ngram

In [None]:
# Google Cloud settings are read from the environment; an unset variable yields None.
project, location = (
    os.environ.get(env_name)
    for env_name in ("GOOGLE_CLOUD_PROJECT_ID", "GOOGLE_CLOUD_LOCATION")
)

## テストコレクション生成

In [None]:
# Elasticsearch: the index name reused by the search runs, and a client for the local node.
index_name = "docs"
es_hosts = ["http://localhost:9200"]
client = Elasticsearch(hosts=es_hosts)
ping(client)  # presumably a connectivity check -- confirm in qrelllm.es.es

# Task objects: the query loader is handed to the test-collection task as its `queries` input.
queries = LoadQueries(csv_file_path="../data/queries.csv")
collection_params = dict(
    project=project,
    location=location,
    queries=queries,
    size=300,
)
testcollection = TestCollection(**collection_params)


## 文字列統計量の比較

In [None]:
# See compare.ipynb for this section.

## 判定者間不一致の評価

In [None]:
# Judge A is the existing test-collection task; judge B is a RelDecision task built on top of A.
testcollection_a = testcollection
testcollection_b = RelDecision(testcollection=testcollection_a)

# rerun=True is forwarded to the task; presumably it forces recomputation -- confirm in gokart docs.
kappa_task = CohenKappa(
    testcollection_a=testcollection_a,
    testcollection_b=testcollection_b,
    rerun=True,
)
gokart.build(kappa_task)

## オフライン評価

In [None]:
# Build the test-collection task and pass its result through the formatting helper.
df = format_test_collection(gokart.build(testcollection))

# Index the formatted collection with the Elasticsearch client.
index(client, df)

# Two search runs against the same index (n-gram vs. kuromoji, going by the helper names).
run1_df = run_with_ngram(client, index_name, df)
run2_df = run_with_kuromoji(client, index_name, df)

# Compare both runs against the collection and print the resulting report.
report = ndcg_compare_report(df, run1_df, run2_df)
print(report)

In [None]:
# Extract both runs' nDCG@10 scores and the run_1-vs-run_2 comparison value from the report.
report_dict = report.to_dict()
metric = "ndcg@10"
score_a = report_dict["run_1"]["scores"][metric]
score_b = report_dict["run_2"]["scores"][metric]
p_value = report_dict["run_1"]["comparisons"]["run_2"][metric]

# Fail (exit status 1) when score_b loses to score_a and the difference is significant.
significantly_worse = score_b < score_a and p_value < 0.05
if significantly_worse:
    print(report)
    sys.exit(1)