In [1]:
import pandas as pd
from utils.Evaluation import *
from utils.Loader import *
from utils.Retriever import *
from sentence_transformers import SentenceTransformer
from FlagEmbedding import LightWeightFlagLLMReranker

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


### Configuration

In [2]:
# Reference corpora that the loaders read: the FAQ JSON file plus the finance and insurance directories
Config.faq_path = Path('../data/reference/faq/pid_map_content.json')
Config.finance_path = Path('../data/reference/finance')
Config.insurance_path = Path('../data/reference/insurance')
# Ground-truth answer file that predictions are evaluated against
Config.truth_path = Path('../data/dataset/preliminary/ground_truths_example.json')
# Prediction file produced by the baseline method (evaluated at the end of the notebook for comparison)
Config.prediction_path = Path('../data/dataset/preliminary/pred_retrieve.json')
# Prediction file for this notebook's own method (hybrid retrieval + reranker)
Config.my_prediction_path = Path('../data/dataset/my_ans/pred_retrieve.json')
# Question (query) file
Config.queries_info_path = Path('../data/dataset/preliminary/questions_example.json')
# Directory where downloaded model weights are cached (passed to the reranker as cache_dir).
# NOTE: this is an absolute, machine-specific path - adjust it for your environment.
Config.model_cache = Path('/HDD/model_cache/')

### Load Data
Prepare two types of datasets: one for BM25 (chunk size = 1 page of the PDF) and another for vector embedding (chunk size = 512 tokens). Additionally, we separate the datasets by their category (e.g., insurance, finance, FAQ).

In [3]:
# BM25 corpora: documents are loaded without chunking (is_chunking=False).
bm25_insurance_corpus_df, bm25_finance_corpus_df = [
    load_data(source_path=corpus_dir, is_chunking=False)
    for corpus_dir in (Config.insurance_path, Config.finance_path)
]
bm25_faq_corpus_df = load_faq(Config.faq_path)

# Vector-embedding corpora: the same sources, loaded with chunking enabled.
vector_insurance_corpus_df, vector_finance_corpus_df = [
    load_data(source_path=corpus_dir, is_chunking=True)
    for corpus_dir in (Config.insurance_path, Config.finance_path)
]
# The FAQ corpus goes through load_faq in both cases (no chunking switch is passed).
vector_faq_corpus_df = load_faq(Config.faq_path)

Loading data: 100%|██████████| 643/643 [00:02<00:00, 291.48it/s]
Loading data: 100%|██████████| 1035/1035 [00:19<00:00, 51.86it/s]
Loading data: 100%|██████████| 617/617 [00:00<00:00, 566276.93it/s]
Loading data: 100%|██████████| 643/643 [00:02<00:00, 293.36it/s]
Loading data: 100%|██████████| 1035/1035 [00:20<00:00, 50.23it/s]
Loading data: 100%|██████████| 617/617 [00:00<00:00, 512655.62it/s]


Load the queries and split them into the three categories (insurance, finance, FAQ).

In [4]:
# Config.queries_info_path: queries of the training dataset.
# Config.formal_queries_info_path: queries of the formal competition.
# NOTE(review): formal_queries_info_path is not set in the configuration cell of this
# notebook - define it there before switching the argument below to the formal queries.
queries_info_df = get_queried_info(Config.queries_info_path)

# One query frame per category, selected with a boolean mask on the 'category' column.
insurance_queries_info_df = queries_info_df.loc[queries_info_df['category'].eq('insurance')]
finance_queries_info_df = queries_info_df.loc[queries_info_df['category'].eq('finance')]
faq_queries_info_df = queries_info_df.loc[queries_info_df['category'].eq('faq')]

Load the intfloat/multilingual-e5-large dense vector embedding model for hybrid retrieval.

In [5]:
# Load the dense embedding model. The cache directory comes from Config.model_cache
# (the same setting the reranker uses) instead of a duplicated hard-coded path, so
# changing the configuration cell relocates both models. str() is applied because
# cache_folder is documented as a string path.
embedder = SentenceTransformer("intfloat/multilingual-e5-large", cache_folder=str(Config.model_cache))

Calculate the BM25 and embedding vector scores, along with the rankings, for the dataset with respect to each query.

In [6]:
# --- BM25 retrieval: each category's queries against its BM25 corpus ---
insurance_bm25_retrieve: pd.DataFrame = BM25Retrieve.retrieve(
    queries_info=insurance_queries_info_df,
    corpus_df=bm25_insurance_corpus_df,
)
finance_bm25_retrieve: pd.DataFrame = BM25Retrieve.retrieve(
    queries_info=finance_queries_info_df,
    corpus_df=bm25_finance_corpus_df,
)
faq_bm25_retrieve: pd.DataFrame = BM25Retrieve.retrieve(
    queries_info=faq_queries_info_df,
    corpus_df=bm25_faq_corpus_df,
)

# --- Dense-vector retrieval: same queries, embedded with `embedder`, against the vector corpus ---
insurance_vector_retrieve: pd.DataFrame = VectorRetriever.retrieve(
    embedder,
    queries_info=insurance_queries_info_df,
    corpus_df=vector_insurance_corpus_df,
)
finance_vector_retrieve: pd.DataFrame = VectorRetriever.retrieve(
    embedder,
    queries_info=finance_queries_info_df,
    corpus_df=vector_finance_corpus_df,
)
faq_vector_retrieve: pd.DataFrame = VectorRetriever.retrieve(
    embedder,
    queries_info=faq_queries_info_df,
    corpus_df=vector_faq_corpus_df,
)

  0%|          | 0/50 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.423 seconds.
Prefix dict has been built successfully.
100%|██████████| 50/50 [00:02<00:00, 19.54it/s]
100%|██████████| 50/50 [00:02<00:00, 19.08it/s]
100%|██████████| 50/50 [00:00<00:00, 171.05it/s]
100%|██████████| 50/50 [00:30<00:00,  1.65it/s]
100%|██████████| 50/50 [00:45<00:00,  1.09it/s]
100%|██████████| 50/50 [00:06<00:00,  8.23it/s]


Perform hybrid retrieval using RRF (refer to README.md for details).

In [7]:
# Step 1: combine the BM25 and vector retrieval results of each category into one frame.
insurance_hybrid_retrieve_pd: pd.DataFrame = get_hybrid_retrieve_pd(
    bm25_retrieve_pd=insurance_bm25_retrieve,
    vector_retrieve_pd=insurance_vector_retrieve,
)
finance_hybrid_retrieve_pd: pd.DataFrame = get_hybrid_retrieve_pd(
    bm25_retrieve_pd=finance_bm25_retrieve,
    vector_retrieve_pd=finance_vector_retrieve,
)
faq_hybrid_retrieve_pd: pd.DataFrame = get_hybrid_retrieve_pd(
    bm25_retrieve_pd=faq_bm25_retrieve,
    vector_retrieve_pd=faq_vector_retrieve,
)

# Step 2: compute the RRF score on each combined frame.
insurance_hybrid_rrf_retrieve_pd: pd.DataFrame = get_RRF_score(insurance_hybrid_retrieve_pd)
finance_hybrid_rrf_retrieve_pd: pd.DataFrame = get_RRF_score(finance_hybrid_retrieve_pd)
faq_hybrid_rrf_retrieve_pd: pd.DataFrame = get_RRF_score(faq_hybrid_retrieve_pd)

Load the reranker model

In [8]:
# Instantiate the reranker. Queries are capped at 256 tokens and passages at 512,
# with half-precision weights. Model files are cached under Config.model_cache.
# NOTE: the device is pinned to 'cuda:1' - change it if that GPU is not available.
reranker1 = LightWeightFlagLLMReranker(
    'BAAI/bge-reranker-v2.5-gemma2-lightweight',
    query_max_length=256,
    passage_max_length=512,
    use_fp16=True,
    devices=['cuda:1'],
    cache_dir=Config.model_cache,
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  8.29it/s]


Pair up the queries with their corresponding retrieved sentences, then apply a reranker.

In [9]:
# Rerank the RRF-scored candidates of every category with the same reranker instance.
reranked_insurance_retrieve = apply_reranking(
    reranker=reranker1,
    hybrid_retrieve=insurance_hybrid_rrf_retrieve_pd,
)
reranked_finance_retrieve = apply_reranking(
    reranker=reranker1,
    hybrid_retrieve=finance_hybrid_rrf_retrieve_pd,
)
reranked_faq_retrieve = apply_reranking(
    reranker=reranker1,
    hybrid_retrieve=faq_hybrid_rrf_retrieve_pd,
)

pre tokenize: 100%|██████████| 3/3 [00:00<00:00, 114.11it/s]
You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 3/3 [00:15<00:00,  5.10s/it]
pre tokenize: 100%|██████████| 3/3 [00:00<00:00, 93.45it/s]
100%|██████████| 3/3 [00:19<00:00,  6.64s/it]
pre tokenize: 100%|██████████| 6/6 [00:00<00:00, 190.15it/s]
100%|██████████| 6/6 [00:11<00:00,  1.87s/it]


Obtain the final retrieved answers based on the re-ranking scores.

In [10]:
# Select the top-1 result per category, ranked by the 'reranking_score' field.
insurance_top_1_retrieve_df = get_top_1_retrieve_pd(
    reranked_insurance_retrieve,
    score_field='reranking_score',
)
finance_top_1_retrieve_df = get_top_1_retrieve_pd(
    reranked_finance_retrieve,
    score_field='reranking_score',
)
faq_top_1_retrieve_df = get_top_1_retrieve_pd(
    reranked_faq_retrieve,
    score_field='reranking_score',
)

In [11]:
# Gather the per-category top-1 frames (insurance, finance, FAQ - in that order)
# and build the answer structure; the returned value is shown as the cell output.
top_1_retrieve_frames = [
    insurance_top_1_retrieve_df,
    finance_top_1_retrieve_df,
    faq_top_1_retrieve_df,
]
generate_answer_json(top_1_retrieve_frames)

{'answers': [{'qid': 1, 'retrieve': 392},
  {'qid': 2, 'retrieve': 606},
  {'qid': 3, 'retrieve': 83},
  {'qid': 4, 'retrieve': 186},
  {'qid': 5, 'retrieve': 162},
  {'qid': 6, 'retrieve': 116},
  {'qid': 7, 'retrieve': 107},
  {'qid': 8, 'retrieve': 78},
  {'qid': 9, 'retrieve': 62},
  {'qid': 10, 'retrieve': 472},
  {'qid': 11, 'retrieve': 7},
  {'qid': 12, 'retrieve': 526},
  {'qid': 13, 'retrieve': 526},
  {'qid': 14, 'retrieve': 526},
  {'qid': 15, 'retrieve': 536},
  {'qid': 16, 'retrieve': 54},
  {'qid': 17, 'retrieve': 606},
  {'qid': 18, 'retrieve': 184},
  {'qid': 19, 'retrieve': 315},
  {'qid': 20, 'retrieve': 292},
  {'qid': 21, 'retrieve': 36},
  {'qid': 22, 'retrieve': 614},
  {'qid': 23, 'retrieve': 99},
  {'qid': 24, 'retrieve': 359},
  {'qid': 25, 'retrieve': 4},
  {'qid': 26, 'retrieve': 147},
  {'qid': 27, 'retrieve': 171},
  {'qid': 28, 'retrieve': 298},
  {'qid': 29, 'retrieve': 524},
  {'qid': 30, 'retrieve': 327},
  {'qid': 31, 'retrieve': 10},
  {'qid': 32, 're

Evaluate the performance of hybrid retrieval combined with re-ranking on the training dataset.

In [12]:
# Result of this notebook's method: score the predictions stored at
# Config.my_prediction_path against the ground truth at Config.truth_path.
# NOTE(review): no cell visible in this notebook writes Config.my_prediction_path
# explicitly - confirm that generate_answer_json saves there, otherwise this may
# evaluate a stale file.
evaluation(Config.my_prediction_path, Config.truth_path)

insurance: 0.8400
finance: 0.8400
faq: 0.9600
total: 0.88


Evaluate the performance of the baseline code on the same training dataset for comparison.

In [13]:
# Baseline: score the baseline prediction file (Config.prediction_path) against
# the same ground truth (Config.truth_path) for comparison with the result above.
evaluation(Config.prediction_path, Config.truth_path)

insurance: 0.8000
finance: 0.4400
faq: 0.9000
total: 0.7133333333333334
