In [1]:
import os
import pandas as pd
import numpy as np
from dynaconf import Dynaconf

from elqm import ELQMPipeline
from elqm.eval import Evaluation
from elqm.utils import get_configs_dir, get_data_dir

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the development settings file inside the project's configs directory
# and hand it to Dynaconf as the settings source.
dev_settings_path = os.path.join(get_configs_dir(), 'dev.yaml')
config = Dynaconf(settings_files=dev_settings_path)

In [3]:
# Build the ELQM pipeline from the loaded settings.
# Going by the output recorded for this run, construction loads and splits the
# documents, creates the embedding model and builds the FAISS vector stores;
# with no cache present this took several minutes.
elqm = ELQMPipeline(config)

Cache key: dev
No cache found for dev


Loading documents: 100%|██████████| 508/508 [00:00<00:00, 7043.66it/s]
Removing HTML tags: 100%|██████████| 508/508 [00:10<00:00, 47.11it/s]
Saving documents: 100%|██████████| 508/508 [00:00<00:00, 3227.06it/s]
100%|██████████| 508/508 [00:01<00:00, 390.06it/s]


Created Document Loader DirectoryLoader
Loaded 508 documents
Created Splitter RecursiveCharacterTextSplitter
Split into 18322 chunks
bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522
Created Embedding GPT4AllEmbeddings
Index file:  /home/psaegert/Projects/elqm-INLPT-WS2023/cache/dev/FAISS/full_prototype_pipeline/index.faiss


Creating FAISS vectorstores: 100%|██████████| 100/100 [06:41<00:00,  4.01s/it]


Created Retriever VectorStoreRetriever


In [4]:
# Read the oracle table (source, type, question, answer) from the raw-data
# directory and show it as the cell output.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was written together with its index - consider index_col=0 once it is
# confirmed that the evaluation does not rely on that column.
oracle_dir = get_data_dir("elqm-raw/oracle")
oracle_csv_path = os.path.join(oracle_dir, "random_100_20240102.csv")
oracle_data = pd.read_csv(oracle_csv_path)
oracle_data

Unnamed: 0,source,type,question,answer
0,32015R1188,confirmation,Is the product equipped with an automatically ...,Yes.
1,32015R1188,confirmation,Does the product have a fan-assisted heat outp...,No.
2,32015R1188,confirmation,Can the product's heat output be regulated by ...,Yes.
3,32015R1188,confirmation,Is the product equipped with a non-electronic ...,No.
4,32015R1188,confirmation,Does the product have a day timer or week time...,Yes.
...,...,...,...,...
3013,32016R1719,default,What is the timeline for developing necessary ...,No later than six months after the request by ...
3014,32016R1719,default,Can long-term transmission rights be issued by...,"Yes, upon a joint request of the TSOs on a bid..."
3015,32016R1719,default,What are the different forms in which long-ter...,Long-term cross-zonal capacity shall be alloca...
3016,32016R1719,default,What is the timeline for developing a proposal...,No later than six months after the coordinated...


In [5]:
# Create the evaluation helper (constructed without arguments).
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the kernel session; renaming it would also require updating the cell that
# calls eval.evaluate(...).
eval = Evaluation()

In [6]:
# Evaluate the pipeline on the first 100 rows of the oracle table only.
# The returned mapping holds nested 'bleu', 'rouge' and 'bertscore' entries
# that are flattened in the following cell.
evaluation_subset = oracle_data.iloc[:100]
results = eval.evaluate(elqm, evaluation_subset)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 100/100 [03:19<00:00,  1.99s/it]


In [7]:
# Flatten the nested per-sample metric dicts into flat entries so that the
# results can be turned into a DataFrame of plain numbers.
#
# Every section removes its nested entry once it has been flattened. The
# membership guards make the cell idempotent: re-running it after the nested
# entry is gone is a no-op instead of a KeyError.

if 'bleu' in results:
    bleu_scores = results['bleu']
    results['answer_bleu_bleu'] = [r['bleu'] for r in bleu_scores]
    # Each sample carries several precision values; collapse them to their mean.
    results['answer_bleu_precision'] = [np.mean(r['precisions']) for r in bleu_scores]
    results['answer_bleu_brevity_penalty'] = [r['brevity_penalty'] for r in bleu_scores]
    results['answer_length_ratio'] = [r['length_ratio'] for r in bleu_scores]
    results['answer_translation_length'] = [r['translation_length'] for r in bleu_scores]
    results['answer_reference_length'] = [r['reference_length'] for r in bleu_scores]
    del results['bleu']

if 'rouge' in results:
    rouge_scores = results['rouge']
    results['answer_rouge_rouge1'] = [r['rouge1'] for r in rouge_scores]
    results['answer_rouge_rouge2'] = [r['rouge2'] for r in rouge_scores]
    results['answer_rouge_rougeL'] = [r['rougeL'] for r in rouge_scores]
    results['answer_rougeLsum'] = [r['rougeLsum'] for r in rouge_scores]
    del results['rouge']

if 'bertscore' in results:
    bertscore_scores = results['bertscore']
    # Only the first element of each BERTScore field is kept
    # (presumably one score per evaluated sample - TODO confirm).
    results['answer_bertscore_precision'] = [r['precision'][0] for r in bertscore_scores]
    results['answer_bertscore_recall'] = [r['recall'][0] for r in bertscore_scores]
    results['answer_bertscore_f1'] = [r['f1'][0] for r in bertscore_scores]
    del results['bertscore']

In [8]:
# Put the flattened metrics into a table and average each metric over the
# evaluated samples; the resulting Series is the cell output.
metrics_table = pd.DataFrame(results)
metrics_table.mean()

retriever_recall                 0.720000
retriever_precision              0.144000
retriever_f1                     0.240000
answer_bleu_bleu                 0.053291
answer_bleu_precision            0.063143
answer_bleu_brevity_penalty      1.000000
answer_length_ratio             19.196266
answer_translation_length      161.950000
answer_reference_length         21.260000
answer_rouge_rouge1              0.164645
answer_rouge_rouge2              0.108550
answer_rouge_rougeL              0.143270
answer_rougeLsum                 0.142197
answer_bertscore_precision       0.819107
answer_bertscore_recall          0.885989
answer_bertscore_f1              0.850884
dtype: float64