# Validation Pipeline

## Import Dependencies

In [None]:
import random
import evaluate
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

from llama_index.core import Settings
from llama_index.core import StorageContext 
from llama_index.core import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from tqdm import tqdm

In [None]:
# Set LLM to None
# NOTE(review): presumably this disables answer generation so that a query
# returns only the retrieved context — confirm against the LlamaIndex docs.
Settings.llm = None

# Set Hugging Face embedding model for LlamaIndex
# NOTE(review): presumably this has to be the same model that was used to
# build the persisted index loaded later in this notebook — verify.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2")

## Method(s)

In [3]:
def calculate_mrr_from_chunks(search_results, full_answer):
    """
    Compute the reciprocal rank of the first relevant chunk in a ranked result list.

    A chunk counts as relevant when it occurs as a substring of ``full_answer``.
    Only one query is scored per call, so the "mean" reciprocal rank is simply
    the reciprocal rank of that query.

    :param search_results: Ranked list of text chunks returned by the semantic search.
    :param full_answer: The complete, correct answer (string).
    :return: ``1 / rank`` (1-based) of the first relevant chunk, or ``0.0`` when
        no chunk is relevant or the result list is empty.
    """
    for rank, chunk in enumerate(search_results, start=1):
        # Only the best-ranked relevant chunk matters, so stop at the first hit.
        if chunk in full_answer:
            return 1 / rank

    # No relevant chunk was found.
    return 0.0

In [91]:
def map_found(founds, answer):
    """
    Score how many items of the reference answer were retrieved.

    Every item of ``answer`` is treated as a positive reference label (1); the
    predicted label is 1 when that item is contained in ``founds`` and 0
    otherwise. The two label lists are then passed to the scikit-learn metrics.

    :param founds: Collection of retrieved items, tested with ``in``.
    :param answer: Sequence of reference items (e.g. sentences of the answer).
    :return: Tuple ``(accuracy, weighted f1, recall, weighted precision)``.
    """
    expected = [1] * len(answer)
    hits = [1 if item in founds else 0 for item in answer]

    return (
        accuracy_score(expected, hits),
        f1_score(expected, hits, average='weighted'),
        recall_score(expected, hits),
        precision_score(expected, hits, average='weighted'),
    )

In [86]:
# Disabled variant of map_found, kept for reference: it labels each retrieved
# item in `founds` by whether it occurs in `answer`, instead of labelling each
# item of `answer` by whether it occurs in `founds`.
# def map_found_af(founds, answer):
#     results = [0] * len(founds)
#     real = [1] * len(founds)

#     for i in range(0, len(founds)):
#         if founds[i] in answer:
#             results[i] = 1

#     acc = accuracy_score(real, results)
#     f1 = f1_score(real, results, average='weighted')
#     recall = recall_score(real, results)
#     precision = precision_score(real, results, average='weighted')

#     return acc, f1, recall, precision

In [5]:
def get_response(resp):
    """
    Extract the list of chunks from a response text.

    The text is cut at every dashed separator line; the second section (the
    one right after the first separator) is split on blank lines and returned.

    :param resp: Response text containing at least one separator line.
    :return: List of strings from the second section.
    :raises IndexError: If ``resp`` contains no separator line.
    """
    sections = resp.split("\n---------------------\n")
    second_section = sections[1]
    return second_section.split("\n\n")

## Load Data

In [6]:
# Base directory (relative path) of the data files read by this notebook.
dir_data = "../data/"

In [None]:
# Load the question/answer validation set from a records-oriented JSON file.
# NOTE(review): the file is named "mqdquad.json" while the variable is
# `medquad` — confirm the spelling matches the file on disk.
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
# Print the frame summary and display the first rows as a sanity check.
medquad.info()
medquad.head()

In [116]:
# Draw a reproducible 10% sample of the validation set (fixed random_state)
# and renumber its rows from 0.
# NOTE(review): the cells below keep using `medquad`, not `medquad_sp` —
# confirm which frame the evaluation is meant to run on.
len_mq = int(len(medquad) * 0.1)
medquad_sp = (
    medquad.sample(n=len_mq, random_state=42)
    .copy()
    .reset_index(drop=True)
)
medquad_sp.info()
medquad_sp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1640 entries, 0 to 1639
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  1640 non-null   object
 1   answer    1640 non-null   object
dtypes: object(2)
memory usage: 25.8+ KB


Unnamed: 0,question,answer
0,What are the treatments for dihydropyrimidinas...,These resources address the diagnosis or manag...
1,Who is at risk for Parasites - Cysticercosis? ?,Cysticercosis is an infection caused by the la...
2,What is (are) phenylketonuria ?,Phenylketonuria (commonly known as PKU) is an ...
3,Is Laron syndrome inherited ?,Is Laron syndrome inherited? Most cases of Lar...
4,What are the treatments for globozoospermia ?,These resources address the diagnosis or manag...


## Live

### Sentence Based

In [113]:
# Load the persisted sentence-based vector index from the shared data
# directory and build a query engine configured with similarity_top_k=5.
storage_context_sb = StorageContext.from_defaults(persist_dir=f"{dir_data}vectors/sentence_based")
index_sb = load_index_from_storage(storage_context_sb)
query_engine_sb = index_sb.as_query_engine(similarity_top_k=5)

In [None]:
# Smoke test on the first question: extract the chunks from the query
# response text and score them against the sentence-tokenized reference answer.
resps = get_response(query_engine_sb.query(medquad["question"].values[0]).response)
map_found(resps, sent_tokenize(medquad["answer"].values[0]))

In [112]:
# f1_metric = evaluate.load("f1")
# results = f1_metric.compute(predictions=["\n". join(resps)], references=[medquad["answer"].values[0]])

# results

In [None]:
# Show the extracted chunks next to the sentence-tokenized reference answer
# of the first question for a visual comparison.
print(resps,"\n")

sent_tokenize(medquad["answer"].values[0])

In [None]:
# Query the sentence-based engine with every question and keep the chunks
# extracted from each response as a new column.
# NOTE(review): the column suffix says "k1" although the query engine is
# built with similarity_top_k=5 — confirm the intended name.
resps = [
    get_response(query_engine_sb.query(question_text).response)
    for question_text in tqdm(medquad["question"].values)
]

medquad["answer_sb_k1"] = resps

medquad.head()

In [None]:
# Score every row: compare the retrieved chunks with the sentences of the
# reference answer. Only accuracy and recall are stored; f1 and precision are
# computed by map_found but not kept.
accs = []
recalls = []

for found_chunks, reference in tqdm(
    zip(medquad["answer_sb_k1"].values, medquad["answer"].values),
    total=len(medquad),
):
    acc, f1, recall, prec = map_found(found_chunks, sent_tokenize(reference))
    accs.append(acc)
    recalls.append(recall)

medquad["ACC_sb_k1"] = accs
medquad["RECALL_sb_k1"] = recalls

## Demo

In [None]:
# Pick one question with its reference answer, and build a distractor text by
# concatenating the answers of two other rows.
question  = medquad["question"].values[10]
real_answer  = medquad["answer"].values[10]
fake_answer = medquad["answer"].values[5] + medquad["answer"].values[20]

In [None]:
# Split the distractor text on periods, keep the fragments longer than two
# characters (stripped of surrounding whitespace), then shuffle them with a
# fixed seed so the demo is reproducible.
fake_answer_sentence = [
    fragment.strip()
    for fragment in fake_answer.split(".")
    if len(fragment) > 2
]
random.seed(10)
random.shuffle(fake_answer_sentence)
fake_answer_sentence

In [None]:
# Split the reference answer on periods, keep the fragments longer than two
# characters (stripped of surrounding whitespace), then shuffle them with a
# fixed seed so the demo is reproducible.
real_answer_sentence = [
    fragment.strip()
    for fragment in real_answer.split(".")
    if len(fragment) > 2
]
random.seed(10)
random.shuffle(real_answer_sentence)
real_answer_sentence

In [None]:
# Simulate a retrieval result: five fragments of the real answer mixed with
# three distractor fragments, shuffled with a fixed seed.
demo_found = (real_answer_sentence[:5] + fake_answer_sentence[:3])
random.seed(22)
random.shuffle(demo_found)
demo_found

In [None]:
# Score the simulated retrieval against the reference answer.
# map_found walks its second argument item by item, so it must receive the
# list of answer fragments; passing the raw string would compare single
# characters against whole fragments.
acc, f1, recall, precision = map_found(demo_found, real_answer_sentence)

print("Accuracy scores: ", acc)
print("f1 scores", f1)
print("Recall scores", recall)
print("Precision scores", precision)
# The MRR helper is given the raw answer text rather than the fragment list.
# calculate_mrr_from_chunks has to be defined in the session before this runs.
print("Mean Reciprocal Rank (MRR): ", calculate_mrr_from_chunks(demo_found, real_answer))