# Validation Pipeline

## Import Dependencies

In [None]:
import random
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

from llama_index.core import Settings, Document
from llama_index.core import StorageContext 
from llama_index.core import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from tqdm import tqdm

from transformers import AutoTokenizer

import answer_tokenizer as at

In [None]:
# No LLM is configured.  Presumably the query engines then return the filled
# prompt (retrieved context included) instead of a generated answer, which is
# the format get_response() parses -- TODO confirm for the LlamaIndex version in use.
Settings.llm = None

# Set Hugging Face embedding model for LlamaIndex
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Hugging Face tokenizer setup
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this notebook.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

## Method(s)

In [3]:
def map_found(founds, answer):
    """Score retrieved chunks against the reference chunks of one answer.

    Every distinct chunk that occurs in either input becomes one sample of a
    binary classification problem: its true label is 1 when it belongs to the
    reference ``answer`` and its predicted label is 1 when it was retrieved
    (``founds``).  Chunks are matched by exact string equality.

    Args:
        founds: Retrieved chunks (list of str).
        answer: Reference chunks (list of str).

    Returns:
        Tuple ``(accuracy, f1, recall, precision)``.  ``recall`` is computed
        for the positive class; ``f1`` and ``precision`` use the
        support-weighted average over both classes.

    Raises:
        TypeError: If either argument is a plain string instead of a list of
            chunks (a string would otherwise be compared character by
            character).
    """
    if isinstance(founds, str) or isinstance(answer, str):
        raise TypeError("map_found expects lists of chunks, not a single string")

    # Sets give O(1) membership tests while building the label vectors.
    retrieved = set(founds)
    relevant = set(answer)

    # Union of both inputs: every unique chunk is evaluated exactly once.
    all_sentences = list(retrieved | relevant)

    # Ground truth comes from the reference answer, predictions from the
    # retrieval result.  (With the roles reversed, "recall" would actually
    # measure precision.)
    y_true = [1 if sentence in relevant else 0 for sentence in all_sentences]
    y_pred = [1 if sentence in retrieved else 0 for sentence in all_sentences]

    # zero_division=0 keeps the value scikit-learn already falls back to for an
    # undefined ratio, without emitting a warning per call.
    # NOTE(review): recall uses the default binary averaging while f1 and
    # precision use average='weighted'; kept as in the original -- confirm
    # whether the mix is intended.
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)

    return accuracy, f1, recall, precision

In [4]:
def get_response(resp):
    """Extract the list of context chunks from a query response string.

    The response is cut at every dashed separator line; the section that
    follows the first separator is split on blank lines and returned as a
    list of strings.  An IndexError is raised when the response contains no
    separator.
    """
    sections = resp.split("\n---------------------\n")
    context_section = sections[1]
    return context_section.split("\n\n")

## Load Data

In [5]:
# Path prefix used when building data file locations (see the read_json call).
dir_data = "../data/"

In [None]:
# Load the validation set from a records-oriented JSON file and inspect it.
# NOTE(review): the file name is spelled "mqdquad.json" while the variable is
# `medquad` -- confirm the name on disk before changing either.
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
medquad.info()
medquad.head(10)

In [None]:
# Keep only the first 25 rows; every later cell works on this reduced frame.
medquad = medquad[:25]
# Separate copy of the reduced frame (reassigned again by the sampling cell).
medquad_sp = medquad.copy()
medquad_sp.info()
medquad_sp.head(10)

In [None]:
# Draw a reproducible random sample (fixed random_state) holding 10% of the
# rows of `medquad`, rounded down, and renumber its index from 0.
# NOTE(review): the validation functions visible in this notebook read
# `medquad`, not `medquad_sp`.
len_mq = int(len(medquad)*0.1)
medquad_sp = medquad.sample(n=len_mq, random_state=42).copy()
medquad_sp = medquad_sp.reset_index(drop=True)
medquad_sp.info()
medquad_sp.head()

## Live

### Sentence Based

#### Methods

In [37]:
def create_sb_validation(index, top_k, df=None):
    """Run sentence-based retrieval for k = 1..top_k and store the metrics.

    For every k a query engine with ``similarity_top_k=k`` is created from
    ``index``, every question is queried, and the retrieved chunks are scored
    with ``map_found`` against the NLTK sentence split of the reference
    answer.  The frame is modified in place; for each k the columns
    ``answer_sb_k{k}``, ``ACC_sb_k{k}``, ``F1_sb_k{k}``, ``RECALL_sb_k{k}``
    and ``PRECISION_sb_k{k}`` are added.

    Args:
        index: Object exposing ``as_query_engine(similarity_top_k=...)``.
        top_k: Largest ``similarity_top_k`` value to evaluate (inclusive).
        df: DataFrame with ``question`` and ``answer`` columns.  Defaults to
            the notebook-level ``medquad`` frame, which keeps existing calls
            working unchanged.

    Returns:
        None.
    """
    if df is None:
        df = medquad

    questions = df["question"].values
    # The reference split does not depend on k, so compute it once.
    references = [sent_tokenize(text) for text in df["answer"].values]

    for k in range(1, top_k + 1):
        query_engine = index.as_query_engine(similarity_top_k=k)

        resps = [
            get_response(query_engine.query(q).response)
            for q in tqdm(questions)
        ]
        df[f"answer_sb_k{k}"] = resps

        scores = [
            map_found(found, reference)
            for found, reference in tqdm(zip(resps, references), total=len(resps))
        ]

        # Transpose the (acc, f1, recall, precision) tuples into four columns;
        # written this way so an empty frame still yields four empty lists.
        accs, f1s, recalls, precisions = (
            [score[pos] for score in scores] for pos in range(4)
        )

        df[f"ACC_sb_k{k}"] = accs
        df[f"F1_sb_k{k}"] = f1s
        df[f"RECALL_sb_k{k}"] = recalls
        df[f"PRECISION_sb_k{k}"] = precisions

In [38]:
def get_sb_result_df(medquad, top_k):
    """Summarise the sentence-based metrics as one row per k.

    Args:
        medquad: DataFrame holding the ``ACC_sb_k{k}``, ``F1_sb_k{k}``,
            ``RECALL_sb_k{k}`` and ``PRECISION_sb_k{k}`` columns for every k
            from 1 to ``top_k``.
        top_k: Largest k to include (inclusive).

    Returns:
        DataFrame with the columns ``k``, ``acc``, ``f1``, ``recall`` and
        ``precision``, where each metric is the column mean for that k.
    """
    # One summary record per k, each holding the column means.
    summary_rows = [
        {
            "k": k,
            "acc": medquad[f"ACC_sb_k{k}"].mean(),
            "f1": medquad[f"F1_sb_k{k}"].mean(),
            "recall": medquad[f"RECALL_sb_k{k}"].mean(),
            "precision": medquad[f"PRECISION_sb_k{k}"].mean(),
        }
        for k in range(1, top_k + 1)
    ]
    return pd.DataFrame(summary_rows)

In [None]:
# Load the persisted sentence-based vector index from disk.
storage_context_sb = StorageContext.from_defaults(persist_dir="../data/vectors/sentence_based")
index_sb = load_index_from_storage(storage_context_sb)

# Evaluate retrieval for similarity_top_k = 1..10; the result columns are added to `medquad`.
create_sb_validation(index_sb, 10)

In [None]:
# Mean sentence-based metrics per k (k = 1..10), one row per k.
result_sb_dataframe = get_sb_result_df(medquad, 10)
result_sb_dataframe.info()
result_sb_dataframe.head(10)

In [26]:
# Load the sentence-based index on its own, without running the full
# validation (same persist_dir as the validation cell).
storage_context_sb = StorageContext.from_defaults(persist_dir="../data/vectors/sentence_based")
index_sb = load_index_from_storage(storage_context_sb)

In [None]:
# Spot check on the first question: retrieve with similarity_top_k=5 and score
# the retrieved chunks against the sentence-tokenized reference answer.
query_engine_sb = index_sb.as_query_engine(similarity_top_k=5)
resps = get_response(query_engine_sb.query(medquad["question"].values[0]).response)
map_found(resps, sent_tokenize(medquad["answer"].values[0]))

### Token Size Based

In [8]:
def create_tb_validation(index, top_k, size, df=None):
    """Run token-based retrieval for k = 1..top_k and store the metrics.

    For every k a query engine with ``similarity_top_k=k`` is created from
    ``index``, every question is queried, and the retrieved chunks are scored
    with ``map_found`` against the reference answer chunked by
    ``at.create_fixed_length_chunks_with_tokenizer``.  The frame is modified
    in place; for each k the columns ``answer_tb_{size}_k{k}``,
    ``ACC_tb_{size}_k{k}``, ``F1_tb_{size}_k{k}``, ``RECALL_tb_{size}_k{k}``
    and ``PRECISION_tb_{size}_k{k}`` are added.

    Args:
        index: Object exposing ``as_query_engine(similarity_top_k=...)``.
        top_k: Largest ``similarity_top_k`` value to evaluate (inclusive).
        size: Token size label of the index; used only to name the columns.
        df: DataFrame with ``question`` and ``answer`` columns.  Defaults to
            the notebook-level ``medquad`` frame, which keeps existing calls
            working unchanged.

    Returns:
        None.
    """
    if df is None:
        df = medquad

    questions = df["question"].values
    # The reference chunks do not depend on k, so compute them once.
    # NOTE(review): `size` is not forwarded to the chunker, so the reference is
    # always split with the helper's defaults, whatever index size is being
    # evaluated.  Exact-match scoring can then hardly succeed; the helper's
    # signature is not visible here -- confirm and pass the size through.
    references = [
        at.create_fixed_length_chunks_with_tokenizer(text)
        for text in df["answer"].values
    ]

    for k in range(1, top_k + 1):
        query_engine = index.as_query_engine(similarity_top_k=k)

        resps = [
            get_response(query_engine.query(q).response)
            for q in tqdm(questions)
        ]
        df[f"answer_tb_{size}_k{k}"] = resps

        scores = [
            map_found(found, reference)
            for found, reference in tqdm(zip(resps, references), total=len(resps))
        ]

        # Transpose the (acc, f1, recall, precision) tuples into four columns;
        # written this way so an empty frame still yields four empty lists.
        accs, f1s, recalls, precisions = (
            [score[pos] for score in scores] for pos in range(4)
        )

        df[f"ACC_tb_{size}_k{k}"] = accs
        df[f"F1_tb_{size}_k{k}"] = f1s
        df[f"RECALL_tb_{size}_k{k}"] = recalls
        df[f"PRECISION_tb_{size}_k{k}"] = precisions

In [9]:
def get_tb_result_df(medquad, top_k, size):
    """Summarise the token-based metrics of one token size as one row per k.

    Args:
        medquad: DataFrame holding the ``ACC_tb_{size}_k{k}``,
            ``F1_tb_{size}_k{k}``, ``RECALL_tb_{size}_k{k}`` and
            ``PRECISION_tb_{size}_k{k}`` columns for every k from 1 to
            ``top_k``.
        top_k: Largest k to include (inclusive).
        size: Token size whose columns are summarised.

    Returns:
        DataFrame with the columns ``k``, ``acc``, ``f1``, ``recall`` and
        ``precision``, where each metric is the column mean for that k.
    """
    # One summary record per k, each holding the column means.
    summary_rows = [
        {
            "k": k,
            "acc": medquad[f"ACC_tb_{size}_k{k}"].mean(),
            "f1": medquad[f"F1_tb_{size}_k{k}"].mean(),
            "recall": medquad[f"RECALL_tb_{size}_k{k}"].mean(),
            "precision": medquad[f"PRECISION_tb_{size}_k{k}"].mean(),
        }
        for k in range(1, top_k + 1)
    ]
    return pd.DataFrame(summary_rows)

In [17]:
token_size_list = [512, 256, 128, 64, 32, 16]

# Run the token-based validation once per persisted index size.  The bar
# description is refreshed inside the loop: an f-string passed as `desc=` is
# evaluated only once, so it would keep showing the first size.
size_progress = tqdm(token_size_list)
for size in size_progress:
    size_progress.set_description(f"Token size: {size}")
    storage_context_tb = StorageContext.from_defaults(persist_dir=f"../data/vectors/token_based_{size}")
    index_tb = load_index_from_storage(storage_context_tb)

    create_tb_validation(index_tb, 10, size)

100%|██████████| 25/25 [00:00<00:00, 50.48it/s] ?it/s]
100%|██████████| 25/25 [00:04<00:00,  5.92it/s]
100%|██████████| 25/25 [00:00<00:00, 49.95it/s]
100%|██████████| 25/25 [00:04<00:00,  5.74it/s]
100%|██████████| 25/25 [00:00<00:00, 49.68it/s]
100%|██████████| 25/25 [00:04<00:00,  6.02it/s]
100%|██████████| 25/25 [00:00<00:00, 49.82it/s]
100%|██████████| 25/25 [00:04<00:00,  6.12it/s]
100%|██████████| 25/25 [00:00<00:00, 49.38it/s]
100%|██████████| 25/25 [00:04<00:00,  5.78it/s]
100%|██████████| 25/25 [00:00<00:00, 48.42it/s]
100%|██████████| 25/25 [00:04<00:00,  6.02it/s]
100%|██████████| 25/25 [00:00<00:00, 46.79it/s]
100%|██████████| 25/25 [00:04<00:00,  5.88it/s]
100%|██████████| 25/25 [00:00<00:00, 47.11it/s]
100%|██████████| 25/25 [00:04<00:00,  6.00it/s]
100%|██████████| 25/25 [00:00<00:00, 47.06it/s]
100%|██████████| 25/25 [00:04<00:00,  6.07it/s]
100%|██████████| 25/25 [00:00<00:00, 45.80it/s]
100%|██████████| 25/25 [00:04<00:00,  6.07it/s]
  medquad[f"PRECISION_tb_{size}_k

In [25]:
# Inspect the frame after all validation columns have been added.
medquad.info()

# Mean token-based metrics per k (k = 1..10) for the 16-token index.
get_tb_result_df(medquad, 10, 16)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Columns: 302 entries, question to PRECISION_tb_16_k10
dtypes: float64(240), object(62)
memory usage: 59.1+ KB


Unnamed: 0,k,acc,f1,recall,precision
0,1,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0
5,6,0.0,0.0,0.0,0.0
6,7,0.0,0.0,0.0,0.0
7,8,0.0,0.0,0.0,0.0
8,9,0.0,0.0,0.0,0.0
9,10,0.0,0.0,0.0,0.0


## Demo

In [None]:
# Demo inputs: one question with its reference answer, plus distractor text
# built by concatenating the answers of two other rows.
question  = medquad["question"].values[10]
real_answer  = medquad["answer"].values[10]
fake_answer = medquad["answer"].values[5] + medquad["answer"].values[20]

In [None]:
# Cut the distractor text at every period, drop fragments of two characters
# or fewer, and trim surrounding whitespace from the rest.
fake_answer_sentence = [
    piece.strip() for piece in fake_answer.split(".") if len(piece) > 2
]
# Fixed seed so the shuffled order is reproducible.
random.seed(10)
random.shuffle(fake_answer_sentence)
fake_answer_sentence

In [None]:
# Cut the reference answer at every period, drop fragments of two characters
# or fewer, and trim surrounding whitespace from the rest.
real_answer_sentence = [
    piece.strip() for piece in real_answer.split(".") if len(piece) > 2
]
# Fixed seed so the shuffled order is reproducible.
random.seed(10)
random.shuffle(real_answer_sentence)
real_answer_sentence

In [None]:
# Simulated retrieval result: up to five of the shuffled reference sentences
# plus up to three distractor sentences, mixed with a fixed seed.
demo_found = (real_answer_sentence[:5] + fake_answer_sentence[:3])
random.seed(22)
random.shuffle(demo_found)
demo_found

In [None]:
# Score the simulated retrieval against the reference sentences.  map_found
# works on two lists of strings, so the sentence list is passed instead of the
# raw answer string (list + str raises TypeError inside map_found).
acc, f1, recall, precision = map_found(demo_found, real_answer_sentence)

print("Accuracy scores: ", acc)
print("f1 scores", f1)
print("Recall scores", recall)
print("Precision scores", precision)
# NOTE(review): calculate_mrr_from_chunks is neither defined nor imported in
# the visible part of this notebook -- confirm where it comes from and which
# argument types it expects.
print("Mean Reciprocal Rank (MRR): ", calculate_mrr_from_chunks(demo_found, real_answer))