# Validation Pipeline

## Import Dependecies

In [1]:
import random
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

from llama_index.core import Settings, Document
from llama_index.core import StorageContext 
from llama_index.core import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from tqdm import tqdm

from transformers import AutoTokenizer

import answer_tokenizer as at

import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\solym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
Settings.llm = None

# Set Hugging Face embedding model for LlamaIndex
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Hugging Face tokenizer setup
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

LLM is explicitly disabled. Using MockLLM.


## Method(s)

In [3]:
def map_found(founds, answer):
    # Készítsünk egy uniót a két listából, hogy megkapjuk az összes egyedi mondatot
    all_sentences = list(set(founds + answer))

    # Bináris címkézés az átfedésekhez
    y_true = [1 if sentence in founds else 0 for sentence in all_sentences]
    y_pred = [1 if sentence in answer else 0 for sentence in all_sentences]

    # Kiértékelés: accuracy, precision, recall, f1 score
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')

    return accuracy, f1, recall, precision

In [4]:
def get_response(resp):
    return resp.split("\n---------------------\n")[1].split("\n\n")

## Load Data

In [5]:
dir_data = "../data/"

In [6]:
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
medquad.info()
medquad.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16407 entries, 0 to 16406
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  16407 non-null  object
 1   answer    16407 non-null  object
dtypes: object(2)
memory usage: 256.5+ KB


Unnamed: 0,question,answer
0,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...
1,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...
2,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...
4,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...
5,What is (are) Knobloch syndrome ?,Knobloch syndrome is a rare condition characte...
6,How many people are affected by Knobloch syndr...,Knobloch syndrome is a rare condition. However...
7,What are the genetic changes related to Knoblo...,Mutations in the COL18A1 gene can cause Knoblo...
8,Is Knobloch syndrome inherited ?,This condition is inherited in an autosomal re...
9,What are the treatments for Knobloch syndrome ?,These resources address the diagnosis or manag...


In [7]:
medquad = medquad[:1000]
medquad_sp = medquad.copy()
medquad_sp.info()
medquad_sp.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  1000 non-null   object
 1   answer    1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


Unnamed: 0,question,answer
0,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...
1,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...
2,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...
4,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...
5,What is (are) Knobloch syndrome ?,Knobloch syndrome is a rare condition characte...
6,How many people are affected by Knobloch syndr...,Knobloch syndrome is a rare condition. However...
7,What are the genetic changes related to Knoblo...,Mutations in the COL18A1 gene can cause Knoblo...
8,Is Knobloch syndrome inherited ?,This condition is inherited in an autosomal re...
9,What are the treatments for Knobloch syndrome ?,These resources address the diagnosis or manag...


In [None]:
len_mq = int(len(medquad)*0.1)
medquad_sp = medquad.sample(n=len_mq, random_state=42).copy()
medquad_sp = medquad_sp.reset_index(drop=True)
medquad_sp.info()
medquad_sp.head()

## Live

### Sentence Based

#### Methods

In [8]:
def create_sb_get_answers(df, index, top_k, min_k=1):

    for k in range(min_k, top_k + 1):

        query_engine = index.as_query_engine(similarity_top_k=k)

        resps = []
        for q in tqdm(medquad["question"].values, desc=f"(k = {k}) Get responses"):
            resps.append(get_response(query_engine.query(q).response))

        df[f"answer_sb_k{k}"] = resps

def create_tb_get_answers(df, index, size, top_k, min_k=1):

    for k in range(min_k, top_k + 1):

        query_engine = index.as_query_engine(similarity_top_k=k)

        resps = []
        for q in tqdm(medquad["question"].values, desc=f"(k = {k}) Get {size} responses"):
            resps.append(get_response(query_engine.query(q).response))

        df[f"answer_tb_k{k}"] = resps
    
    df.to_csv(f"{dir_data}new validation datas/1000_tb_{size}_20k_answers.csv", index=False)

In [9]:
storage_context_tb = StorageContext.from_defaults(persist_dir="../data/vectors/token_based_16")
index_tb = load_index_from_storage(storage_context_tb)

In [10]:
df_tb = pd.read_csv(f"{dir_data}validation datas/medquad_1000_tb_16.csv")
answer_df = df_tb.loc[:, df_tb.columns.str.contains('answer', case=False)]
#answer_df.info()

create_tb_get_answers(answer_df, index_tb, 16, 20, min_k=11)

(k = 11) Get 16 responses: 100%|██████████| 1000/1000 [52:37<00:00,  3.16s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"answer_tb_k{k}"] = resps
(k = 12) Get 16 responses: 100%|██████████| 1000/1000 [52:43<00:00,  3.16s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"answer_tb_k{k}"] = resps
(k = 13) Get 16 responses: 100%|██████████| 1000/1000 [52:41<00:00,  3.16s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pan

In [13]:
df_sb = pd.read_csv(f"{dir_data}validation datas/medquad_1000_sb.csv")

storage_context_sb = StorageContext.from_defaults(persist_dir="../data/vectors/sentence_based")
index_sb = load_index_from_storage(storage_context_sb)

In [16]:
create_sb_get_answers(df_sb, index_sb, top_k=20, min_k=11)
df_sb.to_csv(f"{dir_data}validation datas/medquad_1000_sb_20k.csv", index=False)

                                                                           

In [27]:
valami_df = pd.read_csv(f"{dir_data}validation datas/medquad_1000_sb_20k.csv")
valami_df['answer_sb_k2'].values[1]

"['This mechanism probably underlies the skin, hair, and heart problems that occur in keratoderma with woolly hair.', 'Keratoderma with woolly hair is rare; its prevalence worldwide is unknown.']"

In [None]:
def create_sb_validation(index, tokenized_answers, top_k, min_k=1):
    tqdm_help = min_k

    for k in range(min_k, top_k + 1):
        tqdm_help += 1

        query_engine = index.as_query_engine(similarity_top_k=k)

        resps = []
        for q in tqdm(medquad["question"].values, desc=f"(k = {k}) Get responses", leave=False):
            resps.append(get_response(query_engine.query(q).response))

        medquad[f"answer_sb_k{k}"] = resps

        accs = [0] * len(medquad)
        f1s = [0] * len(medquad)
        recalls = [0] * len(medquad)
        precisions = [0] * len(medquad)

        for i in tqdm(range(0,len(medquad)), leave=False):
            acc, f1, recall, prec = map_found(
                medquad[f"answer_sb_k{k}"].values[i], 
                tokenized_answers[i])
            
            #print(acc,f1,recall,prec)
            accs[i] = acc
            f1s[i] = f1
            recalls[i] = recall
            precisions[i] = prec

        medquad[f"ACC_sb_k{k}"] = accs
        medquad[f"F1_sb_k{k}"] = f1s
        medquad[f"RECALL_sb_k{k}"] = recalls
        medquad[f"PRECISION_sb_k{k}"] = precisions

In [None]:
def get_sb_result_df(medquad, top_k):    
    
    results = []

    for k in range(1, top_k + 1):
        # Átlagértékek kiszámítása
        acc_mean = medquad[f"ACC_sb_k{k}"].mean()
        f1_mean = medquad[f"F1_sb_k{k}"].mean()
        recall_mean = medquad[f"RECALL_sb_k{k}"].mean()
        precision_mean = medquad[f"PRECISION_sb_k{k}"].mean()
        # Új sor hozzáadása a results listához
        results.append({
            "k": k,
            "acc": acc_mean,
            "f1": f1_mean,
            "recall": recall_mean,
            "precision": precision_mean
        })

    results_df = pd.DataFrame(results)

    return results_df

In [None]:
storage_context_sb = StorageContext.from_defaults(persist_dir="../data/vectors/sentence_based")
index_sb = load_index_from_storage(storage_context_sb)

tokenized_answers = [sent_tokenize(answer) for answer in medquad["answer"].values]

In [None]:
create_sb_validation(index_sb, tokenized_answers, 10)

result_sb_dataframe = get_sb_result_df(medquad, 10)
result_sb_dataframe.info()
result_sb_dataframe.head(10)
result_sb_dataframe.to_csv(f"{dir_data}validation datas/1000/sb_scores.csv", index=False)

### Token size Based

In [None]:
medquad.info()

In [None]:
def get_tb_result_df(medquad, top_k, size):    
    
    results = []

    for k in range(1, top_k + 1):
        # Átlagértékek kiszámítása
        # median??
        acc_mean = medquad[f"ACC_tb_{size}_k{k}"].mean()
        f1_mean = medquad[f"F1_tb_{size}_k{k}"].mean()
        recall_mean = medquad[f"RECALL_tb_{size}_k{k}"].mean()
        precision_mean = medquad[f"PRECISION_tb_{size}_k{k}"].mean()
        # Új sor hozzáadása a results listához
        results.append({
            "k": k,
            "acc": acc_mean,
            "f1": f1_mean,
            "recall": recall_mean,
            "precision": precision_mean
        })

    results_df = pd.DataFrame(results)

    return results_df

In [None]:
def create_tb_validation(index, top_k, size, min_k=1):
    for k in range(min_k, top_k+1):
        query_engine = index.as_query_engine(similarity_top_k=k)

        resps = []
        for q in tqdm(medquad["question"].values, desc=f"({size} token, k = {k}) Get responses", leave=False):
            resps.append(get_response(query_engine.query(q).response))

        medquad[f"answer_tb_{size}_k{k}"] = resps

        accs = [0] * len(medquad)
        f1s = [0] * len(medquad)
        recalls = [0] * len(medquad)
        precisions = [0] * len(medquad)

        for i in tqdm(range(0,len(medquad))):
            acc, f1, recall, prec = map_found(
                medquad[f"answer_tb_{size}_k{k}"].values[i], 
                at.create_fixed_length_chunks_with_tokenizer(medquad["answer"].values[i], chunk_size=size)
            )
            
            #print(acc,f1,recall,prec)
            accs[i] = acc
            f1s[i] = f1
            recalls[i] = recall
            precisions[i] = prec

        medquad[f"ACC_tb_{size}_k{k}"] = accs
        medquad[f"F1_tb_{size}_k{k}"] = f1s
        medquad[f"RECALL_tb_{size}_k{k}"] = recalls
        medquad[f"PRECISION_tb_{size}_k{k}"] = precisions
    
    tb_result_df = get_tb_result_df(medquad, top_k, size)
    tb_result_df.to_csv(f"{dir_data}validation datas/1000/tb_{size}_scores.csv", index=False)
        # ide mentést implementálni csv formátumban

In [None]:
token_size_list = [512]

for size in token_size_list:  
    storage_context_tb = StorageContext.from_defaults(persist_dir=f"../data/vectors/token_based_{size}")
    index_tb = load_index_from_storage(storage_context_tb)

    create_tb_validation(index_tb, 9, size)

In [None]:
tb_result_df = get_tb_result_df(medquad, 9, 512)
tb_result_df.to_csv(f"{dir_data}validation datas/1000/tb_512_scores.csv", index=False)

In [None]:
medquad.to_csv(f"{dir_data}validation datas/medquad_1000_sb.csv", index=False)

valami = pd.read_csv(f"{dir_data}validation datas/medquad_1000_sb.csv")
valami.head()

In [None]:
medquad.head(10)

## Demo

In [None]:
question  = medquad["question"].values[10]
real_answer  = medquad["answer"].values[10]
fake_answer = medquad["answer"].values[5] + medquad["answer"].values[20]

In [None]:
fake_answer_sentence = fake_answer.split(".")
fake_answer_sentence = [item.strip() for item in fake_answer_sentence if len(item) > 2]
random.seed(10)
random.shuffle(fake_answer_sentence)
fake_answer_sentence

In [None]:
real_answer_sentence = real_answer.split(".")
real_answer_sentence = [item.strip() for item in real_answer_sentence if len(item) > 2]
random.seed(10)
random.shuffle(real_answer_sentence)
real_answer_sentence

In [None]:
demo_found = (real_answer_sentence[:5] + fake_answer_sentence[:3])
random.seed(22)
random.shuffle(demo_found)
demo_found

In [None]:
acc, f1, recall, precision = map_found(demo_found, real_answer)

print("Accuracy scores: ", acc)
print("f1 scores", f1)
print("Recall scores", recall)
print("Precision scores", precision)
print("Mean Reciprocal Rank (MRR): ", calculate_mrr_from_chunks(demo_found, real_answer))