# Validation Pipeline

## Import Dependencies

In [22]:
import random
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

## Method(s)

In [51]:
def calculate_mrr_from_chunks(search_results, full_answer):
    """
    Compute the reciprocal rank of the first relevant chunk for one query.

    A chunk counts as relevant when it occurs inside ``full_answer``
    (``chunk in full_answer``). Ranks start at 1 and only the first
    relevant chunk contributes to the score.

    :param search_results: Ranked list of text chunks returned by the semantic search.
    :param full_answer: The complete, correct answer (string).
    :return: ``1 / rank`` of the first relevant chunk, or ``0.0`` when no
        chunk is relevant (including an empty result list).
    """
    # Lazily walk the ranking and stop at the first hit; fall back to 0.0.
    return next(
        (
            1 / position
            for position, candidate in enumerate(search_results, start=1)
            if candidate in full_answer
        ),
        0.0,
    )

In [None]:
def map_found(founds, answer):
    """
    Score a list of retrieved items against a reference answer.

    Each item in ``founds`` is labelled 1 when it occurs inside ``answer``
    (``item in answer``) and 0 otherwise. These labels are compared with a
    reference vector of all ones, i.e. every retrieved item is expected to
    be part of the answer.

    NOTE(review): because the reference vector contains only the positive
    class, the reported precision/recall may not carry their usual meaning
    for retrieval evaluation - confirm this is the intended setup.

    :param founds: Sequence of retrieved items (e.g. sentences or chunks).
    :param answer: The reference answer the items are checked against.
    :return: Tuple ``(accuracy, f1, recall, precision)``; f1 and precision
        use ``average='weighted'``, recall uses the scorer's default averaging.
    """
    # Reference labels: one positive label per retrieved item.
    expected = [1] * len(founds)
    # Predicted labels: 1 if the item is contained in the answer, else 0.
    predicted = [1 if candidate in answer else 0 for candidate in founds]

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='weighted'),
        recall_score(expected, predicted),
        precision_score(expected, predicted, average='weighted'),
    )

## Load Data

In [2]:
# Relative path of the directory that holds the data files read below.
dir_data = "../data/"

In [3]:
# Read the question/answer validation set into a DataFrame.
# NOTE(review): the file name is spelled "mqdquad" while the variable is
# "medquad" - confirm the spelling matches the file on disk.
medquad = pd.read_json(f"{dir_data}validations/mqdquad.json", orient="records")
# Print the column summary, then show the first rows as the cell output.
medquad.info()
medquad.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16407 entries, 0 to 16406
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  16407 non-null  object
 1   answer    16407 non-null  object
dtypes: object(2)
memory usage: 256.5+ KB


Unnamed: 0,question,answer
0,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...
1,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...
2,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...
4,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...


## Live

In [None]:
# TODO
# Evaluate every question and every answer in the test dataset.

## Demo

In [10]:
# Use row 10 as the demo query: its question and its reference answer.
question  = medquad["question"].values[10]
real_answer  = medquad["answer"].values[10]
# Build a distractor ("fake") answer by concatenating the answers of rows 5 and 20.
fake_answer = medquad["answer"].values[5] + medquad["answer"].values[20]

In [13]:
# Split the fake answer into sentence-like pieces on every ".".
fake_answer_sentence = fake_answer.split(".")
# Keep pieces longer than 2 characters and trim surrounding whitespace.
# NOTE(review): the length check runs on the unstripped piece - confirm
# whitespace-only pieces cannot slip through as empty strings.
fake_answer_sentence = [item.strip() for item in fake_answer_sentence if len(item) > 2]
# Fixed seed so the shuffled order is the same on every run.
random.seed(10)
random.shuffle(fake_answer_sentence)
fake_answer_sentence

['Restless leg syndrome is a condition characterized by numbness or tingling in the legs accompanied by an urge to move the legs to stop the sensations',
 'Most affected individuals have vitreoretinal degeneration, which is breakdown (degeneration) of two structures in the eye called the vitreous and the retina',
 'Another characteristic feature of Knobloch syndrome is a skull defect called an occipital encephalocele, which is a sac-like protrusion of the brain (encephalocele) through a defect in the bone at the base of the skull (occipital bone)',
 'REM sleep behavior disorder is a condition in which the muscles are active during the dream (REM) stage of sleep, so an affected person often acts out his or her dreams',
 'In other conditions, encephaloceles may be associated with intellectual disability; however, most people with Knobloch syndrome have normal intelligence',
 'People with SCA3 eventually require wheelchair assistance',
 'Spinocerebellar ataxia type 3 (SCA3) is a condition

In [16]:
# Split the reference answer into sentence-like pieces on every ".".
real_answer_sentence = real_answer.split(".")
# Keep pieces longer than 2 characters and trim surrounding whitespace.
# NOTE(review): the length check runs on the unstripped piece - confirm
# whitespace-only pieces cannot slip through as empty strings.
real_answer_sentence = [item.strip() for item in real_answer_sentence if len(item) > 2]
# Fixed seed so the shuffled order is the same on every run.
random.seed(10)
random.shuffle(real_answer_sentence)
real_answer_sentence

['Large retinal colobomas or those affecting the optic nerve can cause low vision, which means vision loss that cannot be completely corrected with glasses or contact lenses',
 'Colobomas involving the eyeball should be distinguished from gaps that occur in the eyelids',
 'Colobomas involving the retina result in vision loss in specific parts of the visual field, generally the upper part',
 'Such severe microphthalmia should be distinguished from another condition called anophthalmia, in which no eyeball forms at all',
 'In this condition, one or both eyeballs are abnormally small',
 'Microphthalmia may or may not result in significant vision loss',
 'Some individuals have coloboma as part of a syndrome that affects other organs and tissues in the body',
 'When coloboma occurs by itself, it is described as nonsyndromic or isolated',
 'They may appear as notches or gaps in one of several parts of the eye, including the colored part of the eye called the iris; the retina, which is the sp

In [47]:
# Simulated search result: the first 5 real-answer sentences mixed with
# the first 3 fake-answer sentences.
demo_found = (real_answer_sentence[:5] + fake_answer_sentence[:3])
# Shuffle with a fixed seed so the simulated ranking is reproducible.
random.seed(22)
random.shuffle(demo_found)
demo_found

['Most affected individuals have vitreoretinal degeneration, which is breakdown (degeneration) of two structures in the eye called the vitreous and the retina',
 'Another characteristic feature of Knobloch syndrome is a skull defect called an occipital encephalocele, which is a sac-like protrusion of the brain (encephalocele) through a defect in the bone at the base of the skull (occipital bone)',
 'Restless leg syndrome is a condition characterized by numbness or tingling in the legs accompanied by an urge to move the legs to stop the sensations',
 'Such severe microphthalmia should be distinguished from another condition called anophthalmia, in which no eyeball forms at all',
 'In this condition, one or both eyeballs are abnormally small',
 'Large retinal colobomas or those affecting the optic nerve can cause low vision, which means vision loss that cannot be completely corrected with glasses or contact lenses',
 'Colobomas involving the eyeball should be distinguished from gaps that

In [55]:
# Score the simulated search result against the reference answer.
acc, f1, recall, precision = map_found(demo_found, real_answer)

# Report the classification-style metrics and the reciprocal rank of the
# first sentence that belongs to the reference answer.
print("Accuracy scores: ", acc)
print("f1 scores", f1)
print("Recall scores", recall)
print("Precision scores", precision)
print("Mean Reciprocal Rank (MRR): ", calculate_mrr_from_chunks(demo_found, real_answer))

Accuracy scores:  0.625
f1 scores 0.7692307692307693
Recall scores 0.625
Precision scores 1.0
Mean Reciprocal Rank (MRR):  0.25
