# Results
In this notebook, we define and calculate the statistics of our tests.

In [54]:
# load data

import json
import pandas as pd
from collections import defaultdict
import math
import os

def loadData(fileName):
    """Parse ``<fileName>.json`` (read as UTF-8) and return the decoded content.

    ``fileName`` is given without the ``.json`` extension; it is appended here.
    """
    jsonPath = f'{fileName}.json'
    with open(jsonPath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def normalize(cases):
    """Pad every result list of each case with ``None`` up to the longest list.

    ``cases`` maps a case id to a dict of result lists. The lists are extended
    in place, and the same ``cases`` object is returned.
    """
    for differentials in cases.values():
        longest = max(map(len, differentials.values()))
        for ddx in differentials.values():
            missing = longest - len(ddx)
            ddx += [None] * missing

        # After padding, every list of the case must have the same length.
        lengths = {len(ddx) for ddx in differentials.values()}
        assert len(lengths) == 1

    return cases

def getDataframe(case):
    """Turn one case (name -> result list) into a DataFrame.

    The ``'gs'`` column comes first, followed by the remaining columns in
    sorted order. Rows are labelled 1..n, where n is the length of the first
    result list in ``case``.
    """
    otherColumns = sorted(name for name in case if name != 'gs')
    firstResult = list(case.values())[0]
    rowLabels = list(range(1, len(firstResult) + 1))
    return pd.DataFrame(case, columns=['gs'] + otherColumns, index=rowLabels)


In [55]:
# We need to make all the differentials of the same length to ease comparison,
# so the lists are padded with None (see normalize).
# data = loadData('allResults')
data = loadData('aveyAll')

# Every case is expected to carry exactly 10 result lists; fail early otherwise.
for caseNum, tests in data.items():
    assert len(tests.keys()) == 10, f"app missing in case {caseNum}, {tests.keys()}"

# Keep only the cases that also appear in 'maram.json'. The file is loaded once
# here; reading it inside the filter would re-parse it for every case.
selectedCaseIds = loadData('maram')
data = {k: v for k, v in data.items() if k in selectedCaseIds}

normalizedData = normalize(data)
cases = {int(caseId): getDataframe(case) for caseId, case in normalizedData.items()}
caseClassification = loadData('case-classification')
f'We have {len(cases)} cases in the experiment.'


'We have 39 cases in the experiment.'

Let us have a look at the first case as an example of what the data looks like.

In [56]:
from IPython.display import display
# Preview only the first case; nothing is shown when there are no cases.
firstEntry = next(iter(cases.items()), None)
if firstEntry is not None:
    caseNum, case = firstEntry
    print(f"Case number {caseNum}")
    display(case)

Case number 156


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,Tala Hammouri,WebMD
1,malaria,tick borne relapsing fever,mixed connective tissue disease,,encephalitis,upper respiratory infection,malaria,gastroenteritis,malaria,influenza
2,pneumonia,malaria,brucellosis,,coronavirus disease 2019,influenza,,,upper respiratory tract infection,pneumococcal meningitis
3,encephalitis,leptospirosis,influenza,,Influenza or flu-like illnes,pneumonia,,,,pneumonia
4,,influenza,systemic lupus erythromatosus,,,,,,,septicemia
5,,coronavirus disease 2019,dengue fever,,,,,,,swine influenza
6,,,,,,,,,,acute appendicitis
7,,,,,,,,,,acute sinusitis


## Let us define the metrics now.

### Terms used
- TP: True positive (correct disease retrieved)
- TN: True negative (wrong disease **not** retrieved)
- FP: False positive (wrong disease retrieved)
- FN: False negative (correct disease **not** retrieved)
- gold standard - the correct list of diseases as determined by collective intelligence of doctors

### Precision
Precision helps us understand how exact our results are. It gives us an intuition about how many wrong diseases (false positives) are being retrieved. It is the ratio of the *number of correct diseases retrieved* to the *length of the complete list retrieved*.
$$precision = \frac{TP}{TP + FP} = \frac{TP}{\text{length of differential list}}$$

### Recall
Recall is a measure of how many of the correct diseases are being retrieved. It is the ratio of the *number of correct diseases retrieved* to the *length of the gold standard list*.
$$recall = \frac{TP}{TP + FN} = \frac{TP}{\text{length of the gold standard}}$$

### F1 Score
The F1 score is the harmonic mean of *precision* and *recall*. It combines *precision* and *recall* into a single score for easier comparison. More generally, the F-score is a weighted harmonic mean of the two.

Suppose $\beta$ defines how many times more important $recall$ is than $precision$; then,
$$fscore_{\beta} = (1 + \beta^2)\frac{precision \cdot recall}{(\beta^2 \cdot precision) + recall}$$
Substituting $\beta = 1$,
$$fscore_{1} = \frac{2 \cdot precision \cdot recall}{ precision + recall}$$

### NDCG
NDCG, or Normalized Discounted Cumulative Gain, is a measure of how accurate the ranking is. In our calculations, we use
$$DCG = \sum_{i=1}^n\frac{2^{relevance_i}-1}{log_2(i+1)}$$
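where $n$ is the number of differentials in the returned list and  
$relevance_i = |gold\ standard| - rank_{gold\ standard}(ddx[i])$ if $ddx[i]$ is present in the gold standard (with $rank_{gold\ standard}$ counted from 0, so the top gold-standard disease has relevance $|gold\ standard|$), and 0 otherwise.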

$$NDCG = \frac{DCG_{ddx}}{DCG_{gold\ standard}}$$

### M Score
M Score determines where the top disease (gold standard) appears in the returned differential.
$$M_i = \text{gold standard[0]} \in \text{ddx[:i]}$$

### Position
Shows the position (counted from 1) of gold standard[0] in the returned differential. It is undefined (NaN) when gold standard[0] does not appear in the differential.

### Length
$$length = \frac{|ddx|}{|gold\ standard|}$$


In [57]:
import math

# Weight of recall relative to precision in the F-score (1 gives the F1 score).
# It is passed to getF1Score when the F-score is recomputed from the averaged
# precision and recall.
beta = 1

def getPrecision(goldStandard: pd.Series, candidate: pd.Series) -> float:
    """Return the fraction of retrieved diseases that are in the gold standard.

    Args:
        goldStandard: the correct diseases, possibly padded with missing values.
        candidate: the retrieved differential, possibly padded likewise.

    Returns:
        ``TP / (number of non-missing candidate entries)``, or ``0.0`` when the
        candidate has no entries at all.

    Missing entries are detected with ``pd.notna`` so that ``None``, ``NaN``
    and ``pd.NA`` padding are all ignored. Checking only ``is not None`` lets
    NA padding in string-typed columns count as a true positive.
    """
    goldDiseases = {disease for disease in goldStandard if pd.notna(disease)}
    tp = sum(1 for disease in candidate
             if pd.notna(disease) and disease in goldDiseases)
    retrieved = candidate.count()
    return 0.0 if retrieved == 0 else tp / retrieved


def getRecall(goldStandard: pd.Series, candidate: pd.Series) -> float:
    """Return the fraction of gold-standard diseases that were retrieved.

    Args:
        goldStandard: the correct diseases, possibly padded with missing values.
        candidate: the retrieved differential, possibly padded likewise.

    Returns:
        ``TP / (number of non-missing gold-standard entries)``, or ``nan`` when
        the gold standard has no entries (recall is undefined then).

    Missing entries are detected with ``pd.notna`` so that ``None``, ``NaN``
    and ``pd.NA`` padding are all ignored rather than matched against each other.
    """
    goldDiseases = {disease for disease in goldStandard if pd.notna(disease)}
    tp = sum(1 for disease in candidate
             if pd.notna(disease) and disease in goldDiseases)
    relevant = goldStandard.count()
    return math.nan if relevant == 0 else tp / relevant


def getF1Score(precision: float, recall: float, beta: float = 1) -> float:
    """Return the F-beta score of ``precision`` and ``recall``.

    Args:
        precision: precision value.
        recall: recall value.
        beta: weight of recall relative to precision (1 gives the F1 score).

    Returns:
        ``(1 + beta^2) * precision * recall / (beta^2 * precision + recall)``,
        or ``nan`` when that denominator is zero.
    """
    weight = beta ** 2
    # Guard on the real denominator: testing precision + recall would miss
    # the beta == 0, recall == 0 case and divide by zero.
    denominator = precision * weight + recall
    if denominator == 0:
        return math.nan
    return (1 + weight) * precision * recall / denominator


def getNDCG(goldStandard: pd.Series, candidate: pd.Series, scores) -> float:
    """Return the normalized discounted cumulative gain of ``candidate``.

    Args:
        goldStandard: the correct diseases in rank order, possibly padded with
            missing values.
        candidate: the retrieved differential in rank order, possibly padded.
        scores: relevance values; ``scores[k]`` is the relevance of the disease
            at position ``k`` (0-based) of the gold standard.

    Returns:
        ``DCG(candidate) / DCG(ideal)``, where the ideal DCG is obtained by
        discounting ``scores`` in the given order. ``nan`` is returned when the
        ideal DCG is zero (for example when ``scores`` is empty).
    """
    def discount(score: float, index: int) -> float:
        # Gain of a hit with relevance `score` at 1-based position `index`.
        return (math.pow(2, score) - 1) / math.log2(index + 1)

    maxDCG = sum(discount(score, position)
                 for position, score in enumerate(scores, start=1))
    if maxDCG == 0:
        # Nothing to normalise against; avoid dividing by zero.
        return math.nan

    # Map each gold-standard disease to the position of its first occurrence.
    # Built once, and skipping missing padding (None / NaN / pd.NA).
    goldRanks = {}
    for rank, disease in enumerate(goldStandard):
        if pd.notna(disease) and disease not in goldRanks:
            goldRanks[disease] = rank

    dcg = 0.0
    for position, disease in enumerate(candidate, start=1):
        if pd.notna(disease) and disease in goldRanks:
            dcg += discount(scores[goldRanks[disease]], position)

    return dcg / maxDCG


def getMScore(goldStandard: pd.Series, candidate: pd.Series, m=1) -> bool:
    """Tell whether the first gold-standard entry is among the first ``m`` candidates."""
    topDiagnosis = goldStandard.values[0]
    firstM = candidate.values[:m]
    return topDiagnosis in firstM


def getPosition(goldStandard: pd.Series, candidate: pd.Series) -> float:
    """Return the 1-based position of the top gold-standard disease in ``candidate``.

    Args:
        goldStandard: the correct diseases in rank order; only the first entry
            is used.
        candidate: the retrieved differential in rank order.

    Returns:
        The 1-based position of ``goldStandard``'s first entry in ``candidate``,
        or ``nan`` when it does not appear or when that first entry is itself
        missing (so that padding is never matched against padding).
    """
    topDiagnosis = goldStandard.values[0]
    if pd.isna(topDiagnosis):
        return math.nan

    # Single pass: return at the first match instead of scanning twice.
    for position, disease in enumerate(candidate.values, start=1):
        if pd.notna(disease) and disease == topDiagnosis:
            return position
    return math.nan


def getLength(goldStandard: pd.Series, candidate: pd.Series) -> float:
    """Return the candidate length as a multiple of the gold-standard length.

    Args:
        goldStandard: the correct diseases, possibly padded with missing values.
        candidate: the retrieved differential, possibly padded likewise.

    Returns:
        ``(non-missing candidate entries) / (non-missing gold-standard entries)``,
        or ``nan`` when either list has no entries.
    """
    retrieved = candidate.count()
    expected = goldStandard.count()
    # An empty gold standard would otherwise yield inf from the division.
    if retrieved == 0 or expected == 0:
        return math.nan
    return retrieved / expected


def getScoresCase(case: pd.DataFrame, beta:float = 1) -> pd.DataFrame:
    """Score every non-gold-standard column of ``case`` against its first column.

    The first column of ``case`` is taken as the gold standard; each remaining
    column is a candidate differential. The result has one column per candidate
    and one row per metric (precision, recall, F-score, NDCG, M1/M3/M5,
    position and relative length).
    """
    goldStandard = case.iloc[:, 0]
    candidates = [case.iloc[:, i] for i in range(1, len(case.columns))]

    def scoreAll(metric, *extra):
        # Apply one metric to every candidate column, in column order.
        return [metric(goldStandard, candidate, *extra) for candidate in candidates]

    precisions = scoreAll(getPrecision)
    recalls = scoreAll(getRecall)
    rows = [
        precisions,
        recalls,
        [getF1Score(p, r, beta=beta) for p, r in zip(precisions, recalls)],
    ]

    # Relevance decreases from the gold-standard length down to 1.
    relevance = list(range(goldStandard.count(), 0, -1))
    rows.append(scoreAll(getNDCG, relevance))

    for m in (1, 3, 5):
        rows.append(scoreAll(getMScore, m))

    rows.append(scoreAll(getPosition))
    rows.append(scoreAll(getLength))

    metricNames = ["precision", "recall", "f1-score", "NDCG",
                   "M1", "M3", "M5", "position", "length (x of gs)"]
    return pd.DataFrame(rows, columns=case.columns[1:], index=metricNames)

# Per-case score tables, keyed by case id.
scores = {caseId: getScoresCase(caseFrame) for caseId, caseFrame in cases.items()}

Let us define the experiments now. We will pick which cases to compute statistics for.

In [58]:
from collections import defaultdict
# Experiment label -> set of case ids evaluated in that experiment
# (filled by addExperiment).
experiments = {}
# Grouped label -> list of the per-app experiment labels that belong to it
# (filled by addExperiment).
combineLabels = defaultdict(list)
def addExperiment(caseType,casesToConsider):
    """Register the experiments for one case type in ``experiments``/``combineLabels``.

    For every app in ``caseClassification['apps']`` this adds:
      * ``ignore_<caseType>_<app>_none``: all of ``casesToConsider``;
      * ``ignore_<caseType>_<app>_<classification>``: ``casesToConsider`` minus
        the cases listed for that app under that classification;
      * ``ignore_<caseType>_any_<classification>``: ``casesToConsider`` minus
        the cases listed under that classification for any app.
    """
    baseline = set(casesToConsider)
    for app, classifications in caseClassification['apps'].items():
        unfilteredLabel = f'ignore_{caseType}_{app}_none'
        combineLabels[f'ignore_{caseType}_none'].append(unfilteredLabel)
        experiments[unfilteredLabel] = set(baseline)

        for classification, flaggedCases in classifications.items():
            flagged = set(flaggedCases)
            appLabel = f'ignore_{caseType}_{app}_{classification}'
            anyLabel = f'ignore_{caseType}_any_{classification}'

            combineLabels[f'ignore_{caseType}_{classification}'].append(appLabel)
            experiments[appLabel] = baseline - flagged
            # Start from the full case set the first time, then keep removing
            # the cases flagged for each further app.
            experiments[anyLabel] = experiments.get(anyLabel, baseline) - flagged

commonCases = caseClassification['common']
lessCommonCases = caseClassification['less common']
for caseType, caseIds in (
    ('common', commonCases),
    ('less common', lessCommonCases),
    ('all', set(lessCommonCases) | set(commonCases)),
):
    addExperiment(caseType, caseIds)

list(experiments)


['ignore_common_Ada_none',
 'ignore_common_Ada_session_failed',
 'ignore_common_any_session_failed',
 'ignore_common_Ada_no_disease_found',
 'ignore_common_any_no_disease_found',
 'ignore_common_Avey_none',
 'ignore_common_Avey_session_failed',
 'ignore_common_Avey_no_disease_found',
 'ignore_common_Babylon_none',
 'ignore_common_Babylon_session_failed',
 'ignore_common_Babylon_no_disease_found',
 'ignore_common_Buoy_none',
 'ignore_common_Buoy_session_failed',
 'ignore_common_Buoy_no_disease_found',
 'ignore_common_K health_none',
 'ignore_common_K health_session_failed',
 'ignore_common_K health_no_disease_found',
 'ignore_common_WebMD_none',
 'ignore_common_WebMD_session_failed',
 'ignore_common_WebMD_no_disease_found',
 'ignore_less common_Ada_none',
 'ignore_less common_Ada_session_failed',
 'ignore_less common_any_session_failed',
 'ignore_less common_Ada_no_disease_found',
 'ignore_less common_any_no_disease_found',
 'ignore_less common_Avey_none',
 'ignore_less common_Avey_sess

In [59]:
def getAverage(scores, row: int, col: int) -> float:
    """Average cell ``(row, col)`` over all score tables, skipping NaN cells.

    The mean is rounded to 3 decimals; ``0`` is returned when no table has a
    non-NaN value in that cell.
    """
    cells = (table.iloc[row, col] for table in scores.values())
    valid = [value for value in cells if not math.isnan(value)]
    if not valid:
        return 0
    return round(sum(valid) / len(valid), 3)

results = {}
for label, casesToConsider in experiments.items():
    # Keep only the score tables of the cases that belong to this experiment.
    selectedScores = {
        caseId: table for caseId, table in scores.items()
        if int(caseId) in casesToConsider
    }

    # Any score table provides the column layout.
    scoreColumns = next(iter(scores.values())).columns
    metricNames = [
        "precision", "recall", "f1-score", "NDCG", "M1", "M3", "M5", "position", "length (x of gs)"
    ]

    # One averaged row per metric, one value per column.
    averagedRows = [
        [getAverage(selectedScores, metricRow, column) for column in range(len(scoreColumns))]
        for metricRow in range(9)
    ]
    averageScores = pd.DataFrame(
        averagedRows,
        columns=scoreColumns,
        index=[f"average_{x}" for x in metricNames],
    )

    # Replace the averaged per-case F-scores by the F-score of the averaged
    # precision and recall.
    for col in scoreColumns:
        p = averageScores.loc["average_precision", col]
        r = averageScores.loc["average_recall", col]
        averageScores.loc["average_f1-score", col] = round(getF1Score(p, r, beta), 3)

    # Mean of the three doctor columns, inserted as an extra column.
    doctorColumns = ["Mohmmad Almadani", "Noor Joudeh", "Tala Hammouri"]
    firstDoctor, secondDoctor, thirdDoctor = (
        averageScores.loc[:, doctor] for doctor in doctorColumns
    )
    doctorResults = firstDoctor + secondDoctor + thirdDoctor
    doctorResults /= 3.0
    averageScores.insert(
        loc=7, column="average_doctor",
        value=doctorResults.round(3),
    )

    results[label] = averageScores


Let us print all the results. The experiments are named as follows:
- **[common|less common|all]:** whether only common cases, only less common cases, or all cases were considered.
- **app name / any:** If an app name is present, then we ignore only those cases that are flagged for that app. If the label is *any*, then we ignore the cases flagged for any of the apps.
- **failure type:** The apps can fail in 2 ways: either a session does not complete for some reason (*session_failed*), or the app fails to retrieve any diagnosis (*no_disease_found*). If this is set to *none*, then no case is ignored and all cases selected under the first option above are considered.

In [60]:
def displayResults(results,printNumCases=True):
    """Save each result table under ``stats/`` and display it.

    Args:
        results: mapping of experiment label -> result DataFrame.
        printNumCases: when True, the heading also reports how many of the
            experiment's cases have scores (looked up in the module-level
            ``experiments`` and ``scores``).

    Each table is written to ``stats/<label>.json``. Despite the extension,
    the content is ';'-separated CSV.
    """
    # Make sure the output directory exists, so that a fresh checkout can
    # run this cell without creating it by hand first.
    os.makedirs('stats', exist_ok=True)

    for label, result in results.items():
        result.to_csv(f'stats/{label}.json',sep=';')
        if printNumCases:
            print(f'Results for experiment {label}, which has {len(set(experiments[label]) & set(scores.keys()))} cases, is')
        else:
            print(f'Results for experiment {label} is')
        display(result)

# Show only the per-app experiments, i.e. labels that do not contain 'any'.
perAppResults = {label: table for label, table in results.items() if 'any' not in label}
displayResults(perAppResults)

Results for experiment ignore_common_Ada_none, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Ada_session_failed, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Ada_no_disease_found, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Avey_none, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Avey_session_failed, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Avey_no_disease_found, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Babylon_none, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Babylon_session_failed, which has 31 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.534,0.37,0.035,0.323,0.475,0.585,0.844,0.738,0.784,0.281
average_recall,0.695,0.51,0.039,0.3,0.47,0.494,0.358,0.468,0.553,0.615
average_f1-score,0.604,0.429,0.037,0.311,0.472,0.536,0.503,0.563,0.649,0.386
average_NDCG,0.759,0.543,0.04,0.381,0.604,0.602,0.609,0.668,0.794,0.544
average_M1,0.613,0.387,0.032,0.355,0.581,0.548,0.677,0.688,0.839,0.29
average_M3,0.903,0.613,0.032,0.516,0.71,0.742,0.71,0.817,1.0,0.548
average_M5,0.935,0.677,0.065,0.516,0.742,0.742,0.71,0.817,1.0,0.71
average_position,1.448,1.762,2.5,1.5,1.391,1.304,1.045,1.17,1.161,2.542
average_length (x of gs),1.502,1.685,0.967,0.996,1.143,1.013,0.478,0.76,0.788,2.552


Results for experiment ignore_common_Babylon_no_disease_found, which has 3 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.889,0.5,0.361,0.167,0.5,0.389,1.0,0.722,0.778,0.143
average_recall,0.689,0.4,0.4,0.067,0.244,0.356,0.422,0.43,0.511,0.2
average_f1-score,0.776,0.444,0.38,0.096,0.328,0.372,0.594,0.528,0.617,0.167
average_NDCG,0.757,0.662,0.418,0.226,0.507,0.181,0.81,0.616,0.858,0.203
average_M1,0.667,0.667,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M3,1.0,1.0,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M5,1.0,1.0,0.667,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_position,1.333,1.333,2.5,1.0,1.0,0.0,1.0,0.667,1.0,7.0
average_length (x of gs),0.8,1.178,0.967,0.578,0.533,0.933,0.422,0.666,0.644,1.467


Results for experiment ignore_common_Buoy_none, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_Buoy_session_failed, which has 29 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.551,0.355,0.037,0.345,0.456,0.574,0.868,0.737,0.77,0.279
average_recall,0.691,0.516,0.041,0.321,0.433,0.494,0.366,0.476,0.568,0.594
average_f1-score,0.613,0.421,0.039,0.333,0.444,0.531,0.515,0.567,0.654,0.38
average_NDCG,0.749,0.567,0.043,0.408,0.577,0.586,0.622,0.669,0.799,0.518
average_M1,0.586,0.414,0.034,0.379,0.552,0.517,0.69,0.678,0.828,0.276
average_M3,0.897,0.655,0.034,0.552,0.69,0.724,0.724,0.816,1.0,0.517
average_M5,0.931,0.724,0.069,0.552,0.724,0.724,0.724,0.816,1.0,0.69
average_position,1.481,1.762,2.5,1.5,1.429,1.333,1.048,1.184,1.172,2.476
average_length (x of gs),1.456,1.767,0.967,0.967,1.114,1.031,0.477,0.776,0.82,2.446


Results for experiment ignore_common_Buoy_no_disease_found, which has 26 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.541,0.39,0.042,0.385,0.465,0.628,0.872,0.764,0.792,0.239
average_recall,0.707,0.523,0.046,0.358,0.469,0.524,0.385,0.479,0.529,0.565
average_f1-score,0.613,0.447,0.044,0.371,0.467,0.571,0.534,0.58,0.634,0.336
average_NDCG,0.77,0.541,0.048,0.455,0.608,0.624,0.664,0.696,0.799,0.526
average_M1,0.615,0.423,0.038,0.423,0.615,0.615,0.769,0.756,0.885,0.308
average_M3,0.923,0.615,0.038,0.615,0.731,0.731,0.808,0.846,1.0,0.538
average_M5,0.962,0.654,0.077,0.615,0.731,0.731,0.808,0.846,1.0,0.692
average_position,1.44,1.647,2.5,1.5,1.211,1.158,1.048,1.107,1.115,2.5
average_length (x of gs),1.492,1.648,0.967,0.967,1.166,0.974,0.482,0.733,0.744,2.595


Results for experiment ignore_common_K health_none, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_K health_session_failed, which has 28 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.554,0.362,0.039,0.315,0.526,0.582,0.827,0.723,0.761,0.289
average_recall,0.716,0.487,0.043,0.285,0.52,0.487,0.355,0.471,0.571,0.609
average_f1-score,0.625,0.415,0.041,0.299,0.523,0.53,0.497,0.56,0.652,0.392
average_NDCG,0.755,0.504,0.045,0.384,0.669,0.577,0.591,0.655,0.796,0.54
average_M1,0.571,0.321,0.036,0.357,0.643,0.5,0.643,0.655,0.821,0.286
average_M3,0.893,0.571,0.036,0.536,0.786,0.714,0.679,0.798,1.0,0.536
average_M5,0.929,0.643,0.071,0.536,0.821,0.714,0.679,0.798,1.0,0.679
average_position,1.5,1.889,2.5,1.533,1.391,1.35,1.053,1.194,1.179,2.571
average_length (x of gs),1.502,1.693,0.967,0.987,1.143,1.026,0.488,0.782,0.831,2.444


Results for experiment ignore_common_K health_no_disease_found, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_WebMD_none, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_WebMD_session_failed, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_common_WebMD_no_disease_found, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.577,0.849,0.739,0.791,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.489,0.357,0.468,0.557,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.529,0.503,0.562,0.654,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.589,0.613,0.667,0.799,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.531,0.688,0.688,0.844,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.719,0.719,0.813,1.0,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.719,0.719,0.813,1.0,0.688
average_position,1.467,1.727,2.5,1.5,1.391,1.304,1.043,1.168,1.156,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,1.013,0.474,0.757,0.784,2.493


Results for experiment ignore_less common_Ada_none, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Ada_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Ada_no_disease_found, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Avey_none, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Avey_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Avey_no_disease_found, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Babylon_none, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Babylon_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Babylon_no_disease_found, which has 0 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_recall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_f1-score,,,,,,,,,,
average_NDCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_position,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_length (x of gs),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Results for experiment ignore_less common_Buoy_none, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Buoy_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_Buoy_no_disease_found, which has 6 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.183,0.194,0.0,0.167,0.153,0.506,0.417,0.502,0.583,0.183
average_recall,0.278,0.306,0.0,0.167,0.167,0.306,0.194,0.343,0.528,0.389
average_f1-score,0.221,0.237,,0.167,0.16,0.381,0.265,0.4,0.554,0.249
average_NDCG,0.198,0.271,0.0,0.105,0.056,0.387,0.117,0.409,0.723,0.306
average_M1,0.0,0.167,0.0,0.0,0.0,0.333,0.0,0.333,0.667,0.167
average_M3,0.333,0.333,0.0,0.167,0.0,0.5,0.0,0.5,1.0,0.167
average_M5,0.333,0.333,0.0,0.167,0.0,0.5,0.0,0.5,1.0,0.333
average_position,2.0,1.5,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.75,1.889,0.0,1.028,1.139,0.861,0.472,0.768,0.972,2.361


Results for experiment ignore_less common_K health_none, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_K health_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_K health_no_disease_found, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_WebMD_none, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_WebMD_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_WebMD_no_disease_found, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_all_Ada_none, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Ada_session_failed, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Ada_no_disease_found, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Avey_none, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Avey_session_failed, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Avey_no_disease_found, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Babylon_none, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Babylon_session_failed, which has 38 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.48,0.346,0.029,0.289,0.417,0.583,0.781,0.707,0.758,0.262
average_recall,0.637,0.49,0.032,0.271,0.418,0.46,0.34,0.448,0.543,0.571
average_f1-score,0.547,0.406,0.03,0.28,0.417,0.514,0.474,0.54,0.633,0.359
average_NDCG,0.667,0.502,0.033,0.328,0.505,0.555,0.523,0.614,0.765,0.495
average_M1,0.5,0.342,0.026,0.289,0.474,0.5,0.553,0.614,0.789,0.263
average_M3,0.816,0.579,0.026,0.447,0.579,0.684,0.579,0.746,0.974,0.474
average_M5,0.842,0.632,0.053,0.447,0.605,0.684,0.579,0.746,0.974,0.632
average_position,1.531,1.792,2.5,1.529,1.391,1.308,1.045,1.181,1.189,2.704
average_length (x of gs),1.546,1.726,0.967,1.003,1.157,0.971,0.482,0.753,0.805,2.507


Results for experiment ignore_all_Babylon_no_disease_found, which has 3 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.889,0.5,0.361,0.167,0.5,0.389,1.0,0.722,0.778,0.143
average_recall,0.689,0.4,0.4,0.067,0.244,0.356,0.422,0.43,0.511,0.2
average_f1-score,0.776,0.444,0.38,0.096,0.328,0.372,0.594,0.528,0.617,0.167
average_NDCG,0.757,0.662,0.418,0.226,0.507,0.181,0.81,0.616,0.858,0.203
average_M1,0.667,0.667,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M3,1.0,1.0,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M5,1.0,1.0,0.667,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_position,1.333,1.333,2.5,1.0,1.0,0.0,1.0,0.667,1.0,7.0
average_length (x of gs),0.8,1.178,0.967,0.578,0.533,0.933,0.422,0.666,0.644,1.467


Results for experiment ignore_all_Buoy_none, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_Buoy_session_failed, which has 36 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.491,0.332,0.03,0.306,0.399,0.574,0.796,0.705,0.745,0.26
average_recall,0.631,0.494,0.033,0.286,0.386,0.458,0.345,0.453,0.555,0.552
average_f1-score,0.552,0.397,0.031,0.296,0.392,0.509,0.481,0.542,0.636,0.353
average_NDCG,0.654,0.519,0.035,0.346,0.477,0.54,0.529,0.612,0.767,0.471
average_M1,0.472,0.361,0.028,0.306,0.444,0.472,0.556,0.602,0.778,0.25
average_M3,0.806,0.611,0.028,0.472,0.556,0.667,0.583,0.741,0.972,0.444
average_M5,0.833,0.667,0.056,0.472,0.583,0.667,0.583,0.741,0.972,0.611
average_position,1.567,1.792,2.5,1.529,1.429,1.333,1.048,1.194,1.2,2.667
average_length (x of gs),1.511,1.794,0.967,0.98,1.136,0.983,0.481,0.765,0.831,2.419


Results for experiment ignore_all_Buoy_no_disease_found, which has 32 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.474,0.353,0.034,0.344,0.406,0.605,0.786,0.715,0.753,0.228
average_recall,0.627,0.482,0.038,0.322,0.412,0.483,0.349,0.454,0.529,0.532
average_f1-score,0.54,0.408,0.036,0.333,0.409,0.537,0.483,0.547,0.621,0.319
average_NDCG,0.663,0.491,0.039,0.389,0.505,0.579,0.561,0.641,0.784,0.485
average_M1,0.5,0.375,0.031,0.344,0.5,0.562,0.625,0.677,0.844,0.281
average_M3,0.812,0.562,0.031,0.531,0.594,0.688,0.656,0.781,1.0,0.469
average_M5,0.844,0.594,0.062,0.531,0.594,0.688,0.656,0.781,1.0,0.625
average_position,1.481,1.632,2.5,1.529,1.211,1.182,1.048,1.129,1.156,2.696
average_length (x of gs),1.541,1.693,0.967,0.98,1.16,0.953,0.48,0.74,0.786,2.551


Results for experiment ignore_all_K health_none, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_K health_session_failed, which has 35 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.492,0.337,0.031,0.281,0.453,0.581,0.762,0.694,0.738,0.268
average_recall,0.649,0.47,0.034,0.256,0.454,0.452,0.336,0.448,0.557,0.563
average_f1-score,0.56,0.393,0.032,0.268,0.453,0.508,0.466,0.536,0.635,0.363
average_NDCG,0.656,0.468,0.036,0.325,0.548,0.531,0.502,0.599,0.764,0.488
average_M1,0.457,0.286,0.029,0.286,0.514,0.457,0.514,0.581,0.771,0.257
average_M3,0.8,0.543,0.029,0.457,0.629,0.657,0.543,0.724,0.971,0.457
average_M5,0.829,0.6,0.057,0.457,0.657,0.657,0.543,0.724,0.971,0.6
average_position,1.586,1.905,2.5,1.562,1.391,1.348,1.053,1.202,1.206,2.75
average_length (x of gs),1.55,1.736,0.967,0.997,1.157,0.978,0.49,0.77,0.841,2.417


Results for experiment ignore_all_K health_no_disease_found, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_WebMD_none, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_WebMD_session_failed, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


Results for experiment ignore_all_WebMD_no_disease_found, which has 39 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.577,0.786,0.709,0.765,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.457,0.34,0.448,0.547,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.51,0.475,0.541,0.638,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.546,0.529,0.615,0.769,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.487,0.564,0.615,0.795,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.667,0.59,0.744,0.974,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.667,0.59,0.744,0.974,0.615
average_position,1.545,1.76,2.5,1.529,1.391,1.308,1.043,1.178,1.184,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,0.972,0.479,0.751,0.802,2.46


In [61]:
# Show only the experiments whose key contains the substring 'any'
# (e.g. ignore_common_any_session_failed). Note that this is a plain
# substring test on the whole key, not a match on one '_'-separated field.
displayResults({
    name: table
    for name, table in results.items()
    if 'any' in name
})

Results for experiment ignore_common_any_session_failed, which has 26 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.569,0.351,0.042,0.34,0.509,0.569,0.853,0.722,0.743,0.293
average_recall,0.713,0.505,0.046,0.306,0.483,0.487,0.363,0.475,0.576,0.598
average_f1-score,0.633,0.414,0.044,0.322,0.496,0.525,0.509,0.561,0.649,0.393
average_NDCG,0.755,0.533,0.048,0.414,0.644,0.579,0.605,0.659,0.794,0.54
average_M1,0.577,0.346,0.038,0.385,0.615,0.5,0.654,0.654,0.808,0.308
average_M3,0.885,0.615,0.038,0.577,0.769,0.731,0.692,0.808,1.0,0.538
average_M5,0.923,0.692,0.077,0.577,0.808,0.731,0.692,0.808,1.0,0.692
average_position,1.5,1.889,2.5,1.533,1.429,1.368,1.056,1.205,1.192,2.421
average_length (x of gs),1.464,1.785,0.967,0.987,1.114,1.047,0.487,0.797,0.856,2.382


Results for experiment ignore_common_any_no_disease_found, which has 3 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.889,0.5,0.361,0.167,0.5,0.389,1.0,0.722,0.778,0.143
average_recall,0.689,0.4,0.4,0.067,0.244,0.356,0.422,0.43,0.511,0.2
average_f1-score,0.776,0.444,0.38,0.096,0.328,0.372,0.594,0.528,0.617,0.167
average_NDCG,0.757,0.662,0.418,0.226,0.507,0.181,0.81,0.616,0.858,0.203
average_M1,0.667,0.667,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M3,1.0,1.0,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M5,1.0,1.0,0.667,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_position,1.333,1.333,2.5,1.0,1.0,0.0,1.0,0.667,1.0,7.0
average_length (x of gs),0.8,1.178,0.967,0.578,0.533,0.933,0.422,0.666,0.644,1.467


Results for experiment ignore_less common_any_session_failed, which has 7 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.576,0.5,0.573,0.643,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.31,0.262,0.357,0.5,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.403,0.344,0.437,0.563,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.347,0.144,0.375,0.635,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.286,0.0,0.286,0.571,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.429,0.0,0.429,0.857,0.286
average_position,2.333,2.0,0.0,2.0,0.0,1.333,0.0,0.889,1.333,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,0.786,0.5,0.722,0.881,2.31


Results for experiment ignore_less common_any_no_disease_found, which has 0 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_recall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_f1-score,,,,,,,,,,
average_NDCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_position,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_length (x of gs),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Results for experiment ignore_all_any_session_failed, which has 33 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.499,0.327,0.033,0.298,0.435,0.571,0.778,0.69,0.722,0.269
average_recall,0.643,0.484,0.036,0.272,0.421,0.449,0.341,0.45,0.56,0.552
average_f1-score,0.562,0.39,0.034,0.284,0.428,0.503,0.474,0.536,0.631,0.362
average_NDCG,0.65,0.488,0.038,0.345,0.521,0.53,0.507,0.599,0.76,0.484
average_M1,0.455,0.303,0.03,0.303,0.485,0.455,0.515,0.576,0.758,0.273
average_M3,0.788,0.576,0.03,0.485,0.606,0.667,0.545,0.727,0.97,0.455
average_M5,0.818,0.636,0.061,0.485,0.636,0.667,0.545,0.727,0.97,0.606
average_position,1.593,1.905,2.5,1.562,1.429,1.364,1.056,1.213,1.219,2.636
average_length (x of gs),1.522,1.811,0.967,0.997,1.136,0.992,0.49,0.781,0.862,2.367


Results for experiment ignore_all_any_no_disease_found, which has 3 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,Mohmmad Almadani,Noor Joudeh,average_doctor,Tala Hammouri,WebMD
average_precision,0.889,0.5,0.361,0.167,0.5,0.389,1.0,0.722,0.778,0.143
average_recall,0.689,0.4,0.4,0.067,0.244,0.356,0.422,0.43,0.511,0.2
average_f1-score,0.776,0.444,0.38,0.096,0.328,0.372,0.594,0.528,0.617,0.167
average_NDCG,0.757,0.662,0.418,0.226,0.507,0.181,0.81,0.616,0.858,0.203
average_M1,0.667,0.667,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M3,1.0,1.0,0.333,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_M5,1.0,1.0,0.667,0.333,0.667,0.0,1.0,0.667,1.0,0.0
average_position,1.333,1.333,2.5,1.0,1.0,0.0,1.0,0.667,1.0,7.0
average_length (x of gs),0.8,1.178,0.967,0.578,0.533,0.933,0.422,0.666,0.644,1.467


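Now let us combine the individual results of the apps: for each combined label, we take every app's own column from the experiment whose key names that app, and place these columns side by side in a single table.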

In [62]:
# Row labels of every results table, in the order they should appear.
metricRows = [
    "average_" + metric
    for metric in (
        "precision", "recall", "f1-score", "NDCG", "M1", "M3", "M5", "position", "length (x of gs)"
    )
]

# Build one table per combined label. Each column belongs to one app and is
# taken from the experiment whose key names that app.
combinedResults = {}
for label, keys in combineLabels.items():
    collectedResults = {}
    for key in keys:
        # The app name is the third '_'-separated field of the experiment key.
        app = key.split('_')[2]
        result = results[key]
        # Keep only this app's own column from its experiment's table.
        collectedResults[app] = result.loc[:, app]
    combinedResults[label] = pd.DataFrame(collectedResults, index=metricRows)

In [63]:
# Display the per-label combined tables. The number of cases is not printed
# here (printNumCases=False), since each column comes from a different experiment.
displayResults(
    combinedResults,
    printNumCases=False,
)

Results for experiment ignore_common_none is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.538,0.369,0.034,0.312,0.46,0.272
average_recall,0.694,0.515,0.038,0.291,0.455,0.595
average_f1-score,0.606,0.43,0.036,0.301,0.457,0.373
average_NDCG,0.752,0.551,0.039,0.37,0.586,0.527
average_M1,0.594,0.406,0.031,0.344,0.562,0.281
average_M3,0.906,0.625,0.031,0.5,0.688,0.531
average_M5,0.938,0.688,0.062,0.5,0.719,0.688
average_position,1.467,1.727,2.5,1.5,1.391,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,2.493


Results for experiment ignore_common_session_failed is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.538,0.369,0.035,0.345,0.526,0.272
average_recall,0.694,0.515,0.039,0.321,0.52,0.595
average_f1-score,0.606,0.43,0.037,0.333,0.523,0.373
average_NDCG,0.752,0.551,0.04,0.408,0.669,0.527
average_M1,0.594,0.406,0.032,0.379,0.643,0.281
average_M3,0.906,0.625,0.032,0.552,0.786,0.531
average_M5,0.938,0.688,0.065,0.552,0.821,0.688
average_position,1.467,1.727,2.5,1.5,1.391,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,2.493


Results for experiment ignore_common_no_disease_found is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.538,0.369,0.361,0.385,0.46,0.272
average_recall,0.694,0.515,0.4,0.358,0.455,0.595
average_f1-score,0.606,0.43,0.38,0.371,0.457,0.373
average_NDCG,0.752,0.551,0.418,0.455,0.586,0.527
average_M1,0.594,0.406,0.333,0.423,0.562,0.281
average_M3,0.906,0.625,0.333,0.615,0.688,0.531
average_M5,0.938,0.688,0.667,0.615,0.719,0.688
average_position,1.467,1.727,2.5,1.5,1.391,2.542
average_length (x of gs),1.486,1.695,0.967,0.967,1.143,2.493


Results for experiment ignore_less common_none is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.286
average_position,2.333,2.0,0.0,2.0,0.0,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,2.31


Results for experiment ignore_less common_session_failed is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.243,0.238,0.0,0.143,0.16,0.18
average_recall,0.381,0.405,0.0,0.143,0.19,0.381
average_f1-score,0.297,0.3,,0.143,0.174,0.244
average_NDCG,0.259,0.32,0.0,0.09,0.063,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.143
average_M3,0.429,0.429,0.0,0.143,0.0,0.143
average_M5,0.429,0.429,0.0,0.143,0.0,0.286
average_position,2.333,2.0,0.0,2.0,0.0,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,2.31


Results for experiment ignore_less common_no_disease_found is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.243,0.238,0.0,0.167,0.16,0.18
average_recall,0.381,0.405,0.0,0.167,0.19,0.381
average_f1-score,0.297,0.3,,0.167,0.174,0.244
average_NDCG,0.259,0.32,0.0,0.105,0.063,0.277
average_M1,0.0,0.143,0.0,0.0,0.0,0.143
average_M3,0.429,0.429,0.0,0.167,0.0,0.143
average_M5,0.429,0.429,0.0,0.167,0.0,0.286
average_position,2.333,2.0,0.0,2.0,0.0,4.0
average_length (x of gs),1.738,1.905,0.0,1.028,1.214,2.31


Results for experiment ignore_all_none is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.485,0.345,0.028,0.282,0.406,0.256
average_recall,0.638,0.495,0.031,0.264,0.408,0.557
average_f1-score,0.551,0.407,0.029,0.273,0.407,0.351
average_NDCG,0.663,0.51,0.032,0.319,0.492,0.482
average_M1,0.487,0.359,0.026,0.282,0.462,0.256
average_M3,0.821,0.59,0.026,0.436,0.564,0.462
average_M5,0.846,0.641,0.051,0.436,0.59,0.615
average_position,1.545,1.76,2.5,1.529,1.391,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,2.46


Results for experiment ignore_all_session_failed is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.485,0.345,0.029,0.306,0.453,0.256
average_recall,0.638,0.495,0.032,0.286,0.454,0.557
average_f1-score,0.551,0.407,0.03,0.296,0.453,0.351
average_NDCG,0.663,0.51,0.033,0.346,0.548,0.482
average_M1,0.487,0.359,0.026,0.306,0.514,0.256
average_M3,0.821,0.59,0.026,0.472,0.629,0.462
average_M5,0.846,0.641,0.053,0.472,0.657,0.615
average_position,1.545,1.76,2.5,1.529,1.391,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,2.46


Results for experiment ignore_all_no_disease_found is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.485,0.345,0.361,0.344,0.406,0.256
average_recall,0.638,0.495,0.4,0.322,0.408,0.557
average_f1-score,0.551,0.407,0.38,0.333,0.407,0.351
average_NDCG,0.663,0.51,0.418,0.389,0.492,0.482
average_M1,0.487,0.359,0.333,0.344,0.462,0.256
average_M3,0.821,0.59,0.333,0.531,0.564,0.462
average_M5,0.846,0.641,0.667,0.531,0.59,0.615
average_position,1.545,1.76,2.5,1.529,1.391,2.704
average_length (x of gs),1.532,1.733,0.967,0.98,1.157,2.46
