# Results
In this notebook, we define and calculate the statistics of our tests.

In [23]:
# load data

import json
import pandas as pd
from collections import defaultdict
import math
import os

def loadData(fileName):
    '''Loading data from result files'''
    with open(f'{fileName}.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
        return data

def normalize(cases):
    '''make all the ddx list of a case of the same length by padding with None'''
    for case in cases.values():
        maxLen = max(len(result) for result in case.values())
        for result in case.values():
            result += [None]*(maxLen-len(result))

        assert len(set(len(result) for result in case.values())) == 1
    
    return cases

def getDataframe(case):
    '''Convert each test case into a dataframe'''
    return pd.DataFrame(
        case,
        columns=['gs',*sorted([key for key in case.keys() if key != 'gs'])],
        index= list(range(1,1+len(list(case.values())[0])))
        )


In [24]:
# We need to make all the differentials of the same length to ease comparison
# We pad the lists with None
data = loadData('allResults-duplicate')
# keys = loadData('collect/maram')
data = {k: v for k, v in data.items() if k in
        ['75', '64', '63', '62', '60', '59', '58', '53', '52', '24', '17','16', '13', '11', '6', '4', '2']}
for caseNum in data:
    for app in data[caseNum]:
        data[caseNum][app] = [r for r in data[caseNum][app] if r]
for caseNum, tests in data.items():
    assert len(
        tests.keys()) == 10, f"app missing in case {caseNum}, {tests.keys()}"
normalizedData = normalize(data)
cases = {int(id): getDataframe(case) for id, case in normalizedData.items()}
caseClassification = loadData('case-classification')
f'We have {len(cases)} cases in the experiment.'


'We have 17 cases in the experiment.'

Let us have a look at all the cases.

In [25]:
from IPython.display import display
for caseNum, case in cases.items():
    print(f"Case number {caseNum}")
    display(case)

Case number 2


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,unstable angina,unstable angina,unstable angina,,,myocardial infarction,stable angina,unstable angina,myocardial infarction,myocardial infarction
2,myocardial infarction,collapsed lung,myocardial infarction,,,musculoskeletal chest pain,unstable angina,myocardial infarction,unstable angina,broken ribs
3,stable angina,myocardial infarction,stable angina,,,cardia arrhythmias,myocardial infarction,,heart failure,costochondritis
4,pericardial effusion,chronic obstructive pulmonary disease,varient angina,,,unstable angina,,,,heartburn/gerd
5,,stable angina,,,,,,,,unstable angina
6,,,,,,,,,,pulmonary embolism


Case number 4


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,asthma,chronic obstructive pulmonary disease,asthma,,heart issue,asthma,asthma,asthma,asthma,asthma
2,chronic obstructive pulmonary disease,asthma,chronic obstructive pulmonary disease,,asthma,gastroesophageal reflux,acute bronchitis,pulmonary edema,pulmonary fibrosis,chronic obstructive pulmonary disease
3,,chronic sarcoidosis,cystic fibrosis,,chronic obstructive pulmonary disease,heart failure,hay fever,,sarcoidosis,pulmonary embolism
4,,paradoxical vocal fold motion,pericardial effusion,,,,,,,pneumonia
5,,,bronchiactasis,,,,,,,bronchitis
6,,,acute bronchitis,,,,,,,diastolic heart failure


Case number 6


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,,chronic obstructive pulmonary disease,heart failure,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,lung tumor,bronchitis
2,pulmonary edema,heart failure,asthma,,heart failure,chronic obstructive pulmonary disease,asthma,,chronic obstructive pulmonary disease,asthma
3,heart failure,asthma,heart failure,,,pleural effusion,heart failure,,bronchiectasis,eczema
4,pericardial effusion,,cardiomyopathy,,,,,,tb,pneumonia
5,asthma,,bronchiectasis,,,,,,,pneumonia
6,,,valvular heart disease,,,,,,,coronavirus disease 2019


Case number 11


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,acute cholangitis,acute cholangitis,acute cholangitis,,,acute cholecystitis,acute cholecystitis,acute cholecystitis,acute cholangitis,acute cholecystitis
2,acute cholecystitis,cholelithiasis,acute cholecystitis,,,acute pancreatitis,acute cholangitis,,acute cholecystitis,acute gastroenteritis
3,hepatitis,acute cholecystitis,acute pancreatitis,,,food poisoning,hepatitis,,peptic ulcer disease,acute pancreatitis
4,acute pancreatitis,acute pancreatitis,acute appendicitis,,,,liver abscess,,,hepatitis
5,cholelithiasis,,cholelithiasis,,,,acute pancreatitis,,,cirrhosis (liver)
6,,,liver abscess,,,,,,,hepatitis


Case number 13


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,bronchiolitis,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,acute bronchitis
2,acute bronchitis,asthma,acute bronchitis,stable angina,chronic lung issue,asthma,acute bronchitis,asthma,viral,pneumonia
3,asthma,,pulmonary edema,small cell lung cancer,,,croup,acute bronchitis,coronavirus disease 2019,pneumonia
4,pulmonary edema,,cystic fibrosis,,,,asthma,,,coronavirus disease 2019
5,common cold,,valvular heart disease,,,,,,,heart failure
6,,,pericardial effusion,,,,,,,asthma
7,,,,,,,,,,chronic obstructive pulmonary disease


Case number 16


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,acute pancreatitis,acute pancreatitis,acute pancreatitis,,,acute cholecystitis,pneumonia,acute pancreatitis,acute pancreatitis,peptic ulcer disease
2,acute cholecystitis,gastritis,acute cholangitis,,,nephrolithiasis,myocardial infarction,,peptic ulcer disease,gastritis
3,gastritis,liver abscess,acute appendicitis,,,acute pancreatitis,acute cholecystitis,,,acute pancreatitis
4,peptic ulcer disease,peptic ulcer disease,acute mesentric ischemia,,,general abdominal pain,hepatitis,,,gastroesophageal reflux
5,chronic pancreatitis,acute cholecystitis,small bowel obstruction,,,bowel obstruction,,,,peptic ulcer disease
6,,,gastritis,,,,,,,acute gastroenteritis
7,,,,,,,,,,acute cholecystitis


Case number 17


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,cholelithiasis,acute cholecystitis,acute cholecystitis,,acute cholecystitis,acute cholecystitis,cholelithiasis,cholelithiasis,biliary colic,hepatitis a
2,acute cholecystitis,cholelithiasis,cholelithiasis,,food poisoning,gastritis,hepatic congestion,,cholelithiasis,acute cholecystitis
3,acute pancreatitis,acute pancreatitis,acute pancreatitis,,cholelithiasis,acute appendicitis,,,peptic ulcer disease,acute pancreatitis
4,acute cholangitis,viral stomach bug in adults,acute cholangitis,,,nephrolithiasis,,,,cholelithiasis
5,,gastritis,acute appendicitis,,,acute pancreatitis,,,,hepatitis
6,,,peptic ulcer disease,,,bowel obstruction,,,,cirrhosis liver


Case number 24


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,coronavirus disease 2019,coronavirus disease 2019,influenza,,coronavirus disease 2019,influenza,acute bronchitis,pneumonia,coronavirus disease 2019,chronic sinusitis
2,pneumonia,influenza,common cold,,,common cold,sinusitis,bronchial asthma exacerbation,influenza,common cold
3,influenza,common cold,coronavirus disease 2019,,,,allergy,,acute bronchitis,asthma
4,common cold,pneumonia,malaria,,,,,,,coronavirus disease 2019
5,acute bronchitis,,pneumonia,,,,,,,pneumonia
6,,,tonsillopharyngitis,,,,,,,influenza


Case number 52


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,brucellosis,chronic fatigue syndrome,mixed connective tissue disease,,lyme disease,meningitis,salmonellosis,cat scratch disease,brucellosis,influenza (flu) adults
2,cat scratch disease,inflammatory bowel disease,brucellosis,,inflammatory condition,inflammatory bowel disease,brucellosis,,enteric fever,bacterial pneumonia
3,,osteomyelitis,systemic lupus erythromatosus,,acute viral hepatitis,mononucleosis,rabies,,rheumatoid arthritis,strep throat
4,,axial spondyloarthritis,malaria,,,dehydration,influenza,,,viral gastroenteritis
5,,heart muscle inflammation,dengue fever,,,lyme disease,coronavirus disease 2019,,,viral pneumonia
6,,,,,,,,,,small bowel obstruction
7,,,,,,,,,,drug allergy


Case number 53


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,mononucleosis,quinsy,mononucleosis,,,,viral upper respiratory tract infection ( com...,tonsillopharyngitis,tonsillopharyngitis,tonsillopharyngitis
2,tonsillopharyngitis,tonsillopharyngitis,tonsillopharyngitis,,,,mononucleosis,,mononucleosis,mononucleosis
3,,mononucleosis,lymphadenitis,,,,scarlet fever,,,influenza
4,,diphtheria,lymphoma,,,,measles,,,appendicitis
5,,scarlet fever,leukemia,,,,,,,bacterial pneumonia
6,,,juvenile idiopathic arthritis,,,,,,,peritonsillar abscess
7,,,,,,,,,,pneumococcal meningitis


Case number 58


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,urinary tract infection,urinary tract infection,urinary tract infection,,urinary tract infection,urinary tract infection,urinary tract infection,urinary tract infection,urinary tract infection,urinary tract infection
2,pyelonephritis,bladder stone,pyelonephritis,,pyelonephritis,pyelonephritis,,,,ureteral calculus
3,,pyelonephritis,endometriosis,,kidney stone,kidney stones,,,,vaginal yeast infection
4,,,bladder cancer,,,,,,,bacterial vaginosis
5,,,nephrolithiasis,,,,,,,diverticulitis
6,,,overactive bladder,,,,,,,pyelonephritis


Case number 59


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,fibroadenoma,fibroadenoma,fibroadenoma,,fibroadenoma,,fibroadenoma,fibroadenoma,fibroadenoma,breast cancer (female)
2,breast cyst,breast cancer,breast cancer,,breast cyst,,breast cyst,,,breast infection
3,,benign breast lumps,breast fat necrosis,,intraductal papilloma,,,,,hereditary breast and ovarian cancer syndrome
4,,,,,,,,,,fibroadenoma
5,,,,,,,,,,breast cyst
6,,,,,,,,,,lymphocytic lymphoma


Case number 60


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,endometriosis,endometriosis,endometriosis,,menstrual cramps,endometriosis,primary dismenorria,primary dysmenorrhea,endometriosis,dysmenorrhea (menstrual cramps)
2,adenomyosis,primary dysmenorrhea,adenomyosis,,endometriosis,dysmenorrhea,endometriosis,fibroids,adenomyosis,uterine fibroids
3,,chronic pelvic pain,secondary dysmenorrhea,,uterine fibroids,,adenomyosis,,pelvic inflammatory disease,endometriosis
4,,adenomyosis,,,,,pelvic inflammatory disease,,,pelvic inflammatory disease (pid)
5,,,,,,,,,,cervicitis
6,,,,,,,,,,interstitial cystitis
7,,,,,,,,,,irritable bowel syndrome
8,,,,,,,,,,fibromyalgia


Case number 62


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,peptic ulcer disease,chronic gastritis,peptic ulcer disease,,chronic dyspepsia or gastritis,gastritis,peptic ulcer disease,peptic ulcer disease,peptic ulcer disease,peptic ulcer disease
2,functional dyspepsia,gastroparesis,acute pancreatitis,,cyclic vomiting syndrome,peptic ulcer disease,gastroesophageal reflux,,gastritis,viral gastroenteritis
3,,functional dyspepsia,cholelithiasis,,peptic ulcer disease,irritable bowel syndrome,esophigiti,,,acute cholecystitis
4,,peptic ulcer disease,myocardial infarction,,,cholecystitis,gastritis,,,gastritis
5,,stomach cancer,small bowel obstruction,,,,,,,acute necrotizing pancreatitis
6,,,,,,,,,,heartburn/gerd
7,,,,,,,,,,peptic ulcer disease
8,,,,,,,,,,bile reflux gastritis
9,,,,,,,,,,irritable bowel syndrome


Case number 63


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,malaria,campylobacter gastroenteritis,dengue fever,,viral gastroenteritis (stomach bug),influenza,malaria,infective gastroenteritis,gastroenteritis,influenza
2,dengue fever,malaria,malaria,,coronavirus disease 2019,gastroenteritis,,,malaria,viral gastroenteritis
3,influenza,boutonneuse fever,acute viral gastroenteritis,,influenza,pneumonia,,,cholera,bacterial pneumonia
4,,viral stomach bug in adults,influenza,,,pyelonephritis,,,,viral pneumonia
5,,borne relapsing fever,acute bacterial gastroenteritis,,,,,,,irritable bowel syndrome
6,,,colitis,,,,,,,lactose intolerance


Case number 64


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,shingles,shingles,shingles,,atypical chest pain,shingles,shingles,shingles,shingles,heart attack (male)
2,contact dermatitis,contact dermatitis,bullous pemphigoid,,atrial fibrillation,pulmonary embolism,,,,broken (fractured) rib(s)
3,,folliculitis,contact dermatitis,,,heart attack,,,,costochondritis
4,,,atopic dermatitis,,,,,,,heartburn/gerd
5,,,cellulitis,,,,,,,unstable angina pectoris
6,,,,,,,,,,pulmonary embolism
7,,,,,,,,,,acute pericarditis
8,,,,,,,,,,esophagitis


Case number 75


Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,WebMD
1,pelvic inflammatory disease,cervicitis,pelvic inflammatory disease,,kidney infection (pyelonephritis),pelvic inflammatory disease,urinary tract infection,cystitis,endometriosis,pelvic inflammatory disease
2,acute appendicitis,pelvic inflammatory disease,ectopic pregnancy,,bacterial vaginosis,pregnancy,vaginitis,pelvic inflammatory disease,pelvic inflammatory disease,miscarriage
3,cystitis,bladder stone,acute appendicitis,,pelvic inflammatory disease,ectopic pregnancy,,pelvic pain syndrome,salpingitis,acute appendicitis
4,,ruptured ovarian cyst,endometriosis,,,,,,acute appendicitis,diverticulitis
5,,chronic pelvic pain,secondary dysmenorrhea,,,,,,,endometrial cancer
6,,,primary dysmenorrhea,,,,,,,influenza
7,,,,,,,,,,ectopic pregnancy


## Let us define the metrics now.

### Terms used
- TP: True positive (correct disease retrieved)
- TN: True negative (wrong disease **not** retrieved)
- FP: False positive (wrong disease retrieved)
- FN: False negative (correct disease **not** retrieved)
- gold standard - the correct list of diseases as determined by collective intelligence of doctors

### Precision
Precision helps us understand how exact our results are. It gives us an intuition about how many wrong diseases (false positives) are being retrieved. It is the ratio *number of correct diseases retrieved* to the *length of the complete list retrieved*.
$$precision = \frac{TP}{TP + FP} = \frac{TP}{\text{length of differential list}}$$

### Recall
Recall is a measure of how many of the correct diseases are being retrieved. It is the ratio *number of correct diseases retrieved* to the *length of the gold standard list*.
$$recall = \frac{TP}{TP + FN} = \frac{TP}{\text{length of the gold standard}}$$

### F1 Score
F1 score is the weighted harmonic mean of *precision* and *recall*. It is a metric that combines *precision* and *recall* and gives us 1 score for easier comparison.

Suppose $\beta$ defines how important is $recall$ to $precision$ then,
$$fscore_{\beta} = (1 + \beta^2)\frac{precision \cdot recall}{(\beta^2 \cdot precision) + recall}$$
Substituting $\beta = 1$,
$$fscore_{1} = \frac{2 \cdot precision \cdot recall}{ precision + recall}$$

### NDCG
NDCG or Normalized Discounted Cumulative Gain is measure of how accurate the ranking is. In our calculations, we use
$$DCG = \sum_{i=1}^n\frac{2^{relevance_i}-1}{log_2(i+1)}$$
where $n$ is the number of differentials in the returned list and  
$relevance_i = |gold standard| - rank_{gold\ standard}(ddx[i])$ if $ddx[i]$ is present, 0 otherwise.

$$NDCG = \frac{DCG_{ddx}}{DCG_{gold\ standard}}$$

### M Score
M Score determines where the top disease (gold standard) appears in the returned differential.
$$M_i = \text{gold standard[0]} \in \text{ddx[:i]}$$

### Position
Shows the position of the gold standard[0] in the returned differential. 

### Length
$$length = \frac{|ddx|}{|gold\ standard|}$$


In [26]:
import math

beta = 1 #used in recall

def getPrecision(goldStandard: pd.Series, candidate: pd.Series) -> float:
    tp = sum(int(disease in goldStandard.values and disease is not None)
             for disease in candidate)
    return tp if tp == 0 else tp/candidate.count()


def getRecall(goldStandard: pd.Series, candidate: pd.Series) -> float:
    tp = sum(int(disease in goldStandard.values and disease is not None)
             for disease in candidate)
    return tp/goldStandard.count()


def getF1Score(precision: float, recall: float, beta: float = 1) -> float:
    return math.nan if precision+recall == 0 else \
        (1+beta**2)*precision*recall/(precision*(beta**2)+recall)


def getNDCG(goldStandard: pd.Series, candidate: pd.Series, scores) -> float:
    def discount(score: float, index: int) -> float:
        '''The index is 1 based'''
        return (math.pow(2, score)-1)/math.log2(index+1)

    maxDCG = sum(discount(scores[i], i+1) for i in range(len(scores)))

    candidateRelevance = []
    goldStandard = list(goldStandard)
    for index, disease in enumerate(candidate):
        if disease is not None and disease in goldStandard:
            candidateRelevance.append(
                discount(scores[goldStandard.index(disease)], index+1))
        else:
            candidateRelevance.append(0)

    return sum(candidateRelevance)/maxDCG


def getMScore(goldStandard: pd.Series, candidate: pd.Series, m=1) -> bool:
    return goldStandard.values[0] in candidate.values[:m]


def getPosition(goldStandard: pd.Series, candidate: pd.Series) -> bool:
    return math.nan if goldStandard.values[0] not in candidate.values else\
        1 + list(candidate.values).index(goldStandard.values[0])


def getLength(goldStandard: pd.Series, candidate: pd.Series) -> int:
    return math.nan if candidate.count() == 0 else \
        candidate.count()/goldStandard.count()


def getScoresCase(case: pd.DataFrame, beta:float = 1) -> pd.DataFrame:
    scores = [
        [getPrecision(case.iloc[:, 0], case.iloc[:, i])
         for i in range(1, len(case.columns))],
        [getRecall(case.iloc[:, 0], case.iloc[:, i])
         for i in range(1, len(case.columns))]
    ]

    scores.append([getF1Score(scores[0][i], scores[1][i],beta=beta)
                  for i in range(len(case.columns)-1)])
    
    # relevance for a list of 4 is 4, 3, 2, 1
    # relevance for a list of 2 is 2, 1
    scores.append([getNDCG(case.iloc[:, 0], case.iloc[:, i],
                           list(range(case.iloc[:, 0].count(), 0, -1)))
                   for i in range(1, len(case.columns))])

    for m in range(1, 6, 2):
        scores.append([getMScore(case.iloc[:, 0], case.iloc[:, i], m)
                      for i in range(1, len(case.columns))])

    scores.append([getPosition(case.iloc[:, 0], case.iloc[:, i])
                   for i in range(1, len(case.columns))])

    scores.append([getLength(case.iloc[:, 0], case.iloc[:, i])
                   for i in range(1, len(case.columns))])

    return pd.DataFrame(scores, columns=case.columns[1:],
                        index=["precision", "recall", "f1-score", "NDCG",
                               "M1", "M3", "M5", "position", "length (x of gs)"],
                        )

scores = {id:getScoresCase(case) for id, case in cases.items()}

In [27]:
scores[2]
def rankScore(score):
    return score.loc['M1','Avey']

rankedScores = sorted([(k,rankScore(v)) for k,v in scores.items()],key=lambda x: x[1])
# print([a for a,b in rankedScores])
display(dict(rankedScores))

{17: False,
 24: False,
 52: False,
 63: False,
 2: True,
 4: True,
 6: True,
 11: True,
 13: True,
 16: True,
 53: True,
 58: True,
 59: True,
 60: True,
 62: True,
 64: True,
 75: True}

Let us define the experiment now. We will pick which cases to compute startistics for.

In [28]:
from collections import defaultdict
experiments = {}
combineLabels = defaultdict(list)

def addExperiment(caseType,casesToConsider):
    #add experiment to ignore no case
    experiments[f'ignore_{caseType}_none'] = set(casesToConsider)

    for app, classifications in caseClassification['apps'].items():
        for classification, caseNums in classifications.items():
            #record labels to combine later
            combineLabels[f'ignore_{caseType}_{classification}'].append(f'ignore_{caseType}_{app}_{classification}')
            #experiment to ignore the cases for the particular app
            experiments[f'ignore_{caseType}_{app}_{classification}'] = set(casesToConsider) - set(caseNums)
            #experiment to ignore the cases for all the apps
            if f'ignore_{caseType}_any_{classification}' in experiments:
                experiments[f'ignore_{caseType}_any_{classification}'] =\
                 experiments[f'ignore_{caseType}_any_{classification}'] - set(caseNums)
            else:
                experiments[f'ignore_{caseType}_any_{classification}'] = experiments[f'ignore_{caseType}_{app}_{classification}']

addExperiment('common',caseClassification['common'])
addExperiment('less-common',caseClassification['less common'])
addExperiment('all-cases',set(caseClassification['less common']) | set(caseClassification['common']))
    

list(enumerate(experiments.keys()))


[(0, 'ignore_common_none'),
 (1, 'ignore_common_Ada_session_failed'),
 (2, 'ignore_common_any_session_failed'),
 (3, 'ignore_common_Ada_no_disease_found'),
 (4, 'ignore_common_any_no_disease_found'),
 (5, 'ignore_common_Avey_session_failed'),
 (6, 'ignore_common_Avey_no_disease_found'),
 (7, 'ignore_common_Babylon_session_failed'),
 (8, 'ignore_common_Babylon_no_disease_found'),
 (9, 'ignore_common_Buoy_session_failed'),
 (10, 'ignore_common_Buoy_no_disease_found'),
 (11, 'ignore_common_K health_session_failed'),
 (12, 'ignore_common_K health_no_disease_found'),
 (13, 'ignore_common_WebMD_session_failed'),
 (14, 'ignore_common_WebMD_no_disease_found'),
 (15, 'ignore_less-common_none'),
 (16, 'ignore_less-common_Ada_session_failed'),
 (17, 'ignore_less-common_any_session_failed'),
 (18, 'ignore_less-common_Ada_no_disease_found'),
 (19, 'ignore_less-common_any_no_disease_found'),
 (20, 'ignore_less-common_Avey_session_failed'),
 (21, 'ignore_less-common_Avey_no_disease_found'),
 (22, 'ig

In [29]:
def getAverage(scores, row: int, col: int) -> float:
    values = []
    for score in scores.values():
        if not math.isnan(score.iloc[row, col]):
            values.append(score.iloc[row, col])

    return 0 if not values else round(sum(values)/len(values), 3)

results = {}
for label, casesToConsider in experiments.items():
    selectedScores = {id:score for id,score in scores.items() if int(id) in casesToConsider}
    columns = next(iter(scores.values())).columns
    averageScores = pd.DataFrame(
        [
            [
                getAverage(selectedScores, row, col)
                for col in range(len(columns))
            ]
            for row in range(9)
        ],
        columns=columns,
        index=[
            f"average_{x}" for x in [
                "precision", "recall", "f1-score", "NDCG", "M1", "M3", "M5", "position", "length (x of gs)"
            ]
        ]
    )

    # ignore nan for recall
    for col in next(iter(scores.values())).columns:
        p = averageScores.loc["average_precision", col]
        r = averageScores.loc["average_recall", col]
        averageScores.loc["average_f1-score",
                        col] = round(getF1Score(p, r, beta), 3)

    doctorResults = averageScores.loc[:, "MA"] +\
        averageScores.loc[:, "NJ"] +\
        averageScores.loc[:,
                            "TH"]
    doctorResults /= 3.0
    
    averageScores.insert(
        loc=8, column="average_doctor",
        value=doctorResults.round(3),
    )

    results[label] = averageScores


Let us print all the results. The experiments are nomenclatured as follows:
- **[common|uncommon|all]:** means whether only common cases were considered, only uncommon cases were considered, and so on.
- **app name / any:** If an app name is present, then we ignore only those cases that pertain to it. If the label is *any* then we consider all apps in that experiment.
- **failure type**: The apps can fail in 2 ways. Either a session does not complete due to some reason or the app fails to retrieve any diagnosis. If this is set to None, then we ignore the failures and consider all cases under option 1 above.

In [30]:
def displayResults(results,printNumCases=True):
    for label, result in results.items():
        result.to_csv(f'stats/{label}.json',sep=';')
        if printNumCases:
            print(f'Results for experiment {label}, which has {len(set(experiments[label]) & set(scores.keys()))} cases, is')
        else:
            print(f'Results for experiment {label} is')
        display(result)

displayResults({key:val for key, val in results.items() if 'any' not in key})

Results for experiment ignore_common_none, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Ada_session_failed, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Ada_no_disease_found, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Avey_session_failed, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Avey_no_disease_found, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Babylon_session_failed, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Babylon_no_disease_found, which has 1 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,1.0,0.5,0.333,0.5,1.0,0.5,1.0,0.333,0.611,0.429
average_recall,0.4,0.6,0.2,0.2,0.4,0.4,0.6,0.2,0.4,0.6
average_f1-score,0.571,0.545,0.25,0.286,0.571,0.444,0.75,0.25,0.481,0.5
average_NDCG,0.776,0.919,0.679,0.679,0.776,0.273,0.94,0.679,0.631,0.61
average_M1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M3,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M5,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_position,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,7.0
average_length (x of gs),0.4,1.2,0.6,0.4,0.4,0.8,0.6,0.6,0.667,1.4


Results for experiment ignore_common_Buoy_session_failed, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_Buoy_no_disease_found, which has 11 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.624,0.448,0.03,0.561,0.492,0.583,0.788,0.629,0.667,0.309
average_recall,0.762,0.788,0.018,0.512,0.458,0.495,0.402,0.492,0.463,0.683
average_f1-score,0.686,0.571,0.022,0.535,0.474,0.535,0.532,0.552,0.54,0.425
average_NDCG,0.805,0.885,0.062,0.616,0.607,0.612,0.669,0.734,0.672,0.593
average_M1,0.636,0.818,0.091,0.455,0.545,0.636,0.727,0.727,0.697,0.364
average_M3,0.909,1.0,0.091,0.909,0.727,0.727,0.818,1.0,0.848,0.455
average_M5,1.0,1.0,0.091,0.909,0.727,0.727,0.818,1.0,0.848,0.727
average_position,1.545,1.273,1.0,1.8,1.25,1.125,1.111,1.273,1.17,2.889
average_length (x of gs),1.429,1.918,0.6,1.023,1.14,0.97,0.586,0.871,0.809,2.648


Results for experiment ignore_common_K health_session_failed, which has 14 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.667,0.477,0.024,0.393,0.499,0.551,0.833,0.661,0.682,0.362
average_recall,0.802,0.794,0.014,0.331,0.452,0.479,0.38,0.515,0.458,0.701
average_f1-score,0.728,0.596,0.018,0.359,0.474,0.512,0.522,0.579,0.538,0.477
average_NDCG,0.811,0.896,0.049,0.413,0.596,0.573,0.623,0.745,0.647,0.613
average_M1,0.643,0.857,0.071,0.286,0.429,0.429,0.643,0.643,0.572,0.286
average_M3,0.929,1.0,0.071,0.643,0.643,0.714,0.714,1.0,0.809,0.5
average_M5,1.0,1.0,0.071,0.643,0.714,0.714,0.714,1.0,0.809,0.714
average_position,1.571,1.214,1.0,1.889,1.7,1.4,1.1,1.357,1.286,2.909
average_length (x of gs),1.412,1.857,0.6,0.975,1.077,1.015,0.525,0.845,0.795,2.41


Results for experiment ignore_common_K health_no_disease_found, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_WebMD_session_failed, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_common_WebMD_no_disease_found, which has 15 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.644,0.468,0.022,0.411,0.466,0.581,0.844,0.683,0.703,0.36
average_recall,0.782,0.774,0.013,0.376,0.422,0.513,0.388,0.514,0.472,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.443,0.545,0.532,0.587,0.555,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.557,0.602,0.637,0.75,0.663,0.603
average_M1,0.667,0.867,0.067,0.333,0.4,0.467,0.667,0.667,0.6,0.267
average_M3,0.933,1.0,0.067,0.667,0.6,0.733,0.733,1.0,0.822,0.467
average_M5,1.0,1.0,0.067,0.667,0.667,0.733,0.733,1.0,0.822,0.733
average_position,1.533,1.2,1.0,1.8,1.7,1.364,1.091,1.333,1.263,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,1.014,0.523,0.822,0.786,2.449


Results for experiment ignore_less-common_none, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Ada_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Ada_no_disease_found, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Avey_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Avey_no_disease_found, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Babylon_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Babylon_no_disease_found, which has 0 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_recall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_f1-score,,,,,,,,,,
average_NDCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_position,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_length (x of gs),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Results for experiment ignore_less-common_Buoy_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_Buoy_no_disease_found, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_K health_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_K health_no_disease_found, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_WebMD_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_WebMD_no_disease_found, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_all-cases_none, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Ada_session_failed, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Ada_no_disease_found, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Avey_session_failed, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Avey_no_disease_found, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Babylon_session_failed, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Babylon_no_disease_found, which has 1 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,1.0,0.5,0.333,0.5,1.0,0.5,1.0,0.333,0.611,0.429
average_recall,0.4,0.6,0.2,0.2,0.4,0.4,0.6,0.2,0.4,0.6
average_f1-score,0.571,0.545,0.25,0.286,0.571,0.444,0.75,0.25,0.481,0.5
average_NDCG,0.776,0.919,0.679,0.679,0.776,0.273,0.94,0.679,0.631,0.61
average_M1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M3,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M5,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_position,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,7.0
average_length (x of gs),0.4,1.2,0.6,0.4,0.4,0.8,0.6,0.6,0.667,1.4


Results for experiment ignore_all-cases_Buoy_session_failed, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_Buoy_no_disease_found, which has 13 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.544,0.433,0.026,0.5,0.436,0.586,0.744,0.583,0.638,0.274
average_recall,0.671,0.782,0.015,0.459,0.413,0.483,0.378,0.481,0.447,0.604
average_f1-score,0.601,0.557,0.019,0.479,0.424,0.53,0.501,0.527,0.519,0.377
average_NDCG,0.717,0.853,0.052,0.526,0.522,0.615,0.587,0.721,0.641,0.51
average_M1,0.538,0.692,0.077,0.385,0.462,0.615,0.615,0.692,0.641,0.308
average_M3,0.846,1.0,0.077,0.769,0.615,0.769,0.692,1.0,0.82,0.385
average_M5,0.923,1.0,0.077,0.769,0.615,0.769,0.692,1.0,0.82,0.615
average_position,1.583,1.385,1.0,1.8,1.25,1.2,1.111,1.308,1.206,2.889
average_length (x of gs),1.529,1.969,0.6,1.058,1.269,1.038,0.56,0.929,0.842,2.664


Results for experiment ignore_all-cases_K health_session_failed, which has 16 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.596,0.461,0.021,0.365,0.452,0.557,0.792,0.62,0.656,0.327
average_recall,0.723,0.789,0.012,0.31,0.417,0.471,0.364,0.503,0.446,0.634
average_f1-score,0.653,0.582,0.015,0.335,0.434,0.51,0.499,0.555,0.521,0.431
average_NDCG,0.739,0.869,0.042,0.365,0.528,0.581,0.563,0.733,0.626,0.543
average_M1,0.562,0.75,0.062,0.25,0.375,0.438,0.562,0.625,0.542,0.25
average_M3,0.875,1.0,0.062,0.562,0.562,0.75,0.625,1.0,0.792,0.438
average_M5,0.938,1.0,0.062,0.562,0.625,0.75,0.625,1.0,0.792,0.625
average_position,1.6,1.312,1.0,1.889,1.7,1.417,1.1,1.375,1.297,2.909
average_length (x of gs),1.496,1.906,0.6,1.021,1.189,1.066,0.511,0.896,0.824,2.452


Results for experiment ignore_all-cases_K health_no_disease_found, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_WebMD_session_failed, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


Results for experiment ignore_all-cases_WebMD_no_disease_found, which has 17 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.58,0.454,0.02,0.382,0.425,0.583,0.804,0.642,0.676,0.327
average_recall,0.71,0.772,0.012,0.351,0.392,0.502,0.372,0.503,0.459,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.408,0.539,0.509,0.564,0.537,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.497,0.605,0.578,0.738,0.64,0.538
average_M1,0.588,0.765,0.059,0.294,0.353,0.471,0.588,0.647,0.569,0.235
average_M3,0.882,1.0,0.059,0.588,0.529,0.765,0.647,1.0,0.804,0.412
average_M5,0.941,1.0,0.059,0.588,0.588,0.765,0.647,1.0,0.804,0.647
average_position,1.562,1.294,1.0,1.8,1.7,1.385,1.091,1.353,1.276,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,1.062,0.511,0.873,0.815,2.484


In [31]:
displayResults({key:val for key, val in results.items() if 'any' in key})

Results for experiment ignore_common_any_session_failed, which has 14 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.667,0.477,0.024,0.393,0.499,0.551,0.833,0.661,0.682,0.362
average_recall,0.802,0.794,0.014,0.331,0.452,0.479,0.38,0.515,0.458,0.701
average_f1-score,0.728,0.596,0.018,0.359,0.474,0.512,0.522,0.579,0.538,0.477
average_NDCG,0.811,0.896,0.049,0.413,0.596,0.573,0.623,0.745,0.647,0.613
average_M1,0.643,0.857,0.071,0.286,0.429,0.429,0.643,0.643,0.572,0.286
average_M3,0.929,1.0,0.071,0.643,0.643,0.714,0.714,1.0,0.809,0.5
average_M5,1.0,1.0,0.071,0.643,0.714,0.714,0.714,1.0,0.809,0.714
average_position,1.571,1.214,1.0,1.889,1.7,1.4,1.1,1.357,1.286,2.909
average_length (x of gs),1.412,1.857,0.6,0.975,1.077,1.015,0.525,0.845,0.795,2.41


Results for experiment ignore_common_any_no_disease_found, which has 1 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,1.0,0.5,0.333,0.5,1.0,0.5,1.0,0.333,0.611,0.429
average_recall,0.4,0.6,0.2,0.2,0.4,0.4,0.6,0.2,0.4,0.6
average_f1-score,0.571,0.545,0.25,0.286,0.571,0.444,0.75,0.25,0.481,0.5
average_NDCG,0.776,0.919,0.679,0.679,0.776,0.273,0.94,0.679,0.631,0.61
average_M1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M3,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M5,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_position,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,7.0
average_length (x of gs),0.4,1.2,0.6,0.4,0.4,0.8,0.6,0.6,0.667,1.4


Results for experiment ignore_less-common_any_session_failed, which has 2 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.6,0.5,0.333,0.478,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.417,0.25,0.417,0.361,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.492,0.333,0.37,0.398,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.633,0.138,0.648,0.473,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.333,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.667,0.0
average_position,2.0,2.0,0.0,0.0,0.0,1.5,0.0,1.5,1.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,1.417,0.417,1.25,1.028,2.75


Results for experiment ignore_less-common_any_no_disease_found, which has 0 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_recall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_f1-score,,,,,,,,,,
average_NDCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_M5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_position,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
average_length (x of gs),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Results for experiment ignore_all-cases_any_session_failed, which has 16 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,0.596,0.461,0.021,0.365,0.452,0.557,0.792,0.62,0.656,0.327
average_recall,0.723,0.789,0.012,0.31,0.417,0.471,0.364,0.503,0.446,0.634
average_f1-score,0.653,0.582,0.015,0.335,0.434,0.51,0.499,0.555,0.521,0.431
average_NDCG,0.739,0.869,0.042,0.365,0.528,0.581,0.563,0.733,0.626,0.543
average_M1,0.562,0.75,0.062,0.25,0.375,0.438,0.562,0.625,0.542,0.25
average_M3,0.875,1.0,0.062,0.562,0.562,0.75,0.625,1.0,0.792,0.438
average_M5,0.938,1.0,0.062,0.562,0.625,0.75,0.625,1.0,0.792,0.625
average_position,1.6,1.312,1.0,1.889,1.7,1.417,1.1,1.375,1.297,2.909
average_length (x of gs),1.496,1.906,0.6,1.021,1.189,1.066,0.511,0.896,0.824,2.452


Results for experiment ignore_all-cases_any_no_disease_found, which has 1 cases, is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,MA,NJ,TH,average_doctor,WebMD
average_precision,1.0,0.5,0.333,0.5,1.0,0.5,1.0,0.333,0.611,0.429
average_recall,0.4,0.6,0.2,0.2,0.4,0.4,0.6,0.2,0.4,0.6
average_f1-score,0.571,0.545,0.25,0.286,0.571,0.444,0.75,0.25,0.481,0.5
average_NDCG,0.776,0.919,0.679,0.679,0.776,0.273,0.94,0.679,0.631,0.61
average_M1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M3,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_M5,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,0.0
average_position,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.667,7.0
average_length (x of gs),0.4,1.2,0.6,0.4,0.4,0.8,0.6,0.6,0.667,1.4


Now let us combine the individual results of the apps.

In [32]:
combinedResults = {}
for label, keys in combineLabels.items():
    collectedResults = {}
    for key in keys:
        result = results[key]
        app = key.split('_')[2]
        collectedResults[app] = result.loc[:,app]
    combinedResults[label] = pd.DataFrame(
        collectedResults,
        index=[
            f"average_{x}" for x in [
                "precision", "recall", "f1-score", "NDCG", "M1", "M3", "M5", "position", "length (x of gs)"
            ]
        ]
        )

In [33]:
displayResults(combinedResults,printNumCases=False)

Results for experiment ignore_common_session_failed is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.644,0.468,0.022,0.411,0.499,0.36
average_recall,0.782,0.774,0.013,0.376,0.452,0.721
average_f1-score,0.706,0.583,0.016,0.393,0.474,0.48
average_NDCG,0.812,0.892,0.045,0.452,0.596,0.603
average_M1,0.667,0.867,0.067,0.333,0.429,0.267
average_M3,0.933,1.0,0.067,0.667,0.643,0.467
average_M5,1.0,1.0,0.067,0.667,0.714,0.733
average_position,1.533,1.2,1.0,1.8,1.7,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,2.449


Results for experiment ignore_common_no_disease_found is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.644,0.468,0.333,0.561,0.466,0.36
average_recall,0.782,0.774,0.2,0.512,0.422,0.721
average_f1-score,0.706,0.583,0.25,0.535,0.443,0.48
average_NDCG,0.812,0.892,0.679,0.616,0.557,0.603
average_M1,0.667,0.867,1.0,0.455,0.4,0.267
average_M3,0.933,1.0,1.0,0.909,0.6,0.467
average_M5,1.0,1.0,1.0,0.909,0.667,0.733
average_position,1.533,1.2,1.0,1.8,1.7,3.0
average_length (x of gs),1.418,1.833,0.6,1.023,1.077,2.449


Results for experiment ignore_less-common_session_failed is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,0.0
average_position,2.0,2.0,0.0,0.0,0.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,2.75


Results for experiment ignore_less-common_no_disease_found is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.1,0.35,0.0,0.167,0.125,0.083
average_recall,0.167,0.75,0.0,0.167,0.167,0.167
average_f1-score,0.125,0.477,,0.167,0.143,0.111
average_NDCG,0.235,0.678,0.0,0.027,0.053,0.053
average_M1,0.0,0.0,0.0,0.0,0.0,0.0
average_M3,0.5,1.0,0.0,0.0,0.0,0.0
average_M5,0.5,1.0,0.0,0.0,0.0,0.0
average_position,2.0,2.0,0.0,0.0,0.0,0.0
average_length (x of gs),2.083,2.25,0.0,1.25,1.917,2.75


Results for experiment ignore_all-cases_session_failed is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.58,0.454,0.02,0.382,0.452,0.327
average_recall,0.71,0.772,0.012,0.351,0.417,0.656
average_f1-score,0.638,0.572,0.015,0.366,0.434,0.436
average_NDCG,0.744,0.867,0.04,0.402,0.528,0.538
average_M1,0.588,0.765,0.059,0.294,0.375,0.235
average_M3,0.882,1.0,0.059,0.588,0.562,0.412
average_M5,0.941,1.0,0.059,0.588,0.625,0.647
average_position,1.562,1.294,1.0,1.8,1.7,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,2.484


Results for experiment ignore_all-cases_no_disease_found is


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
average_precision,0.58,0.454,0.333,0.5,0.425,0.327
average_recall,0.71,0.772,0.2,0.459,0.392,0.656
average_f1-score,0.638,0.572,0.25,0.479,0.408,0.436
average_NDCG,0.744,0.867,0.679,0.526,0.497,0.538
average_M1,0.588,0.765,1.0,0.385,0.353,0.235
average_M3,0.882,1.0,1.0,0.769,0.529,0.412
average_M5,0.941,1.0,1.0,0.769,0.588,0.647
average_position,1.562,1.294,1.0,1.8,1.7,3.0
average_length (x of gs),1.496,1.882,0.6,1.058,1.189,2.484
