# Results
In this notebook, we define and calculate the statistics of our tests.

In [1]:
# load gold standard

import json
import pandas as pd
fileName = "gold_standard.json"
goldStandard = {}

with open(fileName,'r',encoding='utf-8') as file:
    data = json.load(file)
    goldStandardRaw = {case["id"]: case["differential_diagnosis"].split("\n") for case in data}

for caseId, case in goldStandardRaw.items():
    goldStandard[caseId] = pd.Series(gs.split("-")[-1].strip().lower() for gs in case)
    
goldStandard['220']


0                      polycythemia vera
1    leukemia (chronic myeloid leukemia)
dtype: object

In [2]:
# load results

import json
import pandas as pd
from collections import defaultdict
fileName = "CaseReviewTest-2021-11-09 (2).json"
results = {}

with open(fileName, 'r', encoding='utf-8') as file:
    data = json.load(file)
    resultsCollected = defaultdict(lambda: dict())
    for test in data:
        resultsCollected[test["case_number"]][test["app"]] = \
            [result.strip().lower() for result in test["content"].split(";")]

    resultsNormalized = defaultdict(lambda: {})
    for caseId, result in resultsCollected.items():
        maxSize = max(len(ddx) for ddx in result.values())
        for app, ddx in result.items():
            resultsNormalized[caseId][app] = ddx + \
                [None] * (maxSize - len(ddx))

for caseId, case in resultsNormalized.items():
    results[caseId] = pd.DataFrame(case, columns=sorted(case.keys()))

results['60']


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
0,endometriosis,adenomyosis,,primary dysmenorrhea,endometriosis,primary dysmenorrhea
1,primary dysmenorrhea,endometriosis,,endometriosis,primary dysmenorrhea,uterine fibroids
2,chronic pelvic pain,primary dysmenorrhea,,uterine fibroids,,endometriosis
3,adenomyosis,secondary dysmenorrhea,,,,pelvic inflammatory disease
4,,,,,,cervicitis
5,,,,,,interstitial cystitis
6,,,,,,irritable bowel syndrome
7,,,,,,fibromyalgia


In [3]:
results['13']

Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
0,chronic bronchitis,chronic obstructive pulmonary disease,chronic bronchitis,chronic obstructive pulmonary disease,chronic obstructive pulmonary disease,bronchitis
1,asthma,asthma,stable angina,chronic bronchitis,asthma,pneumonia
2,chronic obstructive pulmonary disease,valvular heart disease,lung cancer,chronic lung issue,,coronavirus
3,,lung cancer,,,,congestive heart failure
4,,acute bronchitis,,,,asthma
5,,pulmonary edema,,,,chronic obstructive pulmonary disease


In [4]:
# append cold standard to case
cases = dict()
for caseId, gs in goldStandard.items():
    if caseId in results:
        cases[caseId] = pd.DataFrame(results[caseId])
        cases[caseId].insert(loc=0, column="gs", value=gs)


In [5]:
cases['89']

Unnamed: 0,gs,Ada,Avey,Babylon,Buoy,K health,WebMD
0,otitis media,otitis media,otitis media,,coronavirus disease 2019,,otitis media
1,otitis media with effusion,common cold,tonsillopharyngitis,,otitis externa,,swimmer's ear
2,otitis externa,burst ear drum,,,,,adenoidal hypertrophy
3,,mastoiditis,,,,,ruptured eardrum
4,,flu,,,,,acute adenoiditis
5,,,,,,,airplane ear
6,,,,,,,acute sinusitis
7,,,,,,,tonsilitis
8,,,,,,,scarlet fever
9,,,,,,,hearing loss


## Let us define the metrics now.

### Terms used
- TP: True positive
- TN: True negative
- FP: False positive
- FN: False negative

### Precision
$$precision = \frac{TP}{TP + FP} = \frac{TP}{\text{length of differential list}}$$

### Recall
$$recall = \frac{TP}{TP + FN} = \frac{TP}{\text{length of the gold standard}}$$

### F1 Score
Suppose $\beta$ defines how important is $recall$ to $precision$ then,
$$fscore_{\beta} = (1 + \beta^2)\frac{precision \cdot recall}{(\beta^2 \cdot precision) + recall}$$
Substituting $\beta = 1$,
$$fscore_{1} = \frac{2 \cdot precision \cdot recall}{ precision + recall}$$

### NDCG
NDCG or Normalized Discounted Cumulative Gain is measure of how accurate the ranking is. In our calculations, we use
$$DCG = \sum_{i=1}^n\frac{2^{relevance_i}-1}{log_2(i+1)}$$
where $n$ is the number of differentials in the returned list and  
$relevance_i = |gold standard| - rank_{gold\ standard}(ddx[i])$ if $ddx[i]$ is present, 0 otherwise.

$$NDCG = \frac{DCG_{ddx}}{DCG_{gold\ standard}}$$

### M Score
M Score determines where the top disease (gold standard) appears in the returned differential.
$$M_i = \text{gold standard[0]} \in \text{ddx[:i]}$$

### Position
Shows the position of the gold standard[0] in the returned differential. 

### Length
$$length = \frac{|ddx|}{|gold\ standard|}$$


In [8]:
import math


def getPrecision(goldStandard: pd.Series, candidate: pd.Series) -> float:
    tp = sum(int(disease in goldStandard.values and disease is not None)
             for disease in candidate)
    return tp if tp == 0 else tp/candidate.count()


def getRecall(goldStandard: pd.Series, candidate: pd.Series) -> float:
    tp = sum(int(disease in goldStandard.values and disease is not None)
             for disease in candidate)
    return tp/goldStandard.count()


def getF1Score(precision: float, recall: float) -> float:
    return math.nan if precision+recall == 0 else 2*precision*recall/(precision+recall)


def getNDCG(goldStandard: pd.Series, candidate: pd.Series, scores) -> float:
    def discount(score: float, index: int) -> float:
        return (math.pow(2, score)-1)/math.log2(index+1)
        # return (math.pow(2,score)-1)/math.log2(index+1)

    maxDCG = sum(discount(scores[i], i+1) for i in range(len(scores)))

    candidateRelevance = []
    for index, disease in enumerate(candidate):
        goldStandard = list(goldStandard)
        if disease is not None and disease in goldStandard:
            try:
                candidateRelevance.append(
                    discount(scores[goldStandard.index(disease)], index+1))
            except:
                print(goldStandard)
                print(scores)
        else:
            candidateRelevance.append(0)

    return sum(candidateRelevance)/maxDCG


def getMScore(goldStandard: pd.Series, candidate: pd.Series, m=1) -> bool:
    return goldStandard.values[0] in candidate.values[:m]


def getPosition(goldStandard: pd.Series, candidate: pd.Series) -> bool:
    return math.nan if goldStandard.values[0] not in candidate.values else 1 + list(candidate.values).index(goldStandard.values[0])


def getLength(goldStandard: pd.Series, candidate: pd.Series) -> int:
    return math.nan if candidate.count() == 0 else candidate.count()/goldStandard.count()


def getScoresCase(case: pd.DataFrame) -> pd.DataFrame:
    scores = [
        [getPrecision(case.iloc[:, 0], case.iloc[:, i])
         for i in range(1, len(case.columns))],
        [getRecall(case.iloc[:, 0], case.iloc[:, i])
         for i in range(1, len(case.columns))]
    ]

    scores.append([getF1Score(scores[0][i], scores[1][i])
                  for i in range(len(case.columns)-1)])
    scores.append([getNDCG(case.iloc[:, 0], case.iloc[:, i], list(range(case.iloc[:, 0].count(), 0, -1)))
                  for i in range(1, len(case.columns))])

    for m in range(1, 6, 2):
        scores.append([getMScore(case.iloc[:, 0], case.iloc[:, i], m)
                      for i in range(1, len(case.columns))])

    scores.append([getPosition(case.iloc[:, 0], case.iloc[:, i])
                   for i in range(1, len(case.columns))])

    scores.append([getLength(case.iloc[:, 0], case.iloc[:, i])
                   for i in range(1, len(case.columns))])

    return pd.DataFrame(scores, columns=case.columns[1:],
                        index=["precision", "recall", "f1-score", "NDCG", "M1", "M3", "M5", "position", "length"])


getScoresCase(cases['89'])


Unnamed: 0,Ada,Avey,Babylon,Buoy,K health,WebMD
precision,0.2,0.5,0,0.5,0,0.066667
recall,0.333333,0.333333,0.0,0.333333,0.0,0.333333
f1-score,0.25,0.4,,0.4,,0.111111
NDCG,0.745253,0.745253,0.0,0.067172,0.0,0.745253
M1,True,True,False,False,False,True
M3,True,True,False,False,False,True
M5,True,True,False,False,False,True
position,1,1,,,,1
length,1.666667,0.666667,0.333333,0.666667,0.333333,5.0
