# Initial results

In [48]:
import pandas as pd
# Parse the CSV by hand: the first line holds the column labels and every
# following line is one comma-separated record.
with open("sample.csv", "r") as source:
    labels = source.readline().strip().split(",")
    data = []
    for raw_line in source:
        data.append(raw_line.strip().split(","))

# Empty cells are replaced by None after the frame is built.
df = pd.DataFrame(data, columns=labels)
df = df.replace({"": None})
df


Unnamed: 0,Gold standard ddx,Avey,Ada,Symptomate
0,Acute bronchitis,Acute bronchitis,Acute bronchitis,Covid-19
1,Common cold,Pulmonary edema,Common cold,Asthma
2,Asthma,Common cold,Pneumonia,Acute bronchitis
3,Influenza,Pericardial effusion,Acute laryngitis,Cardiac tamponade
4,,Influenza,,Pneumonia
5,,Mitral regurgitation,,Pulmonary edema


Let us calculate the precision, recall, F1-score, and NDCG of each candidate column against the gold-standard column.

In [74]:
import math
def getPrecision(goldStandard:pd.Series, candidate:pd.Series) -> float:
    """Return the fraction of candidate diagnoses found in the gold standard.

    Missing entries (``None``/NaN) in either series are ignored.

    Args:
        goldStandard: Reference diagnoses.
        candidate: Diagnoses proposed by the system being evaluated.

    Returns:
        ``true positives / number of non-missing candidates``, or ``0.0``
        when the candidate has no non-missing entries.
    """
    predicted = candidate.dropna()
    if predicted.empty:
        # Nothing was predicted: report 0.0 instead of dividing by zero.
        return 0.0
    # Build the lookup once rather than scanning the gold values for every
    # candidate entry.
    relevant = set(goldStandard.dropna())
    tp = sum(1 for disease in predicted if disease in relevant)
    return tp / len(predicted)


def getRecall(goldStandard: pd.Series, candidate: pd.Series) -> float:
    """Return the fraction of gold-standard diagnoses found in the candidate.

    Missing entries (``None``/NaN) in either series are ignored.  True
    positives are counted over the gold-standard entries, so a diagnosis that
    the candidate repeats is credited only once and the result never
    exceeds 1.

    Args:
        goldStandard: Reference diagnoses.
        candidate: Diagnoses proposed by the system being evaluated.

    Returns:
        ``true positives / number of non-missing gold entries``, or ``0.0``
        when the gold standard has no non-missing entries.
    """
    relevant = goldStandard.dropna()
    if relevant.empty:
        # No reference diagnoses: report 0.0 instead of dividing by zero.
        return 0.0
    predicted = set(candidate.dropna())
    tp = sum(1 for disease in relevant if disease in predicted)
    return tp / len(relevant)

def getF1Score(precision:float, recall: float) -> float:
    """Return the harmonic mean of ``precision`` and ``recall``.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        ``precision + recall`` is zero.
    """
    total = precision + recall
    if total == 0:
        # Both metrics are zero: the harmonic mean is taken to be 0 rather
        # than raising ZeroDivisionError.
        return 0.0
    return 2 * precision * recall / total


def getNDCG(goldStandard: pd.Series, candidate: pd.Series, scores) -> float:
    """Return the normalized discounted cumulative gain of ``candidate``.

    The diagnosis at position ``i`` of ``goldStandard`` has relevance
    ``scores[i]``.  A candidate diagnosis at 1-based position ``p`` with
    relevance ``s`` contributes ``(2**s - 1) / log2(p + 1)``; diagnoses that
    are missing or absent from the gold standard contribute nothing.  A
    diagnosis repeated in ``candidate`` is credited only at its first
    position.

    Args:
        goldStandard: Reference diagnoses, aligned by position with ``scores``.
        candidate: Ranked diagnoses proposed by the system being evaluated.
        scores: Relevance of each gold-standard position.

    Returns:
        The candidate's DCG divided by the ideal DCG (``scores`` ranked in
        descending order), or ``0.0`` when the ideal DCG is zero.

    Raises:
        IndexError: If a matched gold-standard position has no entry in
            ``scores``.
    """
    def discounted_gain(score: float, position: int) -> float:
        return (math.pow(2, score) - 1) / math.log2(position + 1)

    # The ideal ranking lists the most relevant items first, whatever order
    # the caller supplied the scores in.
    ideal = sorted(scores, reverse=True)
    maxDCG = sum(discounted_gain(score, position)
                 for position, score in enumerate(ideal, start=1))
    if maxDCG == 0:
        # No attainable gain (empty or all-zero scores): avoid dividing by zero.
        return 0.0

    # First position of every non-missing gold diagnosis, built once instead
    # of converting and scanning the gold list on every iteration.
    gold_rank = {}
    for rank, disease in enumerate(goldStandard):
        if not pd.isna(disease):
            gold_rank.setdefault(disease, rank)

    dcg = 0.0
    credited = set()
    for position, disease in enumerate(candidate, start=1):
        if pd.isna(disease) or disease in credited or disease not in gold_rank:
            continue
        credited.add(disease)
        dcg += discounted_gain(scores[gold_rank[disease]], position)

    return dcg / maxDCG

def getScoresCase(case)->pd.DataFrame:
    """Score every candidate column of ``case`` against its first column.

    The first column of ``case`` is the gold standard; each remaining column
    is a candidate list.  The gold-standard entries receive descending
    relevance scores for NDCG: the first entry gets the number of non-missing
    gold entries and the last scored entry gets 1.

    Args:
        case: DataFrame whose first column is the gold standard and whose
            other columns are the candidates to evaluate.

    Returns:
        DataFrame with one column per candidate and the rows ``precision``,
        ``recall``, ``f1-score`` and ``NDCG``.
    """
    # Work on the frame that was passed in rather than on module globals, and
    # take the candidate count from the frame instead of hard-coding three.
    gold = case.iloc[:, 0]
    candidates = [case.iloc[:, i] for i in range(1, case.shape[1])]
    relevance = list(range(gold.count(), 0, -1))

    precisions = [getPrecision(gold, candidate) for candidate in candidates]
    recalls = [getRecall(gold, candidate) for candidate in candidates]
    f1_scores = [getF1Score(p, r) for p, r in zip(precisions, recalls)]
    ndcgs = [getNDCG(gold, candidate, relevance) for candidate in candidates]

    return pd.DataFrame(
        [precisions, recalls, f1_scores, ndcgs],
        columns=list(case.columns[1:]),
        index=["precision", "recall", "f1-score", "NDCG"],
    )


# Compute the metric table for the sample frame loaded above.
getScoresCase(df)


[4, 3, 2, 1] [15.0, 0, 3.5, 0, 0.38685280723454163, 0] 21.347184833073598
[4, 3, 2, 1] [15.0, 4.4165082750002025, 0, 0, 0, 0] 21.347184833073598
[4, 3, 2, 1] [0, 1.8927892607143724, 7.5, 0, 0, 0] 21.347184833073598


Unnamed: 0,Avey,Ada,Symptomate
precision,0.5,0.5,0.333333
recall,0.75,0.5,0.5
f1-score,0.6,0.5,0.4
NDCG,0.884747,0.909558,0.440001
