# Initial results

In [17]:
import pandas as pd
def _splitCases(lines):
    """Group the lines of the export into ``{case id: [lines of that case]}``.

    A case starts at a line containing "Case"; its id is that line with
    ";;;;" removed and surrounding whitespace stripped. The lines that follow
    belong to the case until a line containing ";;;;" (or the end of the
    input) is reached. Cases that collected no lines are dropped.
    """
    grouped = {}
    caseId = None  # None while scanning for the next header line
    caseLines = []
    for line in lines:
        if caseId is not None and ";;;;" not in line:
            caseLines.append(line)
            continue

        # Either no case is open, or a ";;;;" line has just closed the open one.
        if caseId is not None and caseLines:
            grouped[caseId] = caseLines

        # The closing line may itself be the header of the next case.
        if "Case" in line:
            caseId = line.replace(";;;;", "").strip()
            caseLines = []
        else:
            caseId = None

    # The last case runs to the end of the input without a closing line.
    if caseId is not None and caseLines:
        grouped[caseId] = caseLines
    return grouped


# Raw lines of every case, keyed by case id (named caseId inside the helper so
# the builtin ``id`` is no longer shadowed at module scope).
with open("sample.csv", "r") as file:
    data = _splitCases(file)
    

def getCase(lines):
    """Build a DataFrame from the raw lines of one case.

    ``lines[0]`` supplies the ``;``-separated column labels and every later
    line is one ``;``-separated row. Cell text is lower-cased and stripped,
    and empty strings are replaced through ``.replace({"": None})``.
    """
    columnNames = lines[0].strip().split(";")

    rows = []
    for rawLine in lines[1:]:
        cells = rawLine.strip().split(";")
        rows.append([cell.lower().strip() for cell in cells])

    frame = pd.DataFrame(rows, columns=columnNames)
    return frame.replace({"": None})



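Let us calculate the precision and the recall. Before scoring, we load the list of diseases known to Avey and flag every case whose gold-standard differential contains a disease outside that list.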

In [18]:
import json

# Only the first mLength gold-standard rows of a case are checked against the
# Avey disease list; the value is also used to name the output directory.
mLength = 10
with open("diseases.json","r") as file:
    diseasesPresentInAvey = set(json.load(file))

# Keyed by case id because the scoring cells iterate over ``cases.items()``.
# The dict is filled inside the loop so that cases reported as "ignored" are
# really left out (previously ``cases`` was rebuilt from all of ``data``
# afterwards, which silently undid the filter and parsed every case twice).
cases = {}
for caseId, case in data.items():
    caseDataFrame = getCase(case)

    # Gold-standard diagnoses are in the first column of the case table.
    caseDataFrameSet = set(disease.lower()
                           for disease in caseDataFrame.iloc[:mLength, 0]
                           if disease is not None)

    missingDiseases = caseDataFrameSet - diseasesPresentInAvey
    if missingDiseases:
        print(f"{caseId} was ignored as {missingDiseases} was not found")
        continue

    cases[caseId] = caseDataFrame

# Display the first retained case as the cell output.
list(cases.values())[0]


Case # 4 Resp was ignored as {'chronic bronchitis', 'emphysema'} was not found
Case # 5 Cardio was ignored as {'mitral regurgitation'} was not found
Case # 8 Cardio was ignored as {'systemic lupus erythromatosus', 'transient synovitis', 'juvenile idiopathic arthritis'} was not found
Case # 9 Cardio was ignored as {'left heart failure'} was not found
Case # 10 GI was ignored as {'renal colic'} was not found
Case # 21 Hematology was ignored as {'antiphospholipid syndrome', 'acquired thrombophilia', 'uterine abnormalities'} was not found
Case # 22 Hematology was ignored as {'myelodysplastic syndrome', 'aplastic anemia', 'myeloproliferative neoplasm'} was not found
Case # 23 Resp was ignored as {'severe combined immunodeficiency', 'primary ciliary dyskinesia', 'cystic fibrosis'} was not found
Case # 24 Resp was ignored as {'covid-19'} was not found
Case # 25 Resp was ignored as {'upper airway narrowing', 'gerd'} was not found
Case # 26 GI was ignored as {'chronic gastritis'} was not found


Unnamed: 0,Gold standard ddx,Avey,Ada,Babylon,K health
0,acute bronchitis,pulmonary edema,acute bronchitis,,upper respiratory infection
1,asthma,acute bronchitis,common cold,,asthma
2,common cold,pericardial effusion,pneumonia,,acute bronchitis
3,influenza,common cold,acute laryngitis,,pneumonia
4,,asthma,,,
5,,coronavirus disease 2019,,,


In [None]:
import math
def getPrecision(goldStandard:pd.Series, candidate:pd.Series) -> float:
    """Share of the candidate's non-null entries that occur in the gold standard.

    Candidate entries that are ``None`` never count as a match. When nothing
    matches, the match count (0) is returned directly, so the division by the
    candidate's non-null count is skipped.
    """
    goldValues = goldStandard.values
    hits = 0
    for disease in candidate:
        if disease is None:
            continue
        if disease in goldValues:
            hits += 1

    if hits == 0:
        return hits
    return hits / candidate.count()


def getRecall(goldStandard: pd.Series, candidate: pd.Series) -> float:
    """Share of the gold-standard entries that the candidate list contains.

    Each distinct candidate disease is credited at most once, so a disease
    repeated in the candidate list cannot push the recall above 1. ``None``
    entries in the candidate are never counted as matches.

    Returns NaN when the gold standard has no non-null entries.
    """
    goldValues = goldStandard.values
    found = {disease for disease in candidate
             if disease is not None and disease in goldValues}

    goldCount = goldStandard.count()
    if goldCount == 0:
        # Recall is undefined without any gold-standard diagnosis.
        return float("nan")
    return len(found) / goldCount

def getF1Score(precision:float, recall: float) -> float:
    """Harmonic mean of precision and recall; NaN when their sum is zero."""
    total = precision + recall
    if total == 0:
        return math.nan
    return 2*precision*recall/total


def getNDCG(goldStandard: pd.Series, candidate: pd.Series, scores) -> float:
    """Normalised discounted cumulative gain of ``candidate`` against the gold standard.

    ``scores[i]`` is the relevance of the i-th gold-standard entry. The ideal
    DCG places ``scores`` at ranks 1..len(scores) in the order given. The
    candidate's DCG credits every candidate entry found in the gold standard
    with that entry's relevance, discounted by the candidate rank.

    A disease is credited only on its first occurrence in ``candidate``, and a
    gold-standard entry that has no corresponding score contributes nothing.
    ``None`` candidate entries are skipped.

    Returns NaN when the ideal DCG is zero (for example when ``scores`` is empty).
    """
    def discount(score:float,index:int)->float:
        # Gain 2**score - 1, discounted by log2(rank + 1); ``index`` is the 1-based rank.
        return (math.pow(2,score)-1)/math.log2(index+1)

    maxDCG = sum(discount(score, rank) for rank, score in enumerate(scores, start=1))
    if maxDCG == 0:
        # Nothing to normalise against; NaN is skipped by the averaging step.
        return math.nan

    goldList = list(goldStandard)  # built once; needed for .index()
    credited = set()
    dcg = 0.0
    for rank, disease in enumerate(candidate, start=1):
        if disease is None or disease in credited or disease not in goldList:
            continue
        goldIndex = goldList.index(disease)
        if goldIndex >= len(scores):
            # The gold entry sits beyond the scored positions (e.g. after a
            # gap in the gold column), so it carries no relevance.
            continue
        credited.add(disease)
        dcg += discount(scores[goldIndex], rank)

    return dcg/maxDCG

def getMScore(goldStandard:pd.Series,candidate:pd.Series,m=1) -> bool:
    """Whether the first gold-standard entry is among the candidate's first ``m`` entries.

    Returns False when the first gold-standard entry is ``None``, so that it
    cannot match ``None`` padding in the candidate column.
    """
    topDiagnosis = goldStandard.values[0]
    if topDiagnosis is None:
        return False
    return topDiagnosis in candidate.values[:m]

def getPosition(goldStandard:pd.Series,candidate:pd.Series) -> float:
    """1-based position of the first gold-standard entry within ``candidate``.

    Returns NaN when that entry does not occur in ``candidate``, or when it is
    ``None`` (so it cannot match ``None`` padding in the candidate column).
    """
    topDiagnosis = goldStandard.values[0]
    candidateValues = candidate.values
    if topDiagnosis is None or topDiagnosis not in candidateValues:
        return math.nan
    return 1 + list(candidateValues).index(topDiagnosis)

def getScoresCase(case:pd.DataFrame)->pd.DataFrame:
    """Score every candidate column of ``case`` against its first column.

    The first column is used as the gold standard; each remaining column is
    one candidate. The result has one column per candidate and the rows
    precision, recall, f1-score, NDCG, M1, M3, M5 and position. NDCG relevance
    values descend from the gold standard's non-null count down to 1.
    """
    gold = case.iloc[:, 0]
    candidates = [case.iloc[:, column] for column in range(1, len(case.columns))]

    precisionRow = [getPrecision(gold, candidate) for candidate in candidates]
    recallRow = [getRecall(gold, candidate) for candidate in candidates]
    f1Row = [getF1Score(p, r) for p, r in zip(precisionRow, recallRow)]

    ndcgRow = [getNDCG(gold, candidate, list(range(gold.count(), 0, -1)))
               for candidate in candidates]

    # Top-1, top-3 and top-5 hit flags.
    topMRows = [[getMScore(gold, candidate, m) for candidate in candidates]
                for m in (1, 3, 5)]

    positionRow = [getPosition(gold, candidate) for candidate in candidates]

    table = [precisionRow, recallRow, f1Row, ndcgRow, *topMRows, positionRow]
    rowLabels = ["precision","recall","f1-score","NDCG","M1","M3","M5","position"]
    return pd.DataFrame(table, columns=case.columns[1:], index=rowLabels)


# Sanity check: display the metric table of the first case in `cases`.
getScoresCase(list(cases.values())[0])


In [None]:
# Metric table of every case, keyed by case id.
scores = {
    caseId: getScoresCase(caseFrame)
    for caseId, caseFrame in cases.items()
}

def getAverage(scores,row:int,col:int)->float:
    """Mean of cell ``(row, col)`` across all score tables, rounded to 3 decimals.

    NaN cells are left out of the mean. Returns 0 when no table contributes a
    usable value.
    """
    kept = []
    for table in scores.values():
        cell = table.iloc[row, col]
        if math.isnan(cell):
            continue
        kept.append(cell)

    if not kept:
        return 0
    return round(sum(kept)/len(kept),3)

numberOfCases = len(cases)

# Every score table shares the same columns, so the first one supplies them.
_systemColumns = next(iter(scores.values())).columns
_metricLabels = ["precision", "recall", "f1-score", "NDCG", "M1", "M3", "M5","position"]

# One row per metric, one column per system, averaged over all cases.
averageScores = pd.DataFrame(
    [
        [getAverage(scores, row, col) for col in range(len(_systemColumns))]
        for row in range(len(_metricLabels))
    ],
    columns=_systemColumns,
    index=[f"average_{x}" for x in _metricLabels],
)


averageScores



In [None]:
from pathlib import Path


def _metricTable(allScores, row: int) -> pd.DataFrame:
    """Collect row ``row`` of every score table into one cases-by-systems frame.

    Each value is rounded to 3 decimals. Rows follow the iteration order of
    ``allScores`` and are numbered from 1; columns are taken from the first
    score table.
    """
    systems = next(iter(allScores.values())).columns
    return pd.DataFrame(
        [[round(score.iloc[row, col], 3) for col in range(len(systems))]
         for score in allScores.values()],
        columns=systems, index=range(1, len(allScores) + 1))


# Row positions follow the index order of the per-case score tables:
# precision, recall, f1-score, NDCG, M1, M3, M5, position.
precision = _metricTable(scores, 0)
recall = _metricTable(scores, 1)
f_score = _metricTable(scores, 2)
ndcg = _metricTable(scores, 3)
m1 = _metricTable(scores, 4)
m3 = _metricTable(scores, 5)
m5 = _metricTable(scores, 6)
position = _metricTable(scores, 7)

# Create the output directory up front; open(..., "w") does not create it.
outputDir = Path(f"m{mLength}")
outputDir.mkdir(parents=True, exist_ok=True)

for fileName, table in [
    ("precision", precision),
    ("recall", recall),
    ("f_score", f_score),
    ("ndcg", ndcg),
    ("m1", m1),
    ("m3", m3),
    ("m5", m5),
    ("position", position),
]:
    with open(outputDir / f"{fileName}.csv", "w") as file:
        table.to_csv(file, sep=";", index=False)
