## Combbine all diagnosis

This method take diagnosis from images, clinical, and diagnosis sheet, and creates one ground truth (where all three agree) and one majority vote (where two agree) diagnosis files. 

In [43]:
import pandas as pd
import math
clinical = pd.read_csv("ADSP_PHC_COGN.csv").rename(columns={"PHASE":"Phase"})
#this file is the metadata file that one can get from downloading MRI images from ADNI
img = pd.read_csv("metadata.csv")
comb = pd.read_csv("DXSUM_PDXCONV_ADNIALL.csv").rename(columns={"PHASE":"Phase"})[["RID", "PTID" , "Phase"]]

In [44]:
def read_diagnose(file_path: str = 'DXSUM_PDXCONV_ADNIALL.csv', verbose=False):
    # Read diagnostic summary
    diagnostic_summary = pd.read_csv(file_path, index_col='PTID')
    diagnostic_summary = diagnostic_summary.sort_values(by=["update_stamp"], ascending=True)
    # Create dictionary
    diagnostic_dict: dict = {}
    for key, data in diagnostic_summary.iterrows():
        # Iterate for each row of the document
        phase: str = data['PHASE']
        diagnosis: float = -1.
        if phase == "ADNI1":
            diagnosis = data['DIAGNOSIS']
        elif phase == "ADNI2" or phase == "ADNIGO":
            dxchange = data['DIAGNOSIS']
            if dxchange == 1 or dxchange == 7 or dxchange == 9:
                diagnosis = 1.
            if dxchange == 2 or dxchange == 4 or dxchange == 8:
                diagnosis = 2.
            if dxchange == 3 or dxchange == 5 or dxchange == 6:
                diagnosis = 3.
        elif phase == "ADNI3":
            diagnosis = data['DIAGNOSIS']
        elif phase == "ADNI4":
            diagnosis = data['DIAGNOSIS']
        else:
            print(f"ERROR: Not recognized study phase {phase}")
            exit(1)
        # Update dictionary
        if not math.isnan(diagnosis):
            diagnostic_dict[key] = diagnosis
    if verbose:
        print_diagnostic_dict_summary(diagnostic_dict)
    return diagnostic_dict


def print_diagnostic_dict_summary(diagnostic_dict: dict):
    print(f"Number of diagnosed patients: {len(diagnostic_dict.items())}\n")
    n_NL = 0
    n_MCI = 0
    n_AD = 0
    for (key, data) in diagnostic_dict.items():
        if data == 1:
            n_NL += 1
        if data == 2:
            n_MCI += 1
        if data == 3:
            n_AD += 1
    print(f"Number of NL patients: {n_NL}\n"
          f"Number of MCI patients: {n_MCI}\n"
          f"Number of AD patients: {n_AD}\n")

In [45]:
d = read_diagnose()
print_diagnostic_dict_summary(d)

Number of diagnosed patients: 3038

Number of NL patients: 1126
Number of MCI patients: 1013
Number of AD patients: 899



In [46]:
new = pd.DataFrame.from_dict(d, orient='index').reset_index()

In [5]:
clinical.head()

Unnamed: 0,Phase,PTID,RID,VISCODE,VISCODE2,EXAMDATE,DIAGNOSIS,DXNORM,DXNODEP,DXMCI,...,DXODES,DXCONFID,ID,SITEID,USERDATE,USERDATE2,DD_CRF_VERSION_LABEL,LANGUAGE_CODE,HAS_QC_ERROR,update_stamp
0,ADNI1,011_S_0002,2,bl,bl,2005-09-29,1.0,1.0,-4.0,-4.0,...,-4.0,4.0,2,107,2005-10-01,,,,,2005-10-01 00:00:00.0
1,ADNI1,011_S_0003,3,bl,bl,2005-09-30,3.0,-4.0,-4.0,-4.0,...,-4.0,3.0,4,107,2005-10-01,,,,,2005-10-01 00:00:00.0
2,ADNI1,011_S_0005,5,bl,bl,2005-09-30,1.0,1.0,-4.0,-4.0,...,-4.0,4.0,6,107,2005-10-01,,,,,2005-10-01 00:00:00.0
3,ADNI1,011_S_0008,8,bl,bl,2005-09-30,1.0,1.0,-4.0,-4.0,...,-4.0,3.0,8,107,2005-10-01,,,,,2005-10-01 00:00:00.0
4,ADNI1,022_S_0007,7,bl,bl,2005-10-06,3.0,-4.0,-4.0,-4.0,...,-4.0,4.0,10,10,2005-10-06,,,,,2005-10-06 00:00:00.0


In [6]:
clinical["year"] = clinical["EXAMDATE"].str[:4]
# clinical.head()

In [7]:
clinical["Subject"] = clinical["PTID"].str.replace("ADNI_", "").str.replace("s", "S")


In [8]:
c = comb.merge(clinical, on = ["RID", "Phase"])

In [9]:
c = c.drop("Subject", axis =1)

In [10]:
c = c.rename(columns = {"PTID":"Subject"})

In [11]:
img["year"] = img["SCANDATE"].str[5:].str.replace("/", "")

In [12]:
img = img.replace(["CN", "MCI", "AD"], [ 0, 1, 2])

In [13]:
# c["DX"] = c["DX"] -1

In [64]:
new[0] = new[0].astype(int) -1

In [66]:
new = new.rename(columns = {"index":"Subject", 0:"GroupN"})

In [67]:
c.head()
c.drop("PTID_y", axis=1)
c = c.rename(columns={"PTID_x" : "Subject"})
img = img.rename(columns={"SUBJECT" : "Subject"})
img.head()

Unnamed: 0,TYPE,Subject,VISIT,MAGSTRENGTH,SEQUENCE,SCANDATE,STUDYID,SERIESID,IMAGEUID
0,Original,037_S_1421,ADNI Screening,1.5,FSE PD/T2,2007-08-27,11613,38600,70852
1,Original,037_S_1421,ADNI Screening,1.5,FSE PD/T2,2007-08-27,11613,38600,70853
2,Original,037_S_1421,ADNI Screening,1.5,MP-RAGE REPEAT,2007-08-27,11613,38601,70854
3,Original,037_S_1421,ADNI Screening,1.5,MP-RAGE,2007-08-27,11613,38602,70855
4,Original,037_S_1421,ADNI1/GO Month 6,1.5,FSE PD/T2,2008-02-20,14713,45954,91143


In [68]:
m = new.merge(c, on = "Subject", how = "outer").merge(img, on = "Subject", how = "outer")

In [69]:
m.columns

Index(['Subject', 'GroupN', 'RID', 'Phase', 'PTID_y', 'VISCODE', 'VISCODE2',
       'EXAMDATE', 'DIAGNOSIS', 'DXNORM', 'DXNODEP', 'DXMCI', 'DXMDES',
       'DXMPTR1', 'DXMPTR2', 'DXMPTR3', 'DXMPTR4', 'DXMPTR5', 'DXMPTR6',
       'DXMDUE', 'DXMOTHET', 'DXDSEV', 'DXDDUE', 'DXAD', 'DXAPP', 'DXAPROB',
       'DXAPOSS', 'DXPARK', 'DXPDES', 'DXPCOG', 'DXPATYP', 'DXDEP', 'DXOTHDEM',
       'DXODES', 'DXCONFID', 'ID', 'SITEID', 'USERDATE', 'USERDATE2',
       'DD_CRF_VERSION_LABEL', 'LANGUAGE_CODE', 'HAS_QC_ERROR', 'update_stamp',
       'year', 'TYPE', 'VISIT', 'MAGSTRENGTH', 'SEQUENCE', 'SCANDATE',
       'STUDYID', 'SERIESID', 'IMAGEUID'],
      dtype='object')

In [72]:
m[["GroupN", "DIAGNOSIS"]]

Unnamed: 0,GroupN,DIAGNOSIS
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,1.0,1.0
...,...,...
3019000,,
3019001,,
3019002,,
3019003,,


In [73]:
m = m[["Subject", "GroupN", "DIAGNOSIS", "Phase"]].drop_duplicates()

In [74]:
m = m.dropna(subset = ["GroupN", "DIAGNOSIS"], how="all").drop_duplicates()
m

Unnamed: 0,Subject,GroupN,DIAGNOSIS,Phase
0,011_S_0002,1.0,1.0,ADNI1
54,011_S_0002,1.0,1.0,ADNIGO
60,011_S_0002,1.0,1.0,ADNI2
66,011_S_0002,1.0,2.0,ADNI2
210,011_S_0002,1.0,2.0,ADNI3
...,...,...,...,...
3018736,033_S_10099,0.0,1.0,ADNI4
3018737,941_S_10103,1.0,2.0,ADNI4
3018738,135_S_10097,1.0,2.0,ADNI4
3018739,019_S_10102,1.0,2.0,ADNI4


In [22]:
# m.loc[m["DIAGNOSIS"].isna() & m["Group"].isna(), "Group"] = m.loc[m["DX"].isna() & m["Group"].isna(), "GroupN"]
# m.loc[m["DIAGNOSIS"].isna() & m["Group"].isna(), "DX"] = m.loc[m["DX"].isna() & m["Group"].isna(), "GroupN"]

In [78]:
# m1 = m[m["GroupN"] == m["Group"]]
m3 = m[m["GroupN"] == m["DIAGNOSIS"]]
# m4 = m[m["Group"] == m["DIAGNOSIS"]]
# m2 = m1[m1["Group"] == m1["DX"]]

In [24]:
# m1 = m1[["Subject", "GroupN", "Group", "DX", "Phase"]]
# m1

In [25]:
# m1.loc[m1["DX"].isna(), "DX"] = m1.loc[m1["DX"].isna(), "Group"]

In [26]:
# m3 = m3[["Subject", "GroupN", "Group", "DX", "Phase"]]
# m3

In [27]:
# m3.loc[m3["Group"].isna(), "Group"] = m3.loc[m3["Group"].isna(), "GroupN"]

In [28]:
# m4 = m4[["Subject", "GroupN", "Group", "DX", "Phase"]]
# m4

In [29]:
# m4[m4["GroupN"] != m4["DX"]]

In [30]:
# m2[["Subject", "GroupN", "Group", "DX", "Phase"]]

In [31]:
# m5 = pd.concat([m1,m3,m4])
# i = m5[m5["Group"] == m5["GroupN"]]
# i = i[i["Group"] == i["DX"]]

In [32]:
# i = i.drop_duplicates()

In [33]:
# i

In [34]:
# i[["Subject", "Group", "Phase"]].to_csv("ground_truth.csv")

In [35]:
# m.update(m5[~m5.index.duplicated(keep='first')])

In [36]:
# indexes = m.index

In [37]:
# #if none of the three diagnosis agree, then we set the value to -1
# m["GROUP"] = -1

In [38]:
# for i in indexes:
#     row = m.loc[i]
#     if (row["GroupN"] == row["Group"]):
#         val = row["GroupN"]
        
#         m.loc[i, "GROUP"] = val
#     elif (row["GroupN"] == row["DX"]):
#         val = row["GroupN"]
#         m.loc[i, "GROUP"] = val
        
#     elif (row["Group"] == row["DX"]):
#         val = row["Group"]
#         m.loc[i, "GROUP"] = val
        

In [39]:
# m5 = m5[~m5.index.duplicated(keep='first')]
# m5

In [40]:
# m[m["GROUP"] != -1]

In [82]:
m[["Subject", "GroupN", "DIAGNOSIS", "Phase"]].to_csv("diagnosis_full.csv")