In [33]:
# General imports
import os
import pandas as pd
import glob
import csv
import json

In [None]:
# Machine learning imports
from sklearn.model_selection import GroupKFold # need to group by patient id across modalities

## Motum dataset info
- radiomics are stored in `motum/derivatives/*/*_radiomics.csv`
    - there is one for each modality (t1,t1ce,t2,flair) for each patient (e.g. sub-0001)
- corresponding json w scanner info in `motum/derivatives/*/*_params.json`
    - similar to above there is one for each modality (t1,t1ce,t2,flair) for each patient 
- clinical metadata is stored in `motum/Participants.xlsx`

GOAL:
- can we distinguish between gliomas (cancers originating in the brain) and brain metastasis

### Participant metadata

In [None]:
participants_file = "../motum/Participants.xlsx"
md = pd.read_excel(participants_file)
print("Shape:", md.shape)

# if not origin in the brain, glioma else metastasis
# manually checked to confirm
#  '/' for [Oligodendroglioma, Astrocytoma, Glioblastoma]

md["label"] = md["Origin"].apply(lambda x: 'glioma' if x == '/' else 'metastasis')
print("Shape:", md.shape)
md.head()


Shape: (67, 18)
Shape: (67, 19)


Unnamed: 0,ID,Sex,Age at MRI,WHO CNS Grade,Origin,Pathology diagnosis,IDH,GFAP,P53,Ki-67,Olig-2,Surgery or Biopsy,Extent of Resection (EOR),Molecular Result,Image rating: FLAIR,Image rating: T1,Image rating: T1-ce,Image rating: T2,label
0,sub-0001,Male,79,3,/,Oligodendroglioma,wildtype,+,+/-,35%+,+,Surgery,Near total resection (NTR),/,2,2,1,1,glioma
1,sub-0002,Male,54,3,/,Oligodendroglioma,wildtype,+,partially+,30%+,+,Surgery,Subtotal resection (STR),/,2,2,2,2,glioma
2,sub-0003,Male,78,3,/,Oligodendroglioma,mutant,+,+,30%+,+,Surgery,Gross total resection (GTR),/,2,2,2,2,glioma
3,sub-0004,Male,65,3,/,Astrocytoma,wildtype,+,+,10%+,+,Surgery,Subtotal resection (STR),/,2,1,1,1,glioma
4,sub-0005,Male,48,3,/,Oligodendroglioma,mutant,+,partially+,40%+,+,Surgery,Gross total resection (GTR),/,2,2,2,2,glioma


## Compile Radiomics metadata

In [None]:
full_header = [
    "Image",
    "Mask",
    "diagnostics_Versions_PyRadiomics",
    "diagnostics_Versions_Numpy",
    "diagnostics_Versions_SimpleITK",
    "diagnostics_Versions_PyWavelet",
    "diagnostics_Versions_Python",
    "diagnostics_Configuration_Settings",
    "diagnostics_Configuration_EnabledImageTypes",
    "diagnostics_Image-original_Hash",
    "diagnostics_Image-original_Dimensionality",
    "diagnostics_Image-original_Spacing",
    "diagnostics_Image-original_Size",
    "diagnostics_Image-original_Mean",
    "diagnostics_Image-original_Minimum",
    "diagnostics_Image-original_Maximum",
    "diagnostics_Mask-original_Hash",
    "diagnostics_Mask-original_Spacing",
    "diagnostics_Mask-original_Size",
    "diagnostics_Mask-original_BoundingBox",
    "diagnostics_Mask-original_VoxelNum",
    "diagnostics_Mask-original_VolumeNum",
    "diagnostics_Mask-original_CenterOfMassIndex",
    "diagnostics_Mask-original_CenterOfMass",
    "original_shape_Elongation",
    "original_shape_Flatness",
    "original_shape_LeastAxisLength",
    "original_shape_MajorAxisLength",
    "original_shape_Maximum2DDiameterColumn",
    "original_shape_Maximum2DDiameterRow",
    "original_shape_Maximum2DDiameterSlice",
    "original_shape_Maximum3DDiameter",
    "original_shape_MeshVolume",
    "original_shape_MinorAxisLength",
    "original_shape_Sphericity",
    "original_shape_SurfaceArea",
    "original_shape_SurfaceVolumeRatio",
    "original_shape_VoxelVolume",
    "original_firstorder_10Percentile",
    "original_firstorder_90Percentile",
    "original_firstorder_Energy",
    "original_firstorder_Entropy",
    "original_firstorder_InterquartileRange",
    "original_firstorder_Kurtosis",
    "original_firstorder_Maximum",
    "original_firstorder_MeanAbsoluteDeviation",
    "original_firstorder_Mean",
    "original_firstorder_Median",
    "original_firstorder_Minimum",
    "original_firstorder_Range",
    "original_firstorder_RobustMeanAbsoluteDeviation",
    "original_firstorder_RootMeanSquared",
    "original_firstorder_Skewness",
    "original_firstorder_TotalEnergy",
    "original_firstorder_Uniformity",
    "original_firstorder_Variance",
    "original_glcm_Autocorrelation",
    "original_glcm_ClusterProminence",
    "original_glcm_ClusterShade",
    "original_glcm_ClusterTendency",
    "original_glcm_Contrast",
    "original_glcm_Correlation",
    "original_glcm_DifferenceAverage",
    "original_glcm_DifferenceEntropy",
    "original_glcm_DifferenceVariance",
    "original_glcm_Id",
    "original_glcm_Idm",
    "original_glcm_Idmn",
    "original_glcm_Idn",
    "original_glcm_Imc1",
    "original_glcm_Imc2",
    "original_glcm_InverseVariance",
    "original_glcm_JointAverage",
    "original_glcm_JointEnergy",
    "original_glcm_JointEntropy",
    "original_glcm_MCC",
    "original_glcm_MaximumProbability",
    "original_glcm_SumAverage",
    "original_glcm_SumEntropy",
    "original_glcm_SumSquares",
    "original_gldm_DependenceEntropy",
    "original_gldm_DependenceNonUniformity",
    "original_gldm_DependenceNonUniformityNormalized",
    "original_gldm_DependenceVariance",
    "original_gldm_GrayLevelNonUniformity",
    "original_gldm_GrayLevelVariance",
    "original_gldm_HighGrayLevelEmphasis",
    "original_gldm_LargeDependenceEmphasis",
    "original_gldm_LargeDependenceHighGrayLevelEmphasis",
    "original_gldm_LargeDependenceLowGrayLevelEmphasis",
    "original_gldm_LowGrayLevelEmphasis",
    "original_gldm_SmallDependenceEmphasis",
    "original_gldm_SmallDependenceHighGrayLevelEmphasis",
    "original_gldm_SmallDependenceLowGrayLevelEmphasis",
    "original_glrlm_GrayLevelNonUniformity",
    "original_glrlm_GrayLevelNonUniformityNormalized",
    "original_glrlm_GrayLevelVariance",
    "original_glrlm_HighGrayLevelRunEmphasis",
    "original_glrlm_LongRunEmphasis",
    "original_glrlm_LongRunHighGrayLevelEmphasis",
    "original_glrlm_LongRunLowGrayLevelEmphasis",
    "original_glrlm_LowGrayLevelRunEmphasis",
    "original_glrlm_RunEntropy",
    "original_glrlm_RunLengthNonUniformity",
    "original_glrlm_RunLengthNonUniformityNormalized",
    "original_glrlm_RunPercentage",
    "original_glrlm_RunVariance",
    "original_glrlm_ShortRunEmphasis",
    "original_glrlm_ShortRunHighGrayLevelEmphasis",
    "original_glrlm_ShortRunLowGrayLevelEmphasis",
    "original_glszm_GrayLevelNonUniformity",
    "original_glszm_GrayLevelNonUniformityNormalized",
    "original_glszm_GrayLevelVariance",
    "original_glszm_HighGrayLevelZoneEmphasis",
    "original_glszm_LargeAreaEmphasis",
    "original_glszm_LargeAreaHighGrayLevelEmphasis",
    "original_glszm_LargeAreaLowGrayLevelEmphasis",
    "original_glszm_LowGrayLevelZoneEmphasis",
    "original_glszm_SizeZoneNonUniformity",
    "original_glszm_SizeZoneNonUniformityNormalized",
    "original_glszm_SmallAreaEmphasis",
    "original_glszm_SmallAreaHighGrayLevelEmphasis",
    "original_glszm_SmallAreaLowGrayLevelEmphasis",
    "original_glszm_ZoneEntropy",
    "original_glszm_ZonePercentage",
    "original_glszm_ZoneVariance",
    "original_ngtdm_Busyness",
    "original_ngtdm_Coarseness",
    "original_ngtdm_Complexity",
    "original_ngtdm_Contrast",
    "original_ngtdm_Strength",
    "ManufacturerModelName", # added from *param.json
    "RepetitionTime", # added from *param.json
    "EchoTime", # added from *param.json
    "InversionTime", # added from *param.json
    "FlipAngle", # added from *param.json
    "SliceThickness", # added from *param.json
    "PixelSpacing", # added from *param.json
    "subject_id" # based on dir
]


In [None]:
# check for consistency across all jsons - YES all 7
param_files = glob.glob("**/*_param.json", recursive=True)
# print(param_files)
lens = []
for f in param_files:
    with open(f, 'r') as json_file:
        jfile = json.load(json_file)
        lens.append(len(jfile.keys()))

print(set(lens))

{7}


In [None]:
## ----- parse each radiomics.csv and param.json -----
# os.chdir("../motum/derivatives")
clean_dir = "../clean_radiomics_data"
radiomics_files = glob.glob(f"**/*_radiomics.csv", recursive=True)
# print(radiomics_files)

missing_file_count = 0
missing_type = {
    "t1":[],
    "t1ce":[],
    "t2":[],
    "flair":[]
}
# drop duplicate headers and make clean files for each - pd struggles w this
for f in radiomics_files:
    clean_rows = []
    subject_id = f.split('/')[-2]
    filename = os.path.basename(f)
    modality = filename.split('_')[0]

    # from manual inspec, some files don't have any radiomics data
    # ex: sub-0024/t1ce_radiomics.csv only has image and mask
    missing_fields = set()
    with open(f, 'r') as input_file:
        reader = csv.reader(input_file)
        header = next(reader) # treat first row as header to begin with
        for line in reader:
            image_file = row[0].split('/')[-1]
            if modality not in image_file: # single out problematic files (i.e. mismatch modality)
                continue
            if line[0] == 'Image' and len(line) > len(header):
                header = line

        with open(f"{subject_id}/{modality}_param.json", 'r') as json_file:
            jfile = json.load(json_file)

        new_header = header + list(jfile.keys()) + ['subject_id']


        # if len(full_header) != len(new_header) and len(new_header) == 2:
        if len(full_header) != len(new_header):
            missing_fields = set(full_header) ^ set(new_header)


    with open(f, 'r') as input_file:
        reader = csv.reader(input_file)
        # header = next(reader) # handle earlier

        seen = set()
        for row in reader:
            # different types of rows to skip
            if row[0] == 'Image': # header row or duplicate
                continue

            # if len(row) == 2: # just image and masks
            if len(row) != len(header) and f != "sub-0015/t1ce_radiomics.csv": # skip incomplete entries if there are others 
                continue
            
            # handle files like 'sub-0015/t1ce_radiomics.csv' which have some mismatched radiomics
            image_file = row[0].split('/')[-1]
            if modality not in image_file:
                continue

            if missing_fields:
                assert len(row) == 2, (
                    f"file:{f} has fields beside image and mask:\n{row}"
                )
                missing_file_count += 1
                missing_type[modality].append(subject_id)
                header_len = len(header)
                param_len = len(list(jfile.keys()))
                row.extend(["NA"] * (len(missing_fields)))

          
            row_tuple = tuple(row)
            if row_tuple in seen: # skip duplicate rows
                continue
            # need to somehow handle cases where they're ALMOST identical (e.g. radiomics rounding error)
            seen.add(row_tuple)

            # print(len(row))
            # print(len(header))
            # assert len(full_header) == len(row), ( # make sure equivalent no of entries
            #     f"file:{f}\nheader={len(full_header)}, row={len(row)}\n{row}"
            # )
            # assert len(new_header) == len(row), f"{set(map(str, new_header)) ^ set(map(str, row))}"
            clean_rows.append(row)

    if not clean_rows: # in cases where no correct modality radiomics data was found
        clean_rows.extend(["NA"] * len(header))

    elif len(clean_rows) > 1:
        # multiple rows are identical w rounding diff select subsequent ones
        first_entry = len(clean_rows[0])
        if all(len(r) != first_entry for r in clean_rows):
            print(f"duplicate entries differ in length:{clean_rows}")
            max_ind = clean_rows.index(max(clean_rows, key=len))
            clean_row = clean_rows[max_ind]

        else: # stick with last entry
            clean_row = clean_rows[-1]
    else:
        clean_row = clean_rows[0]


    # fix to handle mising values better
    new_vals = [jfile.get(k, '') for k in jfile.keys()] + [subject_id]
    clean_row.extend(new_vals)

    assert len(full_header) == len(clean_row), (
        f"file:{f}\nheader={len(full_header)}, row={len(clean_row)}\n{clean_row}"
            )

    # write out clean_csv
    with open(f"{clean_dir}/{subject_id}_{filename}", "w", newline="") as outfile:
        csv.writer(outfile).writerow(new_header)
        csv.writer(outfile).writerow(clean_row)

print(f"In total {missing_file_count} files are missing corresponding radiomics data")
print(missing_type)
counts_dict = {key: len(value) for key, value in missing_type.items()}
print(counts_dict)

In total 207 files are missing corresponding radiomics data
{'t1': ['sub-0024', 'sub-0024', 'sub-0048', 'sub-0048', 'sub-0057', 'sub-0057'], 't1ce': ['sub-0015', 'sub-0015', 'sub-0015', 'sub-0012', 'sub-0012', 'sub-0012', 'sub-0024', 'sub-0024', 'sub-0024', 'sub-0023', 'sub-0023', 'sub-0023', 'sub-0048', 'sub-0048', 'sub-0048', 'sub-0046', 'sub-0046', 'sub-0046', 'sub-0041', 'sub-0041', 'sub-0041', 'sub-0022', 'sub-0022', 'sub-0022', 'sub-0013', 'sub-0013', 'sub-0013', 'sub-0014', 'sub-0014', 'sub-0014', 'sub-0040', 'sub-0040', 'sub-0040', 'sub-0047', 'sub-0047', 'sub-0047', 'sub-0049', 'sub-0049', 'sub-0049', 'sub-0054', 'sub-0054', 'sub-0054', 'sub-0053', 'sub-0053', 'sub-0053', 'sub-0065', 'sub-0065', 'sub-0065', 'sub-0062', 'sub-0062', 'sub-0062', 'sub-0009', 'sub-0009', 'sub-0009', 'sub-0036', 'sub-0036', 'sub-0036', 'sub-0031', 'sub-0031', 'sub-0031', 'sub-0038', 'sub-0038', 'sub-0038', 'sub-0007', 'sub-0007', 'sub-0007', 'sub-0063', 'sub-0063', 'sub-0063', 'sub-0064', 'sub-0064'

In [None]:
## ----- agg all correpsonding metadata -----
## now referencing clean files, agg diff data
clean_files = glob.glob(f"{clean_dir}/*radiomics.csv") # can FINALLY parse :')
for i in clean_files:
    df_radio = pd.read_csv(i)
    print(df_radio.head())