In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
from pathlib import Path

In [2]:
images_train = pd.read_csv('../data/raw/train_labels/ProstateX-Images-Train.csv')
ktrans_train = pd.read_csv('../data/raw/train_labels/ProstateX-Images-KTrans-Train.csv')
findings_train = pd.read_csv('../data/raw/train_labels/ProstateX-Findings-Train.csv')

In [3]:
# The usual check for successful conversions
dicom2nifti_success = Path.cwd().parent / 'logs/dicom2nifti_successful.txt'
successful_conv = dicom2nifti_success.read_text()
successful_conv = successful_conv.split('\n')
successful_conv = list(filter(None, successful_conv)) # For sanity - remove any empty string(s)

## Building dataframes for different sequences

In [4]:
root_dir = Path.cwd().parent
nifti = root_dir.joinpath('data/generated/nifti') 

def generate_df_for_sequence(sequence_type):
    """
    This function generates a dataframe for all patients in the dataset. Each row contains
    a string that is analgous to the DCMSerDescr label in the other train files. This string
    is generated from the original filename. The second column contains a path object for the 
    resampled nifti file. This table can be joined to the other training files to create one
    large table with the appropriate sampling information.
    """
    patient_data = {}
    patient_folders = [x for x in nifti.iterdir() if x.is_dir()]
    for patient in patient_folders:
        if patient.stem in successful_conv: 
            sequences = [x for x in patient.iterdir() if x.is_dir()]
            for sequence in sequences:
                if sequence.parts[-1] == sequence_type:
                    for item in sequence.rglob('*.*'):
                        
                        def generate_DCMSerDescr_from_filename():
                            # remove extension from path
                            full_name = item.parts[-1]
                            split = full_name.split('.') 
                            name_without_extension = split[0]

                            # remove first num and underscore from path
                            first_underscore = name_without_extension.find('_') + 1
                            value = name_without_extension[first_underscore:]
                            return value
                        
                        def get_path_to_resampled(sequence_type):
                            nifti_resampled = root_dir.joinpath('data/generated/nifti_resampled')
                            sequence_types = [x for x in nifti_resampled.iterdir() if x.is_dir()]
                            for sequence in sequence_types:
                                # check if directory name contains sequence type
                                if sequence_type in str(sequence):
                                    # then get all files in subdirectory
                                    files = sequence.rglob('*.*')          
                                    for file in files:
                                        # then check if filename contains patient_id
                                        if patient.parts[-1] in str(file): 
                                            path_to_resampled = file
                                            
                            return path_to_resampled
                        
                        DCMSerDescr_fn = generate_DCMSerDescr_from_filename()
                        path_to_resampled = get_path_to_resampled(sequence_type)
                        
                        key = patient.parts[-1] # patient_ID
                        value = [DCMSerDescr_fn, path_to_resampled]
                        patient_data[key] = value 
    
    data_frame = pd.DataFrame.from_dict(patient_data, orient = 'index')
    data_frame = data_frame.reset_index()
    data_frame.columns = ['ProxID','DCMSerDescr', 'path_to_resampled_file'] # renaming columns
    return data_frame

## Generating joined dataframes for each sequence type

In [5]:
print('Images:', images_train.columns.values)
print('Ktrans:', ktrans_train.columns.values)
print('Findings:', findings_train.columns.values)

Images: ['ProxID' 'Name' 'fid' 'pos' 'WorldMatrix' 'ijk' 'TopLevel'
 'SpacingBetweenSlices' 'VoxelSpacing' 'Dim' 'DCMSerDescr' 'DCMSerNum']
Ktrans: ['ProxID' 'fid' 'pos' 'WorldMatrix' 'ijk']
Findings: ['ProxID' 'fid' 'pos' 'zone' 'ClinSig']


In [6]:
def join_dataframes (sequence_df, images_train_df, findings_train_df):

    sequence_df.loc[:,'DCMSerDescr'] = sequence_df.loc[:,'DCMSerDescr'].apply(lambda x: x.lower())
    
    if 'DCMSerDescr' in list(images_train_df.columns.values):
        # Subset to desired columns only and lowercase
        images_train_df.loc[:,'DCMSerDescr'] = images_train_df.loc[:,'DCMSerDescr'].apply(lambda x: x.lower())
        images_train_df = images_train_df[['ProxID', 'DCMSerDescr', 'fid', 'pos', 'WorldMatrix', 'ijk']]
        
        first_merge = pd.merge(sequence_df, images_train_df, how = 'left', on = ['ProxID', 'DCMSerDescr'])    
        final_merge = pd.merge(first_merge, findings_train_df, how = 'left', on = ['ProxID', 'fid','pos'])
    else:
        first_merge = pd.merge(sequence_df, images_train_df, how = 'left', on = ['ProxID'])
        final_merge = pd.merge(first_merge, findings_train_df, how = 'left', on = ['ProxID', 'fid', 'pos'])

    return final_merge

In [7]:
t2_df = generate_df_for_sequence('t2')
adc_df = generate_df_for_sequence('adc')
bval_df = generate_df_for_sequence('bval')
ktrans_df = generate_df_for_sequence('ktrans')

In [8]:
t2_df_pe = join_dataframes(t2_df, images_train, findings_train)

In [9]:
adc_df_pe = join_dataframes(adc_df, images_train, findings_train)

In [10]:
bval_df_pe = join_dataframes(bval_df, images_train, findings_train)

In [12]:
ktrans_df_pe = join_dataframes(ktrans_df, ktrans_train, findings_train)

In [13]:
ktrans_df_pe

Unnamed: 0,ProxID,DCMSerDescr,path_to_resampled_file,fid,pos,WorldMatrix,ijk,zone,ClinSig
0,ProstateX-0005,prostatex-0005-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,0,-14.5174331665039 49.4428329467773 20.78152465...,"1.5,3.07655e-010,1.10673e-016,-117.778,-3.0648...",69 78 8,PZ,True
1,ProstateX-0005,prostatex-0005-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,-22.0892639160156 25.4668045043945 22.87915420...,"1.5,3.07655e-010,1.10673e-016,-117.778,-3.0648...",64 62 8,TZ,False
2,ProstateX-0005,prostatex-0005-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,-38.6276 42.2781 21.4084,"1.5,3.07655e-010,1.10673e-016,-117.778,-3.0648...",53 73 8,PZ,True
3,ProstateX-0002,prostatex-0002-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,-27.0102 41.5467 -26.0469,"1.5,3.07655e-010,7.98512e-017,-104.284,-2.9890...",52 73 11,PZ,True
4,ProstateX-0002,prostatex-0002-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,2,-2.058 38.6752 -34.6104,"1.5,3.07655e-010,7.98512e-017,-104.284,-2.9890...",68 72 8,PZ,False
5,ProstateX-0034,prostatex-0034-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,17.0042 60.678 5.75149,"1.5,3.07655e-010,-1.39642e-016,-85.0813,-2.963...",68 80 12,PZ,False
6,ProstateX-0033,prostatex-0033-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,-0.473669 43.2903 35.9394,"1.5,3.07655e-010,-5.59082e-018,-82.2922,-3.026...",55 75 6,PZ,False
7,ProstateX-0033,prostatex-0033-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,3,24.7403 33.959 34.6019,"1.5,3.07655e-010,-5.59082e-018,-82.2922,-3.026...",71 69 5,TZ,False
8,ProstateX-0032,prostatex-0032-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,19.0001 60.9885 -32.157,"1.5,3.07655e-010,-1.1706e-016,-96.6499,-2.7557...",77 81 14,PZ,False
9,ProstateX-0035,prostatex-0035-ktrans,/Users/alexanders-13mbp/DataProjects/MBI/CISC_...,1,-22.9279 28.8785 -29.5951,"1.5,3.07655e-010,-7.29555e-017,-98.5005,-2.957...",50 66 7,AS,True
