In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
from pathlib import Path

In [2]:
images_train = pd.read_csv('../data/raw/train_labels/ProstateX-Images-Train.csv')
ktrans_train = pd.read_csv('../data/raw/train_labels/ProstateX-Images-KTrans-Train.csv')
findings_train = pd.read_csv('../data/raw/train_labels/ProstateX-Findings-Train.csv')

In [3]:
# The usual check for successful conversions
dicom2nifti_success = Path.cwd().parent / 'logs/dicom2nifti_successful.txt'
successful_conv = dicom2nifti_success.read_text()
successful_conv = successful_conv.split('\n')
successful_conv = list(filter(None, successful_conv)) # For sanity - remove any empty string(s)

## Building dataframes for different sequences

In [4]:
root_dir = Path.cwd().parent
nifti = root_dir.joinpath('data/generated/nifti') 

def generate_df_for_sequence(sequence_type):
    """
    This function generates a dataframe for all patients in the dataset. Each row contains
    a string that is analgous to the DCMSerDescr label in the other train files. This string
    is generated from the original filename. The second column contains a path object for the 
    resampled nifti file. This table can be joined to the other training files to create one
    large table with the appropriate sampling information.
    """
    patient_data = {}
    patient_folders = [x for x in nifti.iterdir() if x.is_dir()]
    for patient in patient_folders:
        if patient.stem in successful_conv: 
            sequences = [x for x in patient.iterdir() if x.is_dir()]
            for sequence in sequences:
                if sequence.parts[-1] == sequence_type:
                    for item in sequence.rglob('*.*'):
                        
                        def generate_DCMSerDescr_from_filename():
                            # remove extension from path
                            full_name = item.parts[-1]
                            split = full_name.split('.') 
                            name_without_extension = split[0]

                            # remove first num and underscore from path
                            first_underscore = name_without_extension.find('_') + 1
                            value = name_without_extension[first_underscore:]
                            return value
                        
                        def get_path_to_resampled(sequence_type):
                            nifti_resampled = root_dir.joinpath('data/generated/nifti_resampled')
                            sequence_types = [x for x in nifti_resampled.iterdir() if x.is_dir()]
                            for sequence in sequence_types:
                                # check if directory name contains sequence type
                                if sequence_type in str(sequence):
                                    # then get all files in subdirectory
                                    files = sequence.rglob('*.*')          
                                    for file in files:
                                        # then check if filename contains patient_id
                                        if patient.parts[-1] in str(file): 
                                            path_to_resampled = file
                                            
                            return path_to_resampled
                        
                        DCMSerDescr_fn = generate_DCMSerDescr_from_filename()
                        path_to_resampled = get_path_to_resampled(sequence_type)
                        
                        key = patient.parts[-1] # patient_ID
                        value = [DCMSerDescr_fn, path_to_resampled]
                        patient_data[key] = value 
    
    data_frame = pd.DataFrame.from_dict(patient_data, orient = 'index')
    data_frame = data_frame.reset_index()
    data_frame.columns = ['ProxID','DCMSerDescr', 'path_to_resampled_file'] # renaming columns
    return data_frame

## Generating joined dataframes for each sequence type

In [5]:
print('Images:', images_train.columns.values)
print('Ktrans:', ktrans_train.columns.values)
print('Findings:', findings_train.columns.values)

Images: ['ProxID' 'Name' 'fid' 'pos' 'WorldMatrix' 'ijk' 'TopLevel'
 'SpacingBetweenSlices' 'VoxelSpacing' 'Dim' 'DCMSerDescr' 'DCMSerNum']
Ktrans: ['ProxID' 'fid' 'pos' 'WorldMatrix' 'ijk']
Findings: ['ProxID' 'fid' 'pos' 'zone' 'ClinSig']


In [6]:
def generate_df_for_patch_extraction (sequence_df, images_train_df, findings_train_df):
    should_join_DCMSerDescr = True
    images_train_df_subset = pd.DataFrame()
    
    if 'DCMSerDescr' in list(images_train_df.columns.values):
        images_train_df_subset = images_train_df[['ProxID', 'DCMSerDescr', 'fid', 'pos', 'WorldMatrix', 'ijk']]
    else:
        images_train_df_subset = images_train_df
        should_join_DCMSerDesc = False
    
            
    def join_dataframes (sequence_df, images_train_df_subset, findings_train_df):
        
        def preprocess_DCMSerDescr (sequence_df, images_train_df_subset):
            sequence_df.DCMSerDescr_from_fn = sequence_df.DCMSerDescr.apply(lambda x: x.lower())
            images_train_df_subset.DCMSerDescr = images_train_df_subset.DCMSerDescr.apply(lambda x: x.lower())
            return (sequence_df, images_train_df_subset)
        
        if should_join_DCMSerDescr == True:
            seq_df, images_df = preprocess_DCMSerDescr(sequence_df, images_train_df_subset)
            
            first_merge = pd.merge(seq_df, images_df, how = 'left', on = ['ProxID', 'DCMSerDescr'])
            final_merge = pd.merge(first_merge, findings_train_df, how = 'left', on = ['ProxID', 'fid','pos'])
        else:
            first_merge = pd.merge(sequence_df, images_train_df_subset, how = 'left', on = ['ProxID', 'fid', 'pos'])
            final_merge = pd.merge(first_merge, findings_train_df, how = 'left', on = ['ProxID', 'fid', 'pos'])
        
        return final_merge
    
    df = join_dataframes(sequence_df, images_train_df_subset, findings_train_df)
    return df
   
    
    # sequence_df (ProxID, DCMSerDescr)
    # images_train (ProxID, DCMSerDescr, fid, pos, WorldMatrix, ijk) -reduced
    # ktrans_train (ProxID, fid, pos, WorldMatrix, ijk) -complete
    # findings_train (ProxID, fid, pos, zone, ClinSig) -complete    

In [7]:
t2_df = generate_df_for_sequence('t2')
adc_df = generate_df_for_sequence('adc')
bval_df = generate_df_for_sequence('bval')
#ktrans_df = generate_df_for_sequence('ktrans')

In [8]:
t2_df_pe = generate_df_for_patch_extraction(t2_df, images_train, findings_train)
adc_df_pe = generate_df_for_patch_extraction(adc_df, images_train, findings_train)
bval_df_pe = generate_df_for_patch_extraction(bval_df, images_train, findings_train)
#ktrans_df_pe = generate_df_for_patch_extraction(ktrans_df, ktrans_train, findings_train)

  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [9]:
t2_df_pe.head()

Unnamed: 0,ProxID,DCMSerDescr,path_to_resampled_file,fid,pos,WorldMatrix,ijk,zone,ClinSig
0,ProstateX-0005,t2_tse_tra,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-22.0892639160156 25.4668045043945 22.87915420...,"0.5,1.02552e-010,1.10673e-016,-117.278,-1.0216...",190 185 10,TZ,False
1,ProstateX-0005,t2_tse_tra,/Users/alexanders-13mbp/DataProjects/MBioInfor...,0,-14.5174331665039 49.4428329467773 20.78152465...,"0.5,1.02552e-010,1.10673e-016,-117.278,-1.0216...",206 233 10,PZ,True
2,ProstateX-0005,t2_tse_tra,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-38.6276 42.2781 21.4084,"0.5,1.02552e-010,1.10673e-016,-117.278,-1.0216...",157 219 10,PZ,True
3,ProstateX-0002,t2_tse_tra,/Users/alexanders-13mbp/DataProjects/MBioInfor...,2,-2.058 38.6752 -34.6104,"0.5,1.02552e-010,7.98512e-017,-103.784,-9.9634...",203 216 10,PZ,False
4,ProstateX-0002,t2_tse_tra,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-27.0102 41.5467 -26.0469,"0.5,1.02552e-010,7.98512e-017,-103.784,-9.9634...",154 217 12,PZ,True


In [10]:
adc_df_pe.head()

Unnamed: 0,ProxID,DCMSerDescr,path_to_resampled_file,fid,pos,WorldMatrix,ijk,zone,ClinSig
0,ProstateX-0005,ep2d_diff_tra_dyndist_mix_adc,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-22.0892639160156 25.4668045043945 22.87915420...,"2,4.10207e-010,1.10673e-016,-106.028,-4.08646e...",42 63 9,TZ,False
1,ProstateX-0005,ep2d_diff_tra_dyndist_mix_adc,/Users/alexanders-13mbp/DataProjects/MBioInfor...,0,-14.5174331665039 49.4428329467773 20.78152465...,"2,4.10207e-010,1.10673e-016,-106.028,-4.08646e...",46 75 9,PZ,True
2,ProstateX-0005,ep2d_diff_tra_dyndist_mix_adc,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-38.6276 42.2781 21.4084,"2,4.10207e-010,1.10673e-016,-106.028,-4.08646e...",34 71 9,PZ,True
3,ProstateX-0002,ep2d_diff_tra_dyndist_adc,/Users/alexanders-13mbp/DataProjects/MBioInfor...,2,-2.058 38.6752 -34.6104,"2,4.10207e-010,7.98512e-017,-92.5336,-3.98536e...",45 70 9,PZ,False
4,ProstateX-0002,ep2d_diff_tra_dyndist_adc,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-27.0102 41.5467 -26.0469,"2,4.10207e-010,7.98512e-017,-92.5336,-3.98536e...",33 71 12,PZ,True


In [11]:
bval_df_pe.head()

Unnamed: 0,ProxID,DCMSerDescr,path_to_resampled_file,fid,pos,WorldMatrix,ijk,zone,ClinSig
0,ProstateX-0005,ep2d_diff_tra_dyndist_mixcalc_bval,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-22.0892639160156 25.4668045043945 22.87915420...,"2,4.10207e-010,1.10673e-016,-106.028,-4.08646e...",42 63 9,TZ,False
1,ProstateX-0005,ep2d_diff_tra_dyndist_mixcalc_bval,/Users/alexanders-13mbp/DataProjects/MBioInfor...,0,-14.5174331665039 49.4428329467773 20.78152465...,"2,4.10207e-010,1.10673e-016,-106.028,-4.08646e...",46 75 9,PZ,True
2,ProstateX-0005,ep2d_diff_tra_dyndist_mixcalc_bval,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-38.6276 42.2781 21.4084,"2,4.10207e-010,1.10673e-016,-106.028,-4.08646e...",34 71 9,PZ,True
3,ProstateX-0002,ep2d_diff_tra_dyndistcalc_bval,/Users/alexanders-13mbp/DataProjects/MBioInfor...,2,-2.058 38.6752 -34.6104,"2,4.10207e-010,7.98512e-017,-92.5336,-3.98536e...",45 70 9,PZ,False
4,ProstateX-0002,ep2d_diff_tra_dyndistcalc_bval,/Users/alexanders-13mbp/DataProjects/MBioInfor...,1,-27.0102 41.5467 -26.0469,"2,4.10207e-010,7.98512e-017,-92.5336,-3.98536e...",33 71 12,PZ,True


In [12]:
#ktrans_df_pe.head()