In [1]:
import os
import pandas as pd
import glob 

In [2]:
import scipy
import numpy as np
import scipy.io as spio


def loadmat(filename):
    '''
    this function should be called instead of direct spio.loadmat
    as it cures the problem of not properly recovering python dictionaries
    from mat files. It calls the function check keys to cure all entries
    which are still mat-objects
    '''
    def _check_keys(d):
        '''
        checks if entries in dictionary are mat-objects. If yes
        todict is called to change them to nested dictionaries
        '''
        for key in d:
            if isinstance(d[key], spio.matlab.mio5_params.mat_struct):
                d[key] = _todict(d[key])
        return d

    def _todict(matobj):
        '''
        A recursive function which constructs from matobjects nested dictionaries
        '''
        d = {}
        for strg in matobj._fieldnames:
            elem = matobj.__dict__[strg]
            if isinstance(elem, spio.matlab.mio5_params.mat_struct):
                d[strg] = _todict(elem)
            elif isinstance(elem, np.ndarray):
                d[strg] = _tolist(elem)
            else:
                d[strg] = elem
        return d

    def _tolist(ndarray):
        '''
        A recursive function which constructs lists from cellarrays
        (which are loaded as numpy ndarrays), recursing into the elements
        if they contain matobjects.
        '''
        elem_list = []
        for sub_elem in ndarray:
            if isinstance(sub_elem, spio.matlab.mio5_params.mat_struct):
                elem_list.append(_todict(sub_elem))
            elif isinstance(sub_elem, np.ndarray):
                elem_list.append(_tolist(sub_elem))
            else:
                elem_list.append(sub_elem)
        return elem_list
    data = scipy.io.loadmat(filename, struct_as_record=False, squeeze_me=True)
    return _check_keys(data)

In [3]:
homedir='/media/raghuram/My Passport/dicom_seg/TCGA-LGG'

In [4]:
os.chdir(homedir)

In [5]:
dicom_file_params_df=pd.read_csv('dicom_file_params.csv')

In [6]:
dicom_file_params_df.shape

(24398, 9)

In [7]:
t1_post_samples_df = pd.read_csv('t1_post_samples.csv')
t1_pre_samples_df = pd.read_csv('t1_pre_samples.csv')
flair_samples = pd.read_csv('flair_samples.csv')
t2_pre_samples = pd.read_csv('t2_pre_samples.csv')

In [8]:
# t1_post_samples_df['Sequence Name']

In [9]:
sequence_mapper_dict = {'T1CE':t1_post_samples_df.values,
               'T1W':t1_pre_samples_df.values,
               'T2W': t2_pre_samples.values,
               'T2F': flair_samples.values}

In [None]:
mapped_sequence = []
for idx, row in dicom_file_params_df.iterrows():
    try:
        if row['scanning_seq_mri'] in sequence_mapper_dict['T1CE']:
            mapped_sequence.append('T1CE')
        elif row['scanning_seq_mri'] in sequence_mapper_dict['T1W']:
            mapped_sequence.append('T1W')
        elif row['scanning_seq_mri'] in sequence_mapper_dict['T2W']:
            mapped_sequence.append('T2W')
        elif row['scanning_seq_mri'] in sequence_mapper_dict['T2F']:
            mapped_sequence.append('T2F')
    except Exception as e:
        print('{}, {} in row {}'.format(e, row['scanning_seq_mri'], idx))

In [None]:
dicom_file_params_df['mat_file_sequence'] = mapped_sequence
dicom_file_params_df.to_csv('dicom_file_params.csv', index=False)

In [None]:
patient_name_list = []
for idx, row in dicom_file_params_df.iterrows():
    try:
        patient_name_list.append(row['filename'].split('/TCGA-LGG')[1].split('/')[1])
    except Exception as e:
        print('Error {} at index {}'.format(e, idx))

In [None]:
dicom_file_params_df['patient_name'] = patient_name_list
dicom_file_params_df.to_csv('dicom_file_params.csv', index=False)

In [10]:
mat_files_dir = '/home/raghuram/Desktop/radiomics/TEXTURES'

In [11]:
os.chdir(mat_files_dir)

In [12]:
mat_files_list = glob.glob('*.mat')

In [13]:
data = loadmat(mat_files_list[0])

In [14]:
mat_files_list[-1 ]

'TCGA-CS-5394_T2W.mat'

In [15]:
dicom_file_params_df['mat_file_name'] = dicom_file_params_df['patient_name']+'_'+dicom_file_params_df['mat_file_sequence']+'.mat'

In [16]:
dicom_file_params_df.drop_duplicates(subset=['mat_file_name'], keep='first', inplace=True)

In [17]:
dicom_file_params_df.shape

(347, 10)

In [18]:
dicom_file_params_df.to_csv('dicom_file_params_df_with_mat_files.csv', index=False)

In [19]:
missing = set(mat_files_list).difference(list(dicom_file_params_df['mat_file_name']))

In [21]:
missing

{'TCGA-CS-4942_T1CE.mat',
 'TCGA-CS-4943_T1CE.mat',
 'TCGA-CS-4944_T1CE.mat',
 'TCGA-CS-5390_T1CE.mat',
 'TCGA-CS-5393_T1W.mat',
 'TCGA-CS-5393_T2W.mat',
 'TCGA-CS-5394_T1CE.mat',
 'TCGA-CS-5395_T1CE.mat',
 'TCGA-CS-5395_T2W.mat',
 'TCGA-CS-5396_T1CE.mat',
 'TCGA-CS-5397_T1CE.mat',
 'TCGA-CS-6665_T1CE.mat',
 'TCGA-CS-6669_T1CE.mat',
 'TCGA-DU-5851_T1CE.mat',
 'TCGA-DU-6399_T1CE.mat',
 'TCGA-DU-6399_T1W.mat',
 'TCGA-DU-6399_T2W.mat',
 'TCGA-DU-6400_T1CE.mat',
 'TCGA-DU-6400_T1W.mat',
 'TCGA-DU-6400_T2F.mat',
 'TCGA-DU-6400_T2W.mat',
 'TCGA-DU-6405_T1CE.mat',
 'TCGA-DU-6405_T1W.mat',
 'TCGA-DU-6405_T2F.mat',
 'TCGA-DU-6405_T2W.mat',
 'TCGA-DU-6408_T1CE.mat',
 'TCGA-DU-6408_T1W.mat',
 'TCGA-DU-6408_T2F.mat',
 'TCGA-DU-6408_T2W.mat',
 'TCGA-DU-6410_T1CE.mat',
 'TCGA-DU-7008_T1CE.mat',
 'TCGA-DU-7008_T1W.mat',
 'TCGA-DU-7008_T2F.mat',
 'TCGA-DU-7008_T2W.mat',
 'TCGA-DU-A5TS_T1CE.mat',
 'TCGA-DU-A5TU_T1CE.mat',
 'TCGA-DU-A5TU_T1W.mat',
 'TCGA-DU-A5TU_T2F.mat',
 'TCGA-DU-A5TU_T2W.mat',
 'TCGA

In [47]:
data = loadmat(mat_files_list[0])

In [69]:
def extract_flatten_features(mat_data):
#     features_flattened = []
    for experiment_, values in data['textures']['List'].items():
        experiment_number = int(experiment_.split('Experiment')[1])
        if experiment_number > 25:
            break
        scale_ = float(values.split(',')[0].split('=')[1])
        algo_ = values.split(',')[1].split('=')[1]
        ng_ = int(values.split(',')[2].split('=')[1])
#         print(experiment_, scale_, algo_, ng_)
        flattened_df = pd.io.json.json_normalize(data['textures'][experiment_], sep='_')
        flattened_features = flattened_df.to_dict(orient='records')[0]
#         print(flattened_df.shape)
#         break
        
            
   

In [70]:
extract_flatten_features(data)

(1, 45)
