## Jupyter Notebook for Data Vis and Exploration

In [1]:
import scipy.io 
import os
import numpy as np
import pandas as pd

For each animal folder in the MATLAB_Output_DataSets directory, the parameter maps, ROIs, and labels are loaded. 
1. Parameter map volumes are reshaped into 1D numpy arrays, concatenated column-wise, and put into a dataframe (dfAN) with the .mat file names (without the extension) as the column names. 
    a. The animal number and image index are stored for each row 
    
2. ROI volumes are reshaped into 1D numpy arrays and put into their own dataframe. Then, "getSegmentations" generates a Series for Seg1 (all viable voxels, all non-viable voxels) and a Series for Seg2 (boundary viable voxels, muscle viable voxels, all non-viable voxels). Both Series are returned and added to (dfAN) as columns of Categorical dtype. 
    a. 'allV' = all viable 
    b. 'nonV' = all non-viable
    c. 'bndV' = boundary (enhanced) viable
    d. 'mucV' = muscle viable 
    
3. 

In [4]:
# Root of the local mp-MRI-Analysis checkout. The original hard-coded location
# is kept as the default so existing runs are unaffected; set the
# MP_MRI_ANALYSIS_ROOT environment variable to run the notebook elsewhere.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations below rely on.
gl_path = os.path.join(
    os.environ.get('MP_MRI_ANALYSIS_ROOT',
                   '/Users/sjohnson/GitHubRepositories/mp-MRI-Analysis/'),
    '')

# Every sub-directory of MATLAB_Output_DataSets is a candidate animal folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and the printed list) reproducible.
data_path = gl_path + 'MATLAB_Output_DataSets/'
dirlist = sorted(f for f in os.listdir(data_path)
                 if os.path.isdir(os.path.join(data_path, f)))
dirlist = np.asarray(dirlist)

# Keep only the folders whose name contains "18_" (the animal IDs).
ii = [i for i, item in enumerate(dirlist) if "18_" in item]
anNumbers = dirlist[ii]
print(anNumbers)

# Define functions for generating categorical segmentation of each index 

def _first_matching_label(df, priority):
    """Label each row with the first column in `priority` whose value equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `priority`.
    priority : sequence of str
        Column names, highest priority first.

    Returns
    -------
    pandas.Series
        Object-dtype Series on df.index holding the winning column name, or
        None for rows where no listed column equals 1.
    """
    labels = np.full(len(df), None, dtype=object)
    # Assign from lowest to highest priority so that a higher-priority column
    # overwrites any lower-priority label already written for the same row.
    for name in reversed(priority):
        labels[(df[name] == 1).values] = name
    return pd.Series(labels, index=df.index)


def getSegmentations(df):
    """Build the two categorical voxel segmentations from binary ROI columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per voxel with the columns 'allV', 'nonV', 'bndV' and 'mucV';
        a value of 1 marks membership in that ROI.

    Returns
    -------
    (Seg1, Seg2) : tuple of pandas.Series
        Seg1 is 'allV' where allV == 1, otherwise 'nonV' where nonV == 1.
        Seg2 is 'nonV' where nonV == 1, otherwise 'bndV' where bndV == 1,
        otherwise 'mucV' where mucV == 1.
        Rows matching none of the tested columns are None in the respective
        Series. Both Series share df's index.

    The label counts of both segmentations are printed as a side effect.
    The labelling is done with vectorised boolean masks rather than a
    per-row apply, which also makes an empty frame a valid input.
    """
    # Segmentation 1: "Viable" and "Non-viable" only
    Seg1 = _first_matching_label(df, ['allV', 'nonV'])
    print('Number of voxels in Seg1:')
    print(Seg1.value_counts())

    # Segmentation 2: "Non-viable", "Viable Boundary", "Viable Muscle"
    Seg2 = _first_matching_label(df, ['nonV', 'bndV', 'mucV'])
    print('Numer of voxels in Seg2:')
    print(Seg2.value_counts())

    return Seg1, Seg2




['18_044' '18_045' '18_047' '18_048' '18_054' '18_060']


In [6]:
#masDF = pd.HDFStore('Master_dataframes.h5')

# Build one voxel-level dataframe per animal and pickle it.
# For every animal ID in anNumbers the loop
#   1. loads each parameter map under <animal>/Param_maps/ into one column,
#   2. adds the Seg1/Seg2 segmentations and the label image for every ROI file
#      whose name contains "Res",
#   3. drops voxels without a Seg1_Grp_HighRes label, fills remaining gaps in
#      the segmentation columns, drops rows that still contain NaN, and
#   4. writes the result to Dataframes_and_Stats/<animal>_df.pkl.
# Relies on gl_path, anNumbers and getSegmentations from the cells above.
for anID in anNumbers:

    #### Get the multi-parametric data for this animal ######
    param_path = gl_path + 'MATLAB_Output_DataSets/' + anID + '/Param_maps/'

    # os.listdir returns names in arbitrary order; sorting keeps the column
    # order of the dataframe reproducible between runs and machines.
    param_files = sorted(
        name for name in os.listdir(param_path)
        if os.path.isfile(os.path.join(param_path, name)))

    # Load every parameter map and flatten it to an (n_voxels, 1) column.
    # Columns are collected in lists and concatenated once, which also keeps
    # colNames a list when the folder holds a single file.
    # NOTE(review): the column name is the file name minus its last four
    # characters, so a file name of four characters or fewer yields an empty
    # column name - confirm every file here is an intended "<name>.mat".
    colNames = []
    param_columns = []
    for fname in param_files:
        mat = scipy.io.loadmat(param_path + fname)
        param_columns.append(mat['paramImg'].reshape((-1, 1)))
        colNames.append(fname[0:-4])
    import_array = np.concatenate(param_columns, axis=1)

    # Create the dataframe
    dfAN = pd.DataFrame(data=import_array, columns=colNames)
    print(anID)
    dfAN['anID'] = anID
    print(dfAN['anID'].head(2))
    # Keep the original flat voxel index; rows are filtered out further down.
    dfAN['ImageIdx'] = pd.Series(dfAN.index, index=dfAN.index)

    ###### Get the ROIs and Labels ########
    # Both folders are located inside Param_maps.
    roi_path = param_path + 'ROI_masks/'
    label_path = gl_path + 'MATLAB_Output_DataSets/' + anID + '/Param_maps/Labels/'

    # Only ROI files whose name contains "Res" are used.
    roi_files = sorted(
        name for name in os.listdir(roi_path)
        if os.path.isfile(os.path.join(roi_path, name)) and "Res" in name)

    # The label folder does not change per ROI file, so list it once.
    label_files = sorted(
        name for name in os.listdir(label_path)
        if os.path.isfile(os.path.join(label_path, name)))

    for roi_file in roi_files:

        # ROI type: file name without its first 10 characters and extension.
        colName = roi_file[10:-4]
        mat = scipy.io.loadmat(roi_path + roi_file)

        # Flatten each ROI volume into a vector aligned with dfAN's rows.
        df = pd.DataFrame({
            'allV': mat['ROIviaA'].reshape(-1),  # all viable tissue in muscle and tumor
            'nonV': mat['ROInpv'].reshape(-1),   # all non-perfused/non-viable tissue
            'bndV': mat['ROIbnd'].reshape(-1),   # viable voxels that surround non-perfused volume (mostly enhancing)
            'mucV': mat['ROIviaB'].reshape(-1),  # viable voxels that exclude bndV (mostly healthy muscle)
        }, index=dfAN.index)

        Seg1, Seg2 = getSegmentations(df)
        dfAN['Seg1_' + colName] = pd.Categorical(Seg1)
        dfAN['Seg2_' + colName] = pd.Categorical(Seg2)

        # Find the label file for this ROI type. A plain substring test is
        # ambiguous when one ROI type is a prefix of another (e.g.
        # "Grp_LowRes" also matches "..._Grp_LowRes2.mat"), so prefer a file
        # whose name, minus the extension, ends with the ROI type and fall
        # back to the first substring match otherwise.
        matches = [name for name in label_files if colName in name]
        if not matches:
            raise IOError('No label file matching "' + colName + '" in ' + label_path)
        exact = [name for name in matches if name[0:-4].endswith(colName)]
        label_file = exact[0] if exact else matches[0]

        label = scipy.io.loadmat(label_path + label_file)['labelImg']
        dfAN['LABEL_' + colName] = pd.Series(label.reshape(-1))

    print(dfAN['anID'].head(2))
    # Delete dataframe entries that are outside the muscle and npv ROIs
    dfANs = dfAN[dfAN.Seg1_Grp_HighRes.notnull()]

    # fill in remaining holes with viable muscle
    values = {
        'Seg1_Grp_HighRes': 'allV', 'Seg2_Grp_HighRes': 'mucV',
        'Seg1_Grp_LowRes': 'allV', 'Seg2_Grp_LowRes': 'mucV',
        'Seg1_Grp_LowRes2': 'allV', 'Seg2_Grp_LowRes2': 'mucV',
        'Seg1_Indv_HighRes': 'allV', 'Seg2_Indv_HighRes': 'mucV',
    }
    dfANs = dfANs.fillna(value=values)

    # remove any data points with missing data (NaN) at this point
    dfANs = dfANs.dropna(axis=0, how='any')

    # Print info about dfANs. DataFrame.info() writes the summary itself and
    # returns None, so it is not wrapped in print().
    dfANs.info()

    # save dfANs to file for specific animal
    nsave = gl_path + 'Dataframes_and_Stats/' + anID + '_df.pkl'
    dfANs.to_pickle(nsave)

    # Release the per-animal frames before the next iteration.
    del dfANs
    del dfAN

18_044
0    18_044
1    18_044
Name: anID, dtype: object
Number of voxels in Seg1:
allV    14966
nonV     1242
dtype: int64
Numer of voxels in Seg2:
mucV    10408
bndV     4558
nonV     1242
dtype: int64
Number of voxels in Seg1:
allV    15137
nonV     1071
dtype: int64
Numer of voxels in Seg2:
mucV    10408
bndV     4729
nonV     1071
dtype: int64
Number of voxels in Seg1:
allV    14628
nonV     1608
dtype: int64
Numer of voxels in Seg2:
mucV    10408
bndV     4220
nonV     1608
dtype: int64
Number of voxels in Seg1:
allV    15238
nonV      974
dtype: int64
Numer of voxels in Seg2:
mucV    10408
bndV     4830
nonV      974
dtype: int64
0    18_044
1    18_044
Name: anID, dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15258 entries, 41972 to 151240
Data columns (total 31 columns):
                      15258 non-null float64
CTD_Map               15258 non-null float64
MTP_Map               15258 non-null float64
Post_ADC              15258 non-null float64
Post_T1    

18_054
0    18_054
1    18_054
Name: anID, dtype: object
Number of voxels in Seg1:
allV    19200
nonV       21
dtype: int64
Numer of voxels in Seg2:
mucV    18590
bndV      610
nonV       21
dtype: int64
Number of voxels in Seg1:
allV    19221
dtype: int64
Numer of voxels in Seg2:
mucV    18590
bndV      631
dtype: int64
Number of voxels in Seg1:
allV    19197
nonV       24
dtype: int64
Numer of voxels in Seg2:
mucV    18590
bndV      607
nonV       24
dtype: int64
Number of voxels in Seg1:
allV    19213
nonV        8
dtype: int64
Numer of voxels in Seg2:
mucV    18590
bndV      623
nonV        8
dtype: int64
0    18_054
1    18_054
Name: anID, dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19221 entries, 23151 to 179657
Data columns (total 30 columns):
CTD_Map               19221 non-null float64
MTP_Map               19221 non-null float64
Post_ADC              19221 non-null float64
Post_T1               19221 non-null float64
Post_T1w              19221 non-null fl