## Import Libs

In [None]:
# Mount Google Drive at /content/drive; the dataset archive and the project
# folders used by the later cells are all addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%reload_ext autoreload
%autoreload 2
!pip install json5 -q
!pip install SimpleITK -q

import os, sys, runpy, json5, pandas as pd
import numpy as np
import SimpleITK as sitk
from pprint import pprint, pformat
from IPython.display import display
from collections import OrderedDict

[K     |████████████████████████████████| 52.8 MB 118 kB/s 
[?25h

## Extract Dataset

In [None]:
# Extract into /content explicitly so the files always land where data_dir
# (/content/Task01_BrainTumour) expects them, regardless of the current working
# directory (later cells %cd into the Drive project). -v is dropped so the cell
# does not print one line per archive member.
!tar -xf /content/drive/MyDrive/mah_ws/dataset/MSD/Task01_BrainTumour.tar -C /content

## Params

In [None]:
%cd /content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private
# Resolve the folders used by the rest of the notebook. When __file__ is not
# defined, the fallback path makes dirname() return "." and current_dir becomes
# the current working directory.
notebook_file = globals().get('__file__', './statistics.ipynb.ipynb')
current_dir = os.path.abspath(os.path.dirname(notebook_file))
root_dir = os.path.abspath(f"{current_dir}/../")
data_dir = os.path.abspath("/content/Task01_BrainTumour/")
work_dir = os.path.abspath(f"{root_dir}/private/stat/")

# Echo the resolved locations; a name whose value is missing/empty is printed bare.
banner = "-" * 10
print(banner, "Information", banner)
scope = globals()
for var_name in ('root_dir', 'current_dir', 'data_dir', 'work_dir'):
    var_value = scope.get(var_name)
    print(f'+ {var_name}: {var_value}' if var_value else var_name)
print()

# Move the project folders to the front of sys.path (current_dir ends up first)
# so project-local packages win over installed ones with the same name.
for path in (root_dir, current_dir):
    try:
        sys.path.remove(path)
    except ValueError:
        pass  # was not on sys.path yet
    sys.path.insert(0, path)


/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private
---------- Information ----------
+ root_dir: /content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor
+ current_dir: /content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private
+ data_dir: /content/Task01_BrainTumour
+ work_dir: /content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat



# Dataset Info

## Config Info

In [None]:
# Read the dataset manifest (dataset.json) from the extracted dataset folder.
# It is parsed with json5; the result is kept in data_info for the cells below.
with open(f'{data_dir}/dataset.json', 'rt') as file:
    data_info = json5.load(file)

In [None]:
def view_dataset_info(info, title = "Dataset Information"):
    """Print a dataset manifest: scalar fields as text, file lists as tables.

    Args:
        info: Mapping of manifest field name to value. The ``training`` and
            ``test`` entries are rendered as DataFrames via ``display``; every
            other entry is printed on one ``name: value`` line.
        title: Text shown in the banner line.

    Returns:
        None. The function only prints / displays.
    """
    table_fields = ('training', 'test')
    rule = "-" * 10
    print(rule, title, rule)
    for field in info:
        label = f'{field:18s}'
        if field in table_fields:
            # File lists are long; show them as a (truncated) table instead of raw text.
            print(f'{label}:')
            display(pd.DataFrame(info[field]))
        else:
            print(f'{label}: {info[field]}')
    print()

In [None]:
view_dataset_info(data_info)

---------- Dataset Information ----------
name              : BRATS
description       : Gliomas segmentation tumour and oedema in on brain images
reference         : https://www.med.upenn.edu/sbia/brats2017.html
licence           : CC-BY-SA 4.0
release           : 2.0 04/05/2018
tensorImageSize   : 4D
modality          : {'0': 'FLAIR', '1': 'T1w', '2': 't1gd', '3': 'T2w'}
labels            : {'0': 'background', '1': 'edema', '2': 'non-enhancing tumor', '3': 'enhancing tumour'}
numTraining       : 484
numTest           : 266
training          :


Unnamed: 0,image,label
0,./imagesTr/BRATS_457.nii.gz,./labelsTr/BRATS_457.nii.gz
1,./imagesTr/BRATS_306.nii.gz,./labelsTr/BRATS_306.nii.gz
2,./imagesTr/BRATS_206.nii.gz,./labelsTr/BRATS_206.nii.gz
3,./imagesTr/BRATS_449.nii.gz,./labelsTr/BRATS_449.nii.gz
4,./imagesTr/BRATS_318.nii.gz,./labelsTr/BRATS_318.nii.gz
...,...,...
479,./imagesTr/BRATS_190.nii.gz,./labelsTr/BRATS_190.nii.gz
480,./imagesTr/BRATS_327.nii.gz,./labelsTr/BRATS_327.nii.gz
481,./imagesTr/BRATS_476.nii.gz,./labelsTr/BRATS_476.nii.gz
482,./imagesTr/BRATS_090.nii.gz,./labelsTr/BRATS_090.nii.gz


test              :


Unnamed: 0,0
0,./imagesTs/BRATS_557.nii.gz
1,./imagesTs/BRATS_549.nii.gz
2,./imagesTs/BRATS_683.nii.gz
3,./imagesTs/BRATS_534.nii.gz
4,./imagesTs/BRATS_545.nii.gz
...,...
261,./imagesTs/BRATS_564.nii.gz
262,./imagesTs/BRATS_519.nii.gz
263,./imagesTs/BRATS_568.nii.gz
264,./imagesTs/BRATS_515.nii.gz





In [None]:
# Channel index (string key) -> MRI sequence name. Same values as the "modality"
# field of dataset.json printed above; the summary functions further down
# iterate over this mapping and use int(key) to pick the image channel.
modality = {'0': 'FLAIR', '1': 'T1w', '2': 't1gd', '3': 'T2w'}

In [None]:
# Training entries are {image, label} records; test entries are bare image
# paths, so they get an explicit 'image' column and no label.
df_train = pd.DataFrame(data_info['training']).assign(phase="train")
df_valid = pd.DataFrame({'image': data_info['test']}).assign(phase="test")

# ignore_index=True gives every row a unique label. Without it the train and
# test rows both start at 0, so df_data.loc[k] would be ambiguous and the
# duplicated index would be written to the HDF5 copy.
df_data = pd.concat([df_train, df_valid], ignore_index=True)
display(df_data.groupby(['phase']).describe())

Unnamed: 0_level_0,image,image,image,image,label,label,label,label
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
phase,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
test,266,266,./imagesTs/BRATS_557.nii.gz,1,0,0,,
train,484,484,./imagesTr/BRATS_457.nii.gz,1,484,484,./labelsTr/BRATS_457.nii.gz,1.0


In [None]:
# Create the output folder up front: to_excel / to_hdf do not create missing
# directories and would fail on a fresh work_dir.
os.makedirs(f"{work_dir}/statistics", exist_ok=True)

# Excel copy of the manifest (reloaded by the statistics section).
file_path = f"{work_dir}/statistics/df_data.xlsx"
df_data.to_excel(file_path, index = False)
print(file_path)

# HDF5 copy of the same table, stored under the key "data".
file_path = f"{work_dir}/statistics/df_data.hdf5"
df_data.to_hdf(file_path, key = "data")
print(file_path)



/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat/df_data.xlsx
/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat/df_data.hdf5


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['image', 'label', 'phase'], dtype='object')]

  pytables.to_hdf(


In [None]:
df_data

Unnamed: 0,image,label,phase
0,./imagesTr/BRATS_457.nii.gz,./labelsTr/BRATS_457.nii.gz,train
1,./imagesTr/BRATS_306.nii.gz,./labelsTr/BRATS_306.nii.gz,train
2,./imagesTr/BRATS_206.nii.gz,./labelsTr/BRATS_206.nii.gz,train
3,./imagesTr/BRATS_449.nii.gz,./labelsTr/BRATS_449.nii.gz,train
4,./imagesTr/BRATS_318.nii.gz,./labelsTr/BRATS_318.nii.gz,train
...,...,...,...
261,./imagesTs/BRATS_564.nii.gz,,test
262,./imagesTs/BRATS_519.nii.gz,,test
263,./imagesTs/BRATS_568.nii.gz,,test
264,./imagesTs/BRATS_515.nii.gz,,test


# Statistics
## Libs

In [None]:
%cd /content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/libs

/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/libs


In [None]:
!pip install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 4.9 MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.1


In [None]:
from libs.sitk_utils import *
from tqdm.auto import tqdm

In [None]:

# Reload the manifest from the Excel copy so this section can start from the
# saved file instead of rebuilding df_data from dataset.json.
file_path = f"{work_dir}/statistics/df_data.xlsx"
# keep_default_na=False together with na_values="" limits NaN parsing to empty
# values only. NOTE(review): rows without a label (presumably the test rows)
# therefore come back as NaN, not as "" -- confirm before filtering on label.
df_data = pd.read_excel(file_path, keep_default_na=False, na_values="")

print("df_data")
display(df_data.head())

# Bare expression: rendered as the cell's output in addition to the head above.
df_data


Unnamed: 0,image,label,phase
0,./imagesTr/BRATS_457.nii.gz,./labelsTr/BRATS_457.nii.gz,train
1,./imagesTr/BRATS_306.nii.gz,./labelsTr/BRATS_306.nii.gz,train
2,./imagesTr/BRATS_206.nii.gz,./labelsTr/BRATS_206.nii.gz,train
3,./imagesTr/BRATS_449.nii.gz,./labelsTr/BRATS_449.nii.gz,train
4,./imagesTr/BRATS_318.nii.gz,./labelsTr/BRATS_318.nii.gz,train


In [None]:
def image_intensity_summary(data_root, df_data, global_scope = None, verbose = True, channel=False, **kwargs):
    """Summarise image intensities per modality and save one workbook per modality.

    Every file listed in ``df_data['image']`` is read once as a float32 image.
    For each key of the notebook-level ``modality`` mapping, the volume at index
    ``int(key)`` along the fourth axis is passed to ``sitk_get_image_info`` and
    the resulting row is collected. Rows are written to
    ``{work_dir}/statistics/<modality name>_image_stat.xlsx`` (``work_dir`` is a
    notebook-level variable).

    Args:
        data_root: Folder the relative ``image`` paths are resolved against.
        df_data: DataFrame with at least an ``image`` column. All of its columns
            are copied to the front of every statistics row.
        global_scope: Unused by this function.
        verbose: When True, stop after the first four rows (quick trial run).
        channel: Unused.
        **kwargs: Unused.

    Returns:
        A DataFrame with the rows of all modalities, indexed by
        ``(modality, row)``; e.g. ``result.loc["FLAIR"]`` is the FLAIR table.
        An empty DataFrame is returned when ``modality`` has no entries.
    """
    # to_excel does not create missing folders.
    os.makedirs(f"{work_dir}/statistics", exist_ok=True)

    # Reversed on purpose: each column is moved to the front in turn, so the
    # manifest columns end up leading every row in their original order.
    data_column_names = list(df_data.keys())[::-1]

    # One independent row list per channel, so a modality's workbook contains
    # only that modality's rows.
    rows_by_channel = {i: [] for i in modality}

    for idx in tqdm(range(len(df_data)), "Image"):
        row_info = df_data.iloc[idx]
        image_file = os.path.abspath(os.path.join(data_root, row_info['image']))

        # Read the 4-D file once and reuse it for every channel.
        image_sitk = sitk.ReadImage(image_file, sitk.sitkFloat32)

        for i, stat_info in rows_by_channel.items():
            # The helper's result must support move_to_end (e.g. an OrderedDict).
            image_info = sitk_get_image_info(image_sitk[:,:,:,int(i)], prefix = "", has_boxplot=True)
            for v in data_column_names:
                image_info[v] = row_info[v]
                image_info.move_to_end(v, last=False)
            stat_info.append(image_info)

        if verbose == True and idx > 2:
            break

    stats_by_modality = {}
    for i, stat_info in rows_by_channel.items():
        df_stat = pd.DataFrame(stat_info)
        file_path = f"{work_dir}/statistics/{modality[i]}_image_stat.xlsx"
        print(file_path)
        df_stat.to_excel(file_path, index = False)
        stats_by_modality[modality[i]] = df_stat

    if not stats_by_modality:
        return pd.DataFrame()
    return pd.concat(stats_by_modality, names=["modality", "row"])
df_image_stat = image_intensity_summary(data_dir, df_data, globals(), verbose = False)


Image:   0%|          | 0/750 [00:00<?, ?it/s]

/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat/FLAIR_image_stat.xlsx


Image:   0%|          | 0/750 [00:00<?, ?it/s]

/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat/T1w_image_stat.xlsx


Image:   0%|          | 0/750 [00:00<?, ?it/s]

/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat/t1gd_image_stat.xlsx


Image:   0%|          | 0/750 [00:00<?, ?it/s]

/content/drive/MyDrive/mah_ws/ai_prj/3D_Tumor/private/stat/T2w_image_stat.xlsx


## Label Image Intensity


In [None]:
def label_intensity_summary(data_root, df_data, global_scope = None, verbose = True, channel=False, **kwargs):
    """Summarise image intensities inside the label regions, per modality.

    Only rows of ``df_data`` whose ``label`` is a non-empty string are used.
    For each such row the image (float32) and label (uint8) files are read
    once; for each key of the notebook-level ``modality`` mapping, the image
    volume at index ``int(key)`` along the fourth axis is passed together with
    the label image to ``sitk_label_intensity_info``. Rows are written to
    ``{work_dir}/statistics/<modality name>_label_stat.xlsx`` (``work_dir`` is a
    notebook-level variable).

    Args:
        data_root: Folder the relative ``image`` / ``label`` paths are resolved against.
        df_data: DataFrame with ``image`` and ``label`` columns. All of its
            columns are copied to the front of every statistics row.
        global_scope: Optional dict (e.g. ``globals()``); when given, the
            function's local variables are copied into it before returning.
        verbose: When True, stop after the first three labelled rows (quick trial run).
        channel: Unused.
        **kwargs: Unused.

    Returns:
        A DataFrame with the rows of all modalities, indexed by
        ``(modality, row)``. An empty DataFrame is returned when ``modality``
        has no entries.
    """
    # to_excel does not create missing folders.
    os.makedirs(f"{work_dir}/statistics", exist_ok=True)

    # Reversed on purpose: each column is moved to the front in turn, so the
    # manifest columns end up leading every row in their original order.
    data_column_names = list(df_data.keys())[::-1]

    # Keep only rows that really have a label path. A plain label != "" test is
    # not enough: missing labels loaded from Excel are NaN, and NaN != "" is True.
    has_label = df_data['label'].map(lambda value: isinstance(value, str) and value != "")
    df_label = df_data[has_label]

    # One independent row list per channel.
    rows_by_channel = {i: [] for i in modality}

    for idx in tqdm(range(len(df_label)), "Label"):
        # Index the filtered frame, so idx always refers to a labelled row.
        row_info = df_label.iloc[idx]

        image_file = os.path.abspath(os.path.join(data_root, row_info['image']))
        label_file = os.path.abspath(os.path.join(data_root, row_info['label']))

        # Read both files once and reuse them for every channel.
        image_sitk = sitk.ReadImage(image_file, sitk.sitkFloat32)
        label_sitk = sitk.ReadImage(label_file, sitk.sitkUInt8)

        for i, stat_info in rows_by_channel.items():
            # The helper's result must support move_to_end (e.g. an OrderedDict).
            shape_info = sitk_label_intensity_info(image_sitk[:,:,:,int(i)], label_sitk)
            for v in data_column_names:
                shape_info[v] = row_info[v]
                shape_info.move_to_end(v, last=False)
            stat_info.append(shape_info)

        if verbose == True and idx >= 2:
            break

    stats_by_modality = {}
    for i, stat_info in rows_by_channel.items():
        df_stat = pd.DataFrame(stat_info)
        file_path = f"{work_dir}/statistics/{modality[i]}_label_stat.xlsx"
        df_stat.to_excel(file_path, index = False)
        stats_by_modality[modality[i]] = df_stat

    # NOTE(review): kept for compatibility -- this copies every local (df_stat,
    # file_path, idx, ...) into the caller's namespace and overwrites same-named
    # notebook variables.
    if global_scope is not None: global_scope.update(**locals())

    if not stats_by_modality:
        return pd.DataFrame()
    return pd.concat(stats_by_modality, names=["modality", "row"])


# Assigned so the returned table is kept instead of being dumped as cell output.
df_label_stat = label_intensity_summary(data_dir, df_data, globals(), verbose = False)

Label:   0%|          | 0/750 [00:00<?, ?it/s]

Label:   0%|          | 0/750 [00:00<?, ?it/s]

Label:   0%|          | 0/750 [00:00<?, ?it/s]

Label:   0%|          | 0/750 [00:00<?, ?it/s]