In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import cv2
import seaborn as sns
import glob
import random

In [None]:
# Install GDCM from a package bundled as a Kaggle input, without network access (--offline).
# NOTE(review): presumably needed so pydicom can decode compressed pixel data — confirm.
!cp /kaggle/input/gdcm-conda-install/gdcm.tar .
# NOTE(review): -z asks tar to gunzip although the file is named .tar — confirm the archive is gzip-compressed.
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

In [None]:
# Root folder of the RSNA-MICCAI input data, relative to the notebook's working directory.
data_input_path = "../input/rsna-miccai-brain-tumor-radiogenomic-classification"

In [None]:
# Label table; later cells read its BraTS21ID and MGMT_value columns.
df_train = pd.read_csv(os.path.join(data_input_path, "train_labels.csv"))

# EDA

In [None]:
# Column dtypes and non-null counts of the label table.
df_train.info()

In [None]:
# Compare the number of rows with the number of distinct ids to spot duplicated patients.
print(f"The total patient ids are {df_train['BraTS21ID'].count()}, from those the unique ids are {df_train['BraTS21ID'].nunique()} ")

## Data Distribution

In [None]:
sns.set()

# One bar per MGMT_value class, showing how many rows carry that label.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=df_train, x='MGMT_value', color='#169DE3', ax=ax)

ax.set_title('Categories Distribution'.title(), size=22, color='#169DE3')
ax.set_xlabel('MGMT_value', size=17, color='#169DE3')
ax.set_ylabel('Count', size=17, color='#169DE3')

plt.show()

## Data Visualization

In [None]:
import pydicom

### Images

In [None]:
# Adapted from: https://www.kaggle.com/ihelon/brain-tumor-eda-with-animations-and-modeling
def load_dicom(path):
    """Read one DICOM file and return its pixels as an 8-bit grayscale array.

    The pixel values are min-max scaled to the 0-255 range. An image whose
    pixels are all equal comes back as all zeros.

    Args:
        path: Location of the DICOM file; passed straight to pydicom.

    Returns:
        np.ndarray of dtype uint8 with the same shape as the stored pixel array.
    """
    # dcmread is pydicom's current reader; read_file is its legacy alias.
    dicom = pydicom.dcmread(path)
    # Work in float64 so that subtracting the minimum cannot wrap around when
    # the stored pixels use a signed or narrow integer type (e.g. an int16
    # image spanning the full int16 range).
    data = dicom.pixel_array.astype(np.float64)
    data -= data.min()
    peak = data.max()
    # A constant image has peak 0 after the shift; skip the division.
    if peak != 0:
        data /= peak
    return (data * 255).astype(np.uint8)

def visualize_sample(
    brats21id,
    slice_i,
    mgmt_value,
    types=("FLAIR", "T1w", "T1wCE", "T2w")
):
    """Plot one slice per scan type for a single training patient.

    Args:
        brats21id: Patient id; zero-padded to 5 digits to form the folder name.
        slice_i: Relative position of the slice within each series, as a
            fraction of the series length (0.5 picks the middle slice).
            Values that would point past the last slice use the last slice.
        mgmt_value: Label shown in the figure title.
        types: Sub-folder names to show, one panel each, left to right.

    A scan type with no files is drawn as an empty, titled panel instead of
    raising an IndexError.
    """
    plt.figure(figsize=(16, 5))
    patient_path = os.path.join(
        data_input_path, "train", str(brats21id).zfill(5)
    )
    for i, t in enumerate(types, 1):
        # Order files by the number after the last "-" in the name, taken after
        # dropping a 4-character extension, so slices are in numeric order.
        t_paths = sorted(
            glob.glob(os.path.join(patient_path, t, "*")),
            key=lambda x: int(x[:-4].split("-")[-1]),
        )
        # Size the grid from the requested types rather than assuming four.
        plt.subplot(1, len(types), i)
        if t_paths:
            # Cap at the last index so slice_i == 1.0 stays in range.
            slice_index = min(int(len(t_paths) * slice_i), len(t_paths) - 1)
            data = load_dicom(t_paths[slice_index])
            plt.imshow(data, cmap="gray")
        plt.title(f"{t}", fontsize=16)
        plt.axis("off")

    plt.suptitle(f"MGMT_value: {mgmt_value}", fontsize=16)
    plt.show()

In [None]:
# Show the middle slice of every scan type for five randomly chosen rows of the label table.
for row_idx in random.sample(range(len(df_train)), 5):
    row = df_train.iloc[row_idx]
    visualize_sample(
        brats21id=row["BraTS21ID"],
        mgmt_value=row["MGMT_value"],
        slice_i=0.5,
    )

### Animations

In [None]:
# Adapted from: https://www.kaggle.com/ihelon/brain-tumor-eda-with-animations-and-modeling
from matplotlib import animation, rc
# Render animations as interactive JavaScript/HTML players in the notebook output.
rc('animation', html='jshtml')


def create_animation(ims):
    """Build an animation that steps through a sequence of images.

    Args:
        ims: Sized, indexable sequence of images; the first one seeds the plot.

    Returns:
        matplotlib FuncAnimation with one frame per image, spaced 1000 // 24
        milliseconds apart.
    """
    fig = plt.figure(figsize=(6, 6))
    plt.axis('off')
    shown = plt.imshow(ims[0], cmap="gray")

    def animate_func(frame_idx):
        # Swap the pixel data of the existing artist instead of redrawing it.
        shown.set_array(ims[frame_idx])
        return [shown]

    frame_delay_ms = 1000 // 24
    return animation.FuncAnimation(
        fig, animate_func, frames=len(ims), interval=frame_delay_ms
    )

def load_dicom_line(path):
    """Load every non-blank image found in one folder, in numeric file order.

    Args:
        path: Folder whose entries are each handed to load_dicom.

    Returns:
        List of arrays as returned by load_dicom, skipping any whose maximum
        is 0.
    """
    def slice_number(filename):
        # Number after the last "-" once the final 4 characters are dropped;
        # assumes names like "<prefix>-<number>.dcm".
        return int(filename[:-4].split("-")[-1])

    ordered = glob.glob(os.path.join(path, "*"))
    ordered.sort(key=slice_number)

    loaded = (load_dicom(filename) for filename in ordered)
    # Keep only images that contain at least one non-zero pixel.
    return [frame for frame in loaded if frame.max() != 0]

In [None]:
# Animate the FLAIR series stored under train/00000.
images = load_dicom_line(os.path.join(data_input_path,"train/00000/FLAIR"))
create_animation(images)

In [None]:
# Animate the T1w series stored under train/00000.
images = load_dicom_line(os.path.join(data_input_path,"train/00000/T1w"))
create_animation(images)

In [None]:
# Animate the T1wCE series stored under train/00000.
images = load_dicom_line(os.path.join(data_input_path,"train/00000/T1wCE"))
create_animation(images)

In [None]:
# Animate the T2w series stored under train/00000.
images = load_dicom_line(os.path.join(data_input_path,"train/00000/T2w"))
create_animation(images)

# Convert the training DICOM data to PNG

In [None]:
from PIL import Image

In [None]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn an array into a PIL image scaled to a size x size box.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel data accepted by Image.fromarray.
        size: Edge length, in pixels, of the square target box.
        keep_ratio: When true, use Image.thumbnail on the box instead of
            resizing to exactly (size, size).
        resample: Resampling filter passed to PIL.

    Returns:
        The resulting PIL image.
    """
    target = (size, size)
    im = Image.fromarray(array)

    if not keep_ratio:
        return im.resize(target, resample)

    # thumbnail() modifies the image in place and returns None.
    im.thumbnail(target, resample)
    return im

In [None]:
def save_dcm_as_png(source, dest, mode = "train", size = 512):
    """Convert one DICOM file into a resized image file.

    Args:
        source: Path of the DICOM file to read.
        dest: Path the converted image is saved to.
        mode: Accepted for the caller's convenience; not used by this function.
        size: Edge length, in pixels, of the square output image.
    """
    pixels = load_dicom(source)
    resize(pixels, size).save(dest)

In [None]:
def convert_to_png(mode = "train", size = 512):
    """Convert every DICOM file of one dataset split into a PNG file.

    Input layout:  <data_input_path>/<mode>/<patient>/<scan type>/<file>
    Output layout: <mode>/<scan type>/<patient>/<file stem>.png, relative to
    the current working directory.

    Args:
        mode: Name of the split folder under data_input_path.
        size: Edge length, in pixels, of the square output images. It is
            forwarded to save_dcm_as_png, so a non-default value takes effect.
    """
    root_path = os.path.join(data_input_path, mode)

    for patient_folder in os.listdir(root_path):
        patient_folder_path = os.path.join(root_path, patient_folder)

        for imagery_type in os.listdir(patient_folder_path):
            imagery_type_path = os.path.join(patient_folder_path, imagery_type)
            output_path = "{}/{}/{}".format(mode, imagery_type, patient_folder)
            os.makedirs(output_path, exist_ok=True)

            for image in os.listdir(imagery_type_path):
                # Swap the extension, whatever its length, for ".png".
                stem, _ = os.path.splitext(image)
                save_dcm_as_png(
                    os.path.join(imagery_type_path, image),
                    os.path.join(output_path, stem + ".png"),
                    mode=mode,
                    size=size,
                )

In [None]:
# Convert the "train" split using the default arguments (mode="train", size=512).
convert_to_png()

In [None]:
#check
mode = "train"
root_path = os.path.join(data_input_path, mode)

for patient_folder in os.listdir(root_path):
    patient_folder_path = os.path.join(root_path, patient_folder)

    for imagery_type in os.listdir(patient_folder_path):
        imagery_type_path = os.path.join(patient_folder_path, imagery_type)
        output_path = "{}/{}/{}".format(mode, imagery_type, patient_folder)

        assert len(os.listdir(imagery_type_path)) == len(os.listdir(output_path))

In [None]:
# Copy the label file into the working directory so it can be zipped together with the images.
!cp ../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv ./

In [None]:
# Bundle the converted train folder and the label file into one archive.
!zip -r dataset_btrc.zip train train_labels.csv

<a href="dataset_btrc.zip"> Download File </a>