# **RSNA-MICCAI Brain Tumor Radiogenomic Classification**
# DATA EXPLORATION

In [None]:
!conda install gdcm -c conda-forge -y

In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import matplotlib
import pydicom as dicom
import cv2
import ast
import warnings
from collections import Counter
import seaborn as sns
import pydicom
warnings.filterwarnings('ignore')

## **1. DATA PROCESSING**

First of all, we define the path variables and list the files and folders of the dataset:

In [None]:
# Root folder of the competition dataset.
path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/'
# Folders of the two splits, directly under the dataset root.
train_path = os.path.join(path, 'train')
test_path = os.path.join(path, 'test')

# Names of the sub-folders that the get*Path helpers append to a patient folder.
flair_dir = 'FLAIR'
t1w_dir = 'T1w'
t1wce_dir = 'T1wCE'
t2w_dir = 'T2w'

# Last expression of the cell: displays the entries found at the dataset root.
os.listdir(path)

It's time to import data :

In [None]:
# Read the training labels, stored next to the split folders at the dataset root.
labels_csv = os.path.join(path, 'train_labels.csv')
ldf = pd.read_csv(labels_csv)
# Preview the first rows (head() shows 5 by default).
ldf.head()

Let's display some stats :

In [None]:
# Number of rows for each MGMT_value class (non-null count of every other column).
ldf.groupby('MGMT_value').count()

We can see that the data set is relatively well balanced.

Let's define some functions to retrieve the full path from an Id.<br>
We take this opportunity to check that the subdirectories exist for every Id:

In [None]:
def getFullId(id):
    """Return the Id as a string left-padded with zeros to at least 5 characters."""
    return str(id).zfill(5)

def _getSequencePath(id, sequence_dir):
    """Return the `sequence_dir` sub-folder of a training patient, or False if it is absent.

    The path is <train_path>/<zero-padded id>/<sequence_dir>. False (rather
    than None or an exception) is kept as the "missing" marker because the
    notebook counts missing folders with `.count(False)`.
    """
    sequence_path = os.path.join(train_path, getFullId(id), sequence_dir)
    return sequence_path if os.path.isdir(sequence_path) else False

def getFlairPath(id):
    """Return the FLAIR folder of training patient `id`, or False if it does not exist."""
    return _getSequencePath(id, flair_dir)

def getT1wPath(id):
    """Return the T1w folder of training patient `id`, or False if it does not exist."""
    return _getSequencePath(id, t1w_dir)

def getT1wcePath(id):
    """Return the T1wCE folder of training patient `id`, or False if it does not exist."""
    return _getSequencePath(id, t1wce_dir)

def getT2wPath(id):
    """Return the T2w folder of training patient `id`, or False if it does not exist."""
    return _getSequencePath(id, t2w_dir)

# For every sequence type, resolve the folder of each patient and count how
# many lookups came back as False, i.e. how many folders are missing.
for message, resolve_path in (
    ('Missing FLAIR directories number = ', getFlairPath),
    ('Missing T1w directories number = ', getT1wPath),
    ('Missing T1wCE directories number = ', getT1wcePath),
    ('Missing T2w directories number = ', getT2wPath),
):
    resolved = ldf['BraTS21ID'].apply(resolve_path).tolist()
    print(message, resolved.count(False))

Everything seems to be ok, let's count files for each subdirectory :

In [None]:
def _countFiles(dir_path):
    """Return the number of entries in `dir_path`, or 0 when the folder is missing.

    The get*Path helpers return False for a missing folder. Passing that
    value to os.listdir would make it interpret False as file descriptor 0
    instead of reporting the missing folder, so it is handled explicitly.
    """
    if not dir_path:
        return 0
    return len(os.listdir(dir_path))

def countFlairFiles(id):
    """Return the number of files in the FLAIR folder of patient `id` (0 if missing)."""
    return _countFiles(getFlairPath(id))

def countT1wFiles(id):
    """Return the number of files in the T1w folder of patient `id` (0 if missing)."""
    return _countFiles(getT1wPath(id))

def countT1wceFiles(id):
    """Return the number of files in the T1wCE folder of patient `id` (0 if missing)."""
    return _countFiles(getT1wcePath(id))

def countT2wFiles(id):
    """Return the number of files in the T2w folder of patient `id` (0 if missing)."""
    return _countFiles(getT2wPath(id))

# Add one column per sequence type holding the number of files found for each patient.
for column_name, count_files in (
    ('FLAIR', countFlairFiles),
    ('T1w', countT1wFiles),
    ('T1wCE', countT1wceFiles),
    ('T2w', countT2wFiles),
):
    ldf[column_name] = ldf['BraTS21ID'].apply(count_files)

In [None]:
# Long format: "variable" holds the sequence name, "value" the file count.
file_counts_long = ldf[['FLAIR', 'T1w', 'T1wCE', 'T2w']].melt()
sns.boxplot(data=file_counts_long, x="variable", y="value")
plt.title('Number of images files by structural multi-parametric MRI')
plt.show()

It seems that the number of images varies considerably from one Id to another; this is perhaps where we will run into the problems inherent to unbalanced datasets.

## **2. IMAGES / VIDEO**

We first create a function that converts a DICOM file into a 3-channel image:

In [None]:
def get3ScaledImage(path):
    """Read a DICOM file and return its pixels as an 8-bit, 3-channel image.

    Parameters
    ----------
    path : str or path-like
        Location of the DICOM file.

    Returns
    -------
    img_2d_scaled : numpy.ndarray
        uint8 array of shape (rows, cols, 3). Negative values are clipped to
        0, the result is scaled so that the largest value maps to 255, and
        the grey plane is repeated on the three channels.
    size : tuple
        (cols, rows), i.e. (width, height), which is the order used for the
        frame size of the video writer.

    Raises
    ------
    ValueError
        If the pixel data is not a single 2-D plane.
    """
    dataset = pydicom.dcmread(path)
    img = dataset.pixel_array

    if img.ndim != 2:
        raise ValueError(
            'Expected a single 2-D slice, got pixel data of shape ' + str(img.shape)
        )
    r, c = img.shape

    ## Step 1. Convert to float to avoid overflow or underflow losses,
    ## and clip negative values to 0.
    img_2d = np.maximum(img.astype(float), 0)

    ## Step 2. Rescaling grey scale between 0-255.
    ## A slice without any positive value is left all black instead of
    ## being divided by zero.
    peak = img_2d.max()
    if peak > 0:
        img_2d = (img_2d / peak) * 255.0

    ## Step 3. Convert to uint and replicate the plane on 3 channels.
    ## Stacking on the last axis yields (rows, cols, 3) for any slice shape,
    ## square or not.
    gray = np.uint8(img_2d)
    img_2d_scaled = np.stack((gray, gray, gray), axis=-1)

    return img_2d_scaled, (c, r)



Then we generate a video clip by reading the files in sequence.
Here is an example for the first Id (=0) and the FLAIR structure.

The generated video is saved as an mp4 file in the working directory. Feel free to download it to visualize the sequence in action.

In [None]:
# Patient (BraTS21ID) whose FLAIR series is turned into a video.
# A dedicated name is used so that the builtin `id` is not shadowed.
patient_id = 0

# The series folder gets its own variable: `path` holds the dataset root and
# is still needed when earlier cells are re-run.
flair_series_path = getFlairPath(patient_id)
if not flair_series_path:
    raise FileNotFoundError('No FLAIR directory for Id ' + getFullId(patient_id))

# Use the 'Image-<n>.dcm' files that actually exist, ordered by <n>, instead
# of assuming that the numbering runs from 1 to N without gaps.
prefix, suffix = 'Image-', '.dcm'
slice_files = sorted(
    (
        file_name
        for file_name in os.listdir(flair_series_path)
        if file_name.startswith(prefix)
        and file_name.endswith(suffix)
        and file_name[len(prefix):-len(suffix)].isdigit()
    ),
    key=lambda file_name: int(file_name[len(prefix):-len(suffix)]),
)

frames = []
size = None
for file_name in slice_files:
    img_path = os.path.join(flair_series_path, file_name)
    img_2d_scaled, size = get3ScaledImage(img_path)
    frames.append(img_2d_scaled)

if not frames:
    raise FileNotFoundError('No Image-<n>.dcm file found in ' + flair_series_path)

video_file = '/kaggle/working/video.mp4'
video_fps = 15
# 'mp4v' codec; same value as the literal 0x7634706d.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')

# `size` is the (width, height) of the last frame read.
out = cv2.VideoWriter(video_file, fourcc, video_fps, size)
try:
    if not out.isOpened():
        # print() rather than warnings.warn(): warnings are silenced in this notebook.
        print('Could not open ' + video_file + ' for writing, the video is not saved.')
    for frame in frames:
        out.write(frame)
finally:
    # Always release the writer so the file is finalised even if a write fails.
    out.release()



You can also visualize the sequence as below :

In [None]:
from matplotlib import animation, rc

# Display animations as interactive JavaScript/HTML players in the notebook.
rc('animation', html='jshtml')

def create_animation(ims, fps=24):
    """Build a matplotlib animation showing the images of `ims` one after another.

    Parameters
    ----------
    ims : sequence of images
        Frames to display; must contain at least one image (the first one
        initialises the plot).
    fps : int, optional
        Frames per second; the delay between frames is 1000 // fps
        milliseconds. Must be non-zero. Defaults to 24.

    Returns
    -------
    matplotlib.animation.FuncAnimation
        Animation with one frame per image of `ims`.
    """
    fig = plt.figure(figsize=(9, 9))
    im = plt.imshow(ims[0])

    def animate_func(i):
        # Swap the displayed pixels for those of frame i.
        im.set_array(ims[i])
        return [im]

    anim = animation.FuncAnimation(fig, animate_func, frames=len(ims), interval=1000 // fps)

    # Close the figure so the notebook does not also render it as a static
    # image below the animation; the animation keeps its own reference to it.
    plt.close(fig)

    return anim

# Animate the frames loaded for the video; leaving `video` as the last
# expression of the cell makes the notebook display it.
video = create_animation(frames)
video

Let's plot some slices:

In [None]:
def plot_slices(num_rows, num_columns, size, data):
    """Show the images of `data` on a grid of num_rows x num_columns, without margins.

    Parameters
    ----------
    num_rows, num_columns : int
        Shape of the grid.
    size : tuple
        (width, height) of one image, in the order returned by
        get3ScaledImage; only the ratio is used, to size the figure.
    data : sequence of images
        Images placed row by row. Images beyond the grid capacity are
        ignored and cells without an image are left blank.
    """
    width, height = size[0], size[1]

    fig_width = 20
    # Each cell is (fig_width / num_columns) wide; its height follows the
    # height / width ratio of the images so that they fill the cell.
    fig_height = (fig_width / num_columns) * (height / width) * num_rows

    # squeeze=False keeps a 2-D array of axes even for a single row or column.
    fig, ax = plt.subplots(
        nrows=num_rows, ncols=num_columns, figsize=(fig_width, fig_height), squeeze=False
    )
    # ravel() walks the grid row by row, i.e. cell (i, j) gets data[i*num_columns+j].
    for index, cell in enumerate(ax.ravel()):
        if index < len(data):
            cell.imshow(data[index], cmap="gray")
        cell.axis("off")
    plt.subplots_adjust(wspace=0, hspace=0, left=0, right=1, bottom=0, top=1)
    plt.show()

# Patient (BraTS21ID) whose FLAIR slices are plotted.
# A dedicated name is used so that the builtin `id` is not shadowed.
patient_id = 5

path_flair = getFlairPath(patient_id)
if not path_flair:
    raise FileNotFoundError('No FLAIR directory for Id ' + getFullId(patient_id))

# Use the 'Image-<n>.dcm' files that actually exist, ordered by <n>, instead
# of assuming that the numbering runs from 1 to N without gaps.
prefix, suffix = 'Image-', '.dcm'
flair_files = sorted(
    (
        file_name
        for file_name in os.listdir(path_flair)
        if file_name.startswith(prefix)
        and file_name.endswith(suffix)
        and file_name[len(prefix):-len(suffix)].isdigit()
    ),
    key=lambda file_name: int(file_name[len(prefix):-len(suffix)]),
)

frames_flair = []
size = None
for file_name in flair_files:
    img_path = os.path.join(path_flair, file_name)
    img_2d_scaled, size = get3ScaledImage(img_path)
    frames_flair.append(img_2d_scaled)

if not frames_flair:
    raise FileNotFoundError('No Image-<n>.dcm file found in ' + path_flair)

# Plot the slices of this patient (`frames_flair`), not the `frames` list
# that was built for the video of another Id. The window holds exactly one
# image per grid cell.
num_rows, num_columns = 5, 6
first_slice = 90
plot_slices(
    num_rows,
    num_columns,
    size,
    frames_flair[first_slice:first_slice + num_rows * num_columns],
)

Feel free to test this notebook, and upvote ...