#### In this kernel, I perform some basic EDA. The idea is to understand the **statistics of the training data** to help us choose the best hyperparameters for our models.

# **Getting Started**
---
* Load libraries and modules
* Peek at train data

In [None]:
import glob
import os
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
import seaborn as sns
from tqdm import tqdm

In [None]:
# Read the training labels CSV and show the table together with its row count.
labels_csv = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv"
train_df = pd.read_csv(labels_csv)

print("Overview of data:\n")
display(train_df)
print(f"Total number of train samples: {len(train_df)}")

In [None]:
# Bar chart of how many training samples carry each MGMT_value.
n_samples = train_df.shape[0]

plt.figure(figsize = (8, 5))
plt.title("MGMT_value (Target) Distribution", fontsize = 16)

ax = sns.countplot(data = train_df, x = "MGMT_value", order = [0, 1])
ax.set_xlabel(xlabel = "MGMT_value", fontsize = 12)
ax.set_ylabel(ylabel = "Count", fontsize = 12)

# Write the count and its share of all samples above every bar.
for bar in ax.patches:
    corners = bar.get_bbox().get_points()
    bar_mid = corners[:, 0].mean()
    bar_top = corners[1, 1]
    label = f"{int(bar_top):,} ({100*bar_top/n_samples:.1f}%)"
    ax.annotate(label, (bar_mid, bar_top), ha = "center", va = "bottom")

# **Images**
---
* See some sample images
* Look at some **key statistics** for our train images


In [None]:
def load_dicom(path):
    """
    Input: path to a dicom file
    Output: Image in numpy format (uint8)
    Image is min-max normalised and scaled to 0..255;
    an image whose pixels are all equal comes back as all zeros
    """
    # dcmread is the supported reader; read_file is a deprecated alias
    # that newer pydicom releases no longer provide.
    dicom = pydicom.dcmread(path)
    # Work in float64 so subtracting the minimum cannot wrap around
    # for signed integer pixel types.
    data = dicom.pixel_array.astype(np.float64)
    data = data - np.min(data)
    peak = np.max(data)
    if peak != 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data


def visualize_sample(brats21id, slice_i, mgmt_value, types = ("FLAIR", "T1w", "T1wCE", "T2w")):
    """
    For a given id in train data, plots one slice
    of each of the given scan types side by side

    brats21id:  patient id; zero-padded to 5 characters to build the folder name
    slice_i:    relative position of the slice inside the sorted series
                (0 picks the first file, 0.5 the middle one)
    mgmt_value: value shown in the figure title
    types:      scan type sub-folders to plot, one panel per entry
    """
    types = tuple(types)
    plt.figure(figsize = (16, 5))
    patient_path = os.path.join(
        "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/", 
        str(brats21id).zfill(5),
    )
    
    for i, t in enumerate(types, 1):
        # Order the files by the number that follows the last "-" in the
        # file name (the 4-character extension is stripped first).
        t_paths = sorted(glob.glob(os.path.join(patient_path, t, "*")),
                         key = lambda x: int(x[:-4].split("-")[-1]))
        # Clamp so that slice_i = 1.0 selects the last file instead of
        # running past the end of the list.
        slice_idx = min(int(len(t_paths) * slice_i), len(t_paths) - 1)
        data = load_dicom(t_paths[slice_idx])
        
        # One column per requested type rather than a fixed four.
        plt.subplot(1, len(types), i)
        plt.imshow(data, cmap = "gray")
        plt.title(f"{t}", fontsize = 16)
        plt.axis("off")
    # Use the function's own argument; the title must not depend on a
    # module-level variable set by the caller.
    plt.suptitle(f"id: {brats21id}  MGMT_value: {mgmt_value}", fontsize = 16)
    plt.show()

In [None]:
# Plot two randomly picked rows of train_df, using slice_i = 0.5 for each.
for i in random.sample(range(train_df.shape[0]), 2):
    row = train_df.iloc[i]
    _brats21id, _mgmt_value = row["BraTS21ID"], row["MGMT_value"]
    visualize_sample(brats21id = _brats21id, mgmt_value = _mgmt_value, slice_i = 0.5)

In [None]:
# Every path three levels below the train/ folder.
train_pattern = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/*/*/*"
FILES = glob.glob(train_pattern)
print(f"Total number of Image files: {len(FILES):,}")

In [None]:
# Group the image paths by MRI type, read from the name of each file's parent folder.
files_dict = {"FLAIR": [], "T1w": [], "T1wCE": [], "T2w": []}
for filename in FILES:
    scan = os.path.basename(os.path.dirname(filename))
    # Only the four known types are collected, so a file in an unexpected
    # folder cannot inflate the "T2w" count.
    if scan in files_dict:
        files_dict[scan].append(filename)

keys = list(files_dict)
vals = [len(paths) for paths in files_dict.values()]

plt.figure(figsize = (12, 5))
plt.title("Total files vs MRI type", fontsize = 16)

ax = sns.barplot(x = keys, y = vals)
ax.set_xlabel(xlabel = "Type", fontsize = 12)
ax.set_ylabel(ylabel = "Total Count", fontsize = 12)

# Percentages are relative to all globbed files; the max() guard keeps an
# empty FILES list from dividing by zero.
total_files = max(len(FILES), 1)
for p in ax.patches:
    x = p.get_bbox().get_points()[:, 0]
    y = int(p.get_bbox().get_points()[1, 1])
    ax.annotate(f"{y:,} ({100*y/total_files:.1f}%)", (x.mean(), y),
                ha = "center", va = "bottom")

In [None]:
def plot_dim_freq(type_name):
    """
    type_name = ["FLAIR", "T1w", "T2w", "T1wCE"]
    Plots frequency distribution of dimensions
    of images for the given "type_name"
    Only the 10 most frequent dimensions are drawn;
    percentages are relative to all files of that type
    """
    def image_size(path):
        # dcmread is the supported reader; read_file is a deprecated alias
        # that newer pydicom releases no longer provide.
        dicom = pydicom.dcmread(path)
        return dicom.pixel_array.shape
    
    type_files = files_dict[type_name]
    
    # Count images per "<dims[0]>x<dims[1]>" key.
    size_counts = Counter()
    for fpath in tqdm(type_files):
        dims = image_size(fpath)
        size_counts[f"{dims[0]}x{dims[1]}"] += 1
    
    # most_common() orders by descending count; keep the first 10 entries.
    top_sizes = size_counts.most_common()[:10]
    keys = [size for size, _ in top_sizes]
    vals = [count for _, count in top_sizes]
    
    plt.figure(figsize = (24, 5))
    plt.title(f"Image dimension Distribution for \"{type_name}\" (top 10)", fontsize = 16)
    
    ax = sns.barplot(x = keys, y = vals, order = keys)
    ax.set_xlabel(xlabel = "Dimensions", fontsize = 12)
    ax.set_ylabel(ylabel = "Count", fontsize = 12)
    
    # Write the count and its share of this type's files above every bar.
    for p in ax.patches:
        x = p.get_bbox().get_points()[:, 0]
        y = int(p.get_bbox().get_points()[1, 1])
        ax.annotate(f"{y:,} ({100*y/len(type_files):.1f}%)", (x.mean(), y),
                    ha = "center", va = "bottom", fontsize = 8)
    plt.show()

In [None]:
# Image dimension distribution for the "FLAIR" files.
plot_dim_freq("FLAIR")

In [None]:
# Image dimension distribution for the "T2w" files.
plot_dim_freq("T2w")

In [None]:
# Image dimension distribution for the "T1w" files.
plot_dim_freq("T1w")

In [None]:
# Image dimension distribution for the "T1wCE" files.
plot_dim_freq("T1wCE")

In [None]:
def plot_num_files(type_name):
    """
    type_name = ["FLAIR", "T1w", "T2w", "T1wCE"]
    Plots frequency distribution of number of files
    for a patient for the given "type_name"
    Only the 10 most frequent file counts are drawn; afterwards
    summary statistics of the per-patient file counts are printed
    """
    train_root = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/"
    n_patients = train_df.shape[0]
    
    # Number of files found under <train_root>/<zero-padded id>/<type_name>/
    # for every id in train_df. The id column is read by name; positional
    # access on a label-indexed row is deprecated in recent pandas.
    all_freqs = [
        len(glob.glob(os.path.join(train_root, str(brats21id).zfill(5), type_name, "*")))
        for brats21id in train_df["BraTS21ID"]
    ]
    
    # (files per patient, number of patients) pairs, most frequent first.
    ranked = Counter(all_freqs).most_common()
    keys = [num_files for num_files, _ in ranked]
    vals = [num_patients for _, num_patients in ranked]
    
    plt.figure(figsize = (24, 5))
    plt.title(f"Number of images for type \"{type_name}\" per Patient distribution (top 10)", fontsize = 16)
    
    ax = sns.barplot(x = keys[:10], y = vals[:10], order = keys[:10])
    ax.set_xlabel(xlabel = "Number of files", fontsize = 12)
    ax.set_ylabel(ylabel = "count", fontsize = 12)
    
    # Write the patient count and its share of all patients above every bar.
    for p in ax.patches:
        x = p.get_bbox().get_points()[:, 0]
        y = int(p.get_bbox().get_points()[1, 1])
        ax.annotate(f"{y:,} ({100*y/n_patients:.2f}%)", (x.mean(), y),
                    ha = "center", va = "bottom", fontsize = 10)
    plt.show()
    
    # Both the count and the percentage refer to the 10 plotted buckets only.
    top10_total = sum(vals[:10])
    print(f"Total ids in top 10: {top10_total} ({100.0*top10_total/n_patients:.2f}%)")
    print(f"Minimum and Maximum number of files for a patient is {min(keys)} and {max(keys)}, respectively.")
    print(f"Mean and Median of number of files is: {np.mean(all_freqs):.2f} and {np.median(all_freqs):.0f}, respectively.")
    # The reported value is the square root of the variance, so it is labelled
    # as the standard deviation.
    print(f"Standard deviation of number of files is {np.std(all_freqs):.2f}")

In [None]:
# Files-per-patient distribution for the "FLAIR" type.
plot_num_files("FLAIR")

In [None]:
# Files-per-patient distribution for the "T2w" type.
plot_num_files("T2w")

In [None]:
# Files-per-patient distribution for the "T1w" type.
plot_num_files("T1w")

In [None]:
# Files-per-patient distribution for the "T1wCE" type.
plot_num_files("T1wCE")

# **Final Thoughts**
* **FLAIR and T2w have very similar image data statistics**
* **T1w and T1wCE have very similar image data statistics**
* The number of images per patient for each type differs significantly
* Images have large blank areas on all sides


# **.... Work in Progress**