In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

# Combining to a Single DataFrame

Okay Yeah!    
Let me be pretty honest about this : I want **ONE** DataFrame.  
Just One Good Big DF that has everything in it, so I don't have to fumble across all the dfs to look things up again.  
So let's just get on with it!

In [None]:
# Read sample_submission.csv from the competition input folder and preview its first rows.
sub_pth = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv"
df_sub = pd.read_csv(sub_pth)
df_sub.head()

### Get Paths : Directory Structure

```
Test/Train Directory
    Study_ID_Number - 00001, ....
        FLAIR
        T1w
        T1wCE
        T2w
            Image-{Img_ID}.dcm
```

In [None]:
# Folder names of the two splits and of the per-study sub-folders (see the directory tree above).
PARENT_DIRS = ["test","train"]
CHILD_DIRS = ["FLAIR", "T1w","T1wCE","T2w"]

# Parallel accumulators: each receives one entry per image file found on disk,
# and each becomes one column of the combined DataFrame.
Split_Types = []
Study_IDs = []
Tumour_Types = []
Image_IDs = []
Absolute_Paths = []
# File-name suffix used to pick out the image files while walking the dataset.
IMG_FORMAT = ".dcm"

Each file's absolute path ends with the following components (here `Tumour_Type` is the scan-type folder: `FLAIR`, `T1w`, `T1wCE` or `T2w`):
```
Split_Type/Study_ID/Tumour_Type/Image_ID.dcm
```

In [None]:
def splitall(path):
    """Break a path into the list of all of its components.

    Recipe adapted from the Python Cookbook:
    https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s16.html

    Args:
        path: a relative or absolute path string.

    Returns:
        The components from left to right. For an absolute path the first
        element is the root; a trailing separator yields a final "" element.
    """
    # Components are collected right-to-left and flipped once at the end.
    pieces = []
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # Nothing left to strip: we reached the root of an absolute path.
            pieces.append(head)
            break
        if tail == path:
            # Nothing left to strip: we reached the first part of a relative path.
            pieces.append(tail)
            break
        pieces.append(tail)
        path = head
    pieces.reverse()
    return pieces
splitall("Split_Type/Study_ID/Tumour_Type/Image_ID.dcm")

In [None]:
file_count = 400*1000 # from kaggle files counter
DATA_FOLDER = '../input/rsna-miccai-brain-tumor-radiogenomic-classification'

# The accumulators are created in an earlier cell. Empty them first so that
# re-running this cell does not append every file a second time.
for accumulator in (Split_Types, Study_IDs, Tumour_Types, Image_IDs, Absolute_Paths):
    accumulator.clear()

with tqdm(total=file_count) as pbar:
    for path, directories, files in os.walk(DATA_FOLDER):
        for file in files:
            # Only image files are recorded; CSVs and anything else are skipped.
            if not file.endswith(IMG_FORMAT):
                continue
            pbar.update(1)
            abs_path = os.path.join(path, file)
            Image_ID = os.path.basename(abs_path)
            # The path ends in Split_Type/Study_ID/Tumour_Type/Image_ID.dcm, so the
            # three components before the file name are read from the end.
            Splitted_Path = splitall(abs_path)
            Tumour_Type = Splitted_Path[-2]
            Study_ID = Splitted_Path[-3]
            Split_Type = Splitted_Path[-4]

            Split_Types.append(Split_Type)
            Study_IDs.append(Study_ID)
            Tumour_Types.append(Tumour_Type)
            Image_IDs.append(Image_ID)
            Absolute_Paths.append(abs_path)

In [None]:
# Assemble the per-file lists into a single DataFrame, one column per list.
ext_columns = {
    "Split_Type": Split_Types,
    "Study_ID": Study_IDs,
    "Tumour_Type": Tumour_Types,
    "Image_ID": Image_IDs,
    "Absolute_Path": Absolute_Paths,
}
df_ext = pd.DataFrame(ext_columns)
df_ext.head()

In [None]:
# Build the path from DATA_FOLDER rather than repeating the dataset location a third time.
train_pth = os.path.join(DATA_FOLDER, "train_labels.csv")
df_train = pd.read_csv(train_pth)
df_train.head()

In [None]:
df_train.shape

In [None]:
# Keep only the files whose split folder is "train".
train_df = df_ext[df_ext["Split_Type"]=="train"]
train_df.shape

In [None]:
# Sanity check: the study IDs in the label file and the study folders found
# under train/ (compared as integers) should be exactly the same set.
set_1 = set(df_train["BraTS21ID"])
set_2 = {int(study) for study in train_df["Study_ID"]}

set_1 == set_2

### Insert Ground Truth Values into the DataFrame

In [None]:
# Start every row at -1; rows whose study has a label in df_train are overwritten in the next cell.
df_ext["Ground_Truth_Class"] = -1

In [None]:
# Look up every file's label by study ID in one vectorized pass, instead of
# filtering df_train once per row of df_ext.
# If a study ID were listed more than once, the first listed label is used.
label_by_study = (
    df_train.drop_duplicates(subset="BraTS21ID", keep="first")
            .set_index("BraTS21ID")["MGMT_value"]
)
# Study_ID holds the folder name as a string; BraTS21ID is compared as an integer.
matched_labels = df_ext["Study_ID"].astype(int).map(label_by_study)
# Studies that are absent from df_train keep the value the column already holds.
df_ext["Ground_Truth_Class"] = (
    matched_labels.fillna(df_ext["Ground_Truth_Class"]).astype(int)
)

In [None]:
df_train["MGMT_value"].value_counts()

In [None]:
df_ext["Ground_Truth_Class"].value_counts()

### `len(Images)` per Study

In [None]:
# Number of rows of df_ext (i.e. image files) for each Study_ID.
images_per_study = df_ext.groupby(['Study_ID']).size()
images_per_study

# Data Insights & Visualizations

In [None]:
df_ext.shape

In [None]:
df_ext.describe()

In [None]:
df_ext.info()

### More About Columns

In [None]:
df_ext.head()

In [None]:
# Re-select the train rows: the first train_df was taken before the
# Ground_Truth_Class column was added to df_ext, so it lacks that column.
train_df = df_ext[df_ext["Split_Type"]=="train"]
train_df.shape

In [None]:
import seaborn as sns

In [None]:
def get_count_viz(data, column,title = "Distribution Count",figure_size= (20,4)):
    """Print the value counts of a column and draw them as a horizontal count plot.

    Args:
        data: DataFrame holding the column.
        column: name of the column to summarise.
        title: title placed above the plot.
        figure_size: (width, height) of the figure.

    Prints the absolute counts followed by the normalized counts (fractions
    of the total), then shows the plot. Returns nothing.
    """
    absolute = data[column].value_counts()
    relative = data[column].value_counts(normalize=True)
    print("Absolute Value Counts :")
    print(absolute)
    print("Normalized (Percentage) Value Counts :")
    print(relative)
    # Create the figure and its axes explicitly and hand the axes to seaborn.
    fig, ax = plt.subplots(figsize=figure_size)
    sns.countplot(data=data, y=column, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
get_count_viz(data = train_df, column = "Ground_Truth_Class",title = "Label Distribution Count")

In [None]:
get_count_viz(data = df_ext, column = "Split_Type",title = "Train/Test Distribution Count")

### Visualize How Many Images Per Study

In [None]:
per_study = dict(images_per_study)

In [None]:
# Line plot of the image count of every study; title and axis labels make the
# figure readable without the surrounding cells.
fig, ax = plt.subplots(figsize=(20,12))
ax.plot(list(per_study.keys()), list(per_study.values()))
ax.set_title("Images per Study")
ax.set_xlabel("Study_ID")
ax.set_ylabel("Number of images")
plt.show()

### Distribution Plot
Well, that's a bit too ragged. Let's try breaking it down into a frequency-range plot (a histogram of images per study).

In [None]:
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Histogram with a KDE overlay of the images-per-study counts; bins="doane" selects the binning rule by name.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — consider sns.histplot
# and confirm against the installed version.
data = list(per_study.values())
plt.figure(figsize=(20,12))

sns.distplot(data,bins="doane",kde=True,hist_kws={"align" : "left"})
plt.show()

### Helper Functions

These helper functions convert the `.dcm` images into NumPy arrays, for which we use the `pydicom` API. Pydicom relies on the `gdcm` library, which has to be installed beforehand.  
However, using this tool isn't straightforward because
1. Pydicom isn't pre-installed on Kaggle Notebooks.
2. For Submissions, we need notebooks that run offline, so we can't use the internet to perform `pip install`.

**The Solution** - Install the packages offline from a Dataset produced by a different Notebook. This approach is being used here.

In [None]:
from tqdm import tqdm

# Pydicom related imports
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

# Reference: https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px
# and https://www.kaggle.com/ayuraj/brain-tumor-eda-and-interactive-viz-with-w-b
def ReadMRI(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixel data as an 8-bit image array.

    Args:
        path: location of the .dcm file.
        voi_lut: if True, pass the pixel data through ``apply_voi_lut``;
            otherwise use the raw ``pixel_array``.
        fix_monochrome: if True, invert the image when the file's
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        np.ndarray of dtype uint8, rescaled so the smallest pixel value maps
        to 0 and the largest to 255. An image whose pixels are all equal
        (e.g. an empty slice) is returned as all zeros.
    """
    # Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
    # dcmread is the current reader name; read_file is a deprecated alias of it.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to
    # "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, the image may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    else:
        # Constant image: dividing would be 0/0, i.e. NaN plus a RuntimeWarning
        # and an undefined cast to uint8. Return a clean all-zero image instead.
        data = np.zeros_like(data, dtype=np.float64)
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
np.random.seed(42)

In [None]:
def props(img):
    """Print the shape, maximum and minimum of an image array on one line."""
    shape, highest, lowest = img.shape, img.max(), img.min()
    print("Shape :", shape, "Maximum :", highest, "Minimum :", lowest)

def view_data(sample_set,label_col="Study_ID",path_col = "Absolute_Path",figure_size = (20,20),ht = 5,wd = 4):
    """Show a grid of randomly chosen, non-blank images from ``sample_set``.

    Args:
        sample_set: DataFrame with one row per image file.
        label_col: column whose value is used as the title of each image.
        path_col: column holding the path passed to ``ReadMRI``.
        figure_size: (width, height) of the whole figure.
        ht: number of grid columns (second argument to ``plt.subplots``).
        wd: number of grid rows (first argument to ``plt.subplots``).

    Rows are shuffled with ``np.random.permutation``, so seed NumPy's global
    generator for a reproducible selection. Images that decode to all zeros
    are skipped. If fewer than ``ht * wd`` usable images exist, the remaining
    grid cells are left empty and hidden.
    """
    n = ht * wd
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axs = plt.subplots(wd, ht, figsize=figure_size, squeeze=False)
    fig.subplots_adjust(hspace=.2, wspace=.2)
    axs = axs.ravel()
    shuffled = sample_set.reindex(np.random.permutation(sample_set.index))
    shuffled = shuffled.reset_index(drop=True)
    plots_done = 0
    # Bounded by the number of rows, so running out of images ends the loop
    # instead of raising a KeyError.
    for i in range(len(shuffled)):
        if plots_done >= n:
            break
        img = ReadMRI(shuffled.loc[i, path_col])
        if img.max() == 0:
            # Completely black slice: nothing to look at, try the next row.
            continue
        axs[plots_done].imshow(img, cmap=plt.cm.gist_ncar)
        axs[plots_done].set_title(shuffled.loc[i, label_col])
        plots_done += 1
    # Hide grid cells that received no image.
    for unused_ax in axs[plots_done:]:
        unused_ax.axis("off")

In [None]:
# NOTE(review): same selection as the previous train_df cell; df_ext is not modified
# in between, so this cell looks redundant.
train_df = df_ext[df_ext["Split_Type"]=="train"]
train_df.shape

# View MRI Scans

### Train Set - Positive Classes

In [None]:
view_data(train_df[train_df["Ground_Truth_Class"]==1])

### Train Set - Negative Classes

In [None]:
view_data(train_df[train_df["Ground_Truth_Class"]==0])

### Test Set

In [None]:
view_data(df_ext[df_ext["Ground_Truth_Class"]==-1])

In [None]:
df_ext.columns

In [None]:
data_path = df_ext.loc[0,'Absolute_Path']
data_path

In [None]:
# Decode the file at data_path with ReadMRI, report the array shape and show the image on its own.
data = ReadMRI(data_path)
print('Shape of data: ', data.shape)

plt.figure(figsize=(5, 5))
plt.imshow(data, cmap=plt.cm.gist_ncar);

In [None]:
# Write the combined DataFrame to a CSV file in the working directory, without the index column.
df_ext.to_csv('Extracted_Study_Series_Img.csv',index=False)