<h1>RSNA-MICCAI Brain Tumor Radiogenomic Classification</h1>


<h3>Predict the status of a genetic biomarker important for brain cancer treatment</h3>

In [None]:
# import library
import os
import tqdm
import numpy as np
import pandas as pd
import pydicom # for DICOM images
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Training images live under IMAGE_PATH; the labels are read from train_labels.csv.
IMAGE_PATH = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'
LABELS_CSV = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv"
train_df = pd.read_csv(LABELS_CSV)

# Show the first five label rows as the cell output.
train_df.head(n=5)

In [None]:
# Report the dimensions of the label table.
n_rows, n_cols = train_df.shape
print("Train has {} rows and {} columns".format(n_rows, n_cols))

# os.listdir already returns a list of the direct children of IMAGE_PATH.
image_files = os.listdir(IMAGE_PATH)
print(f"Number of image files: {len(image_files)}")

In [None]:
def missing_ratio(data_df):
    """Summarise the share of missing values in each column of a DataFrame.

    Prints a two-line summary (total number of columns and how many of them
    contain missing values) and returns the details.

    Args:
        data_df: DataFrame to inspect.

    Returns:
        DataFrame with one row per column that has at least one missing
        value, sorted by descending share of missing rows. ``Percentage``
        holds that share (0-100, rounded to one decimal) and ``Columns`` the
        column label. The index is a fresh 0..n-1 range. A frame without
        rows yields an empty result.
    """
    n_rows = len(data_df)
    null_counts = data_df.isnull().sum()
    if n_rows:
        data_mis = 100 * null_counts / n_rows
    else:
        # With zero rows the division would be 0 / 0 = NaN; NaN is not equal
        # to 0, so every column would wrongly be listed as having missing data.
        data_mis = null_counts.astype(float)

    # Keep only the columns that actually have missing values.
    data_mis = data_mis[data_mis != 0].sort_values(ascending=False).round(1)
    data_mis = pd.DataFrame({'Percentage': data_mis})
    data_mis['Columns'] = data_mis.index
    data_mis = data_mis.reset_index(drop=True)

    # Print some summary information
    print(f"Your selected dataframe has {data_df.shape[1]} columns.\n"
          f"There are {data_mis.shape[0]} columns that have missing values.")

    return data_mis

In [None]:
# Compute the percentage of missing values for each column of the training
# labels; missing_ratio prints a short summary and returns the per-column table.
train_mis = missing_ratio(train_df)

# DICOM Data
### Now let's explore the provided .dcm files and extract insights from them.

In [None]:
# Count the .dcm files stored anywhere below the train and test folders

# Images Path
train_path = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/"
test_path = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/test/"


def count_dcm_files(root):
    """Return the number of ``.dcm`` files found anywhere below ``root``.

    ``os.walk`` yields one ``(dirpath, dirnames, filenames)`` tuple per
    directory it visits:
      dirpath   - path of the directory currently being visited
      dirnames  - names of the sub-directories inside it
      filenames - names of the non-directory entries inside it

    Only names ending in ``.dcm`` (case-insensitive) are counted, so any
    stray non-DICOM file does not inflate the total. The walk is wrapped in
    a tqdm progress bar that advances once per visited directory.
    """
    total = 0
    for _dirpath, _dirnames, filenames in tqdm.tqdm(os.walk(root)):
        total += sum(name.lower().endswith(".dcm") for name in filenames)
    return total


# --- TRAIN
train_dcm = count_dcm_files(train_path)

# --- TEST
test_dcm = count_dcm_files(test_path)

In [None]:
# Format both totals with thousands separators.
train_line = f"Train: total .dcm files - {train_dcm:,}"
test_line = f"Test: total .dcm files - {test_dcm:,}"

# sep places a space followed by a line break between the two messages.
print(train_line, test_line, sep=" \n")

# Visualize a DICOM image

In [None]:
def load_dicom(path):
    """Read a DICOM file and return its pixels rescaled to 8-bit grayscale.

    Args:
        path: Path of the ``.dcm`` file to read.

    Returns:
        ``np.uint8`` array with the same shape as the stored pixel array.
        Values are min-max scaled so the darkest pixel maps to 0 and the
        brightest to 255; an image whose pixels are all equal comes back as
        all zeros.
    """
    # dcmread is the supported reader (read_file is a deprecated alias that
    # was dropped in pydicom 3.0) and matches the call used elsewhere in
    # this notebook.
    dicom = pydicom.dcmread(path)

    # Work in float so that subtracting the minimum cannot wrap around in the
    # stored integer dtype (e.g. int16 data spanning the full signed range).
    data = dicom.pixel_array.astype(np.float64)
    data = data - np.min(data)

    peak = np.max(data)
    if peak != 0:
        data = data / peak

    return (data * 255).astype(np.uint8)

# Display slice Image-113 from the 00000/FLAIR folder as an 8-bit grayscale image.
path = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-113.dcm"
data = load_dicom(path)

fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(data, cmap="gray")
ax.axis('off');


# Visualize a set of images for a study

In [None]:
# Study "T1wCE"
study_dir = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00003/T1wCE/"


def _slice_order(filename):
    """Sort key that puts 'Image-<n>.dcm' style names in numeric order of <n>.

    Names without a numeric suffix sort after the numbered ones, alphabetically.
    """
    suffix = os.path.splitext(filename)[0].rsplit("-", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), filename)
    return (1, 0, filename)


# Read in the Dataset.
# os.listdir returns names in arbitrary order, so sort them numerically to get
# a reproducible slice sequence (a plain string sort would put Image-10 before
# Image-2).
datasets = []
for dcm in sorted(os.listdir(study_dir), key=_slice_order):
    # os.path.join avoids the doubled "/" that manual concatenation produced.
    path = os.path.join(study_dir, dcm)
    datasets.append(pydicom.dcmread(path))

In [None]:
# Plot the images on a rows x columns grid, one slice per panel
fig = plt.figure(figsize=(16, 6))
columns = 10
rows = 3

# Show at most rows * columns slices. A series with fewer slices fills only
# part of the grid instead of raising IndexError.
n_shown = min(len(datasets), columns * rows)

for i in range(1, n_shown + 1):
    img = datasets[i - 1].pixel_array
    ax = fig.add_subplot(rows, columns, i)  # subplot positions are 1-based
    ax.imshow(img, cmap="gray")
    ax.set_title(i, fontsize=9)
    ax.axis('off')