# TODO: explain the use of the different windows in X-ray images

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import os
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [None]:
sys.path.insert(0, '../input/kaggledicom')

In [None]:
from collections import namedtuple  
from src.utils import misc
from src.preprocess.dicom_to_dataframe import create_record

In [None]:
import pydicom
import seaborn as sns

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

In [None]:
# Competition data root and the per-split directories derived from it.
DATA_ROOT = "../input/vinbigdata-chest-xray-abnormalities-detection/"
TRAIN_DIR, TEST_DIR = (os.path.join(DATA_ROOT, split) for split in ("train", "test"))

In [None]:
colorpal = ['red', 'green', 'blue']

# Utility functions

In [None]:
def plot_groupby_info(df):
    """Print group counts for ``df`` and return the average classes per image.

    Despite its name, this function does not plot anything. It counts the
    groups produced by ``df.groupby('image_id')`` and by
    ``df.groupby(['image_id', 'class_name'])``, prints both counts, and
    returns their ratio.

    Note that the first printed line is emitted unconditionally; the function
    does not check whether ``df`` actually contains "No finding" rows.

    Args:
        df: DataFrame with ``image_id`` and ``class_name`` columns.

    Returns:
        Number of ``(image_id, class_name)`` groups divided by the number of
        ``image_id`` groups, or ``nan`` when ``df`` yields no groups at all.
    """
    # Only the group counts are needed, so count while iterating instead of
    # collecting every key into a list.
    n_images = sum(1 for _ in df.groupby('image_id'))
    n_image_class_pairs = sum(1 for _ in df.groupby(['image_id', 'class_name']))

    print("Considering the data with no finding")
    print(f"length of number of image_id vs length of image_id and class_name = {n_images, n_image_class_pairs}")

    # An empty frame has no images; report "undefined" rather than dividing by zero.
    if n_images == 0:
        return float('nan')
    return n_image_class_pairs / n_images

# Train dataset
## Ratio of different class
## Total number of images in the train dataset = 15000
## Total number of images in the public test dataset = 3000

## Number of class = 15

In [None]:
# Count the entries in the train and test directories.
train_images = os.listdir(TRAIN_DIR)
print(f"number of image in train = {len(train_images)}")

test_images = os.listdir(TEST_DIR)
print(f"number of image in public test = {len(test_images)}")

In [None]:
train_df = pd.read_csv(os.path.join(DATA_ROOT,'train.csv')).fillna(-1)

train_df.head()

In [None]:
# plot_groupby_info returns (image_id, class_name) groups per image_id group,
# i.e. distinct classes per image, not bounding boxes per image.
print(f"number of distinct class_name per image including image with no finding = {plot_groupby_info(train_df)}")

In [None]:
print(f"total number of row in train dataset {len(train_df)}")

In [None]:
print(f"number of class = {len(train_df['class_name'].unique())}")

In [None]:
train_df.class_name.value_counts()/len(train_df)

In [None]:
train_df.class_name.value_counts()\
    .plot(kind='bar',
          title='class_name',
          figsize=(12, 4),
          color=colorpal[0])

# Check size of bounding box

In [None]:
# Take an explicit copy: columns are added to abnormal_df in later cells, and
# assigning into a filtered slice of train_df triggers pandas'
# SettingWithCopyWarning.
abnormal_df = train_df[train_df.class_name != 'No finding'].copy()

In [None]:
abnormal_df.head()

In [None]:
# Box width and height from the corner coordinates.
box_width = abnormal_df['x_max'] - abnormal_df['x_min']
box_height = abnormal_df['y_max'] - abnormal_df['y_min']
abnormal_df['w'] = box_width
abnormal_df['h'] = box_height

In [None]:
abnormal_df['area'] = abnormal_df['w']*abnormal_df['h']

In [None]:
abnormal_df.head()

# Average number of class_name and bboxes in an abnormal image
## From the cells below there are approximately 3.5 distinct class_name values per abnormal image
## and 8.2 abnormal bounding boxes per abnormal image
### Hence one image can contain multiple boxes of different types

In [None]:
print(f"number of box in train dataset/ abnormal df = {len(abnormal_df)} boxes")

In [None]:
# plot_groupby_info returns (image_id, class_name) groups per image_id group,
# i.e. distinct classes per image, not bounding boxes per image.
print(f"number of distinct class_name per image without image with no finding = {plot_groupby_info(abnormal_df)}")

In [None]:
# Derive the average from the data instead of hard-coding the row and image
# counts, so the figure stays correct if abnormal_df changes.
n_abnormal_images = len(abnormal_df['image_id'].unique())
print(f"number of average box per abnormal image = {len(abnormal_df) / n_abnormal_images}")

In [None]:
# Distribution of the bounding-box areas themselves. Plotting
# abnormal_df['area'].value_counts() would instead show how often each exact
# area value repeats, which is not what the title describes.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot; it is kept here to match the seaborn API already in use.
fig, ax = plt.subplots(figsize=(12, 5))
sns.distplot(abnormal_df['area'],
             bins=15,
             color=colorpal[1],
             ax=ax)  # draw on the axes created above so the title applies to it
ax.set_title('Distribution bounding box sizes')
plt.show()

## helper function to organise dicom data and read image from path

In [None]:
def window_imgs_from_dicom(id, dirname):
    """Read one DICOM file and return its pixel data under several windows.

    The file ``<dirname>/<id>.dicom`` is read with ``pydicom.dcmread``. The
    metadata returned by ``misc.get_dicom_raw`` is merged into a record dict
    that also carries the image ``ID``.

    Args:
        id: Image identifier (file name without the ``.dicom`` extension).
        dirname: Directory that contains the DICOM file.

    Returns:
        A tuple ``(raw, image, doctor, brain, record)`` where
        ``raw`` is ``dicom.pixel_array``,
        ``image`` is the result of ``misc.rescale_image`` on ``raw``,
        ``doctor`` is ``image`` windowed with the file's own
        ``WindowCenter``/``WindowWidth``,
        ``brain`` is ``image`` windowed with center 40 and width 80, and
        ``record`` is the metadata dict.
        If any step of the rescaling/windowing fails, ``raw`` is returned in
        place of ``image``, ``doctor`` and ``brain``.
    """
    record = {'ID': id}

    path = f'{dirname}/{id}.dicom'
    dicom = pydicom.dcmread(path)
    record.update(misc.get_dicom_raw(dicom))

    raw = dicom.pixel_array
    try:
        slope = float(record['RescaleSlope'])
        intercept = float(record['RescaleIntercept'])

        center = misc.get_dicom_value(record['WindowCenter'])
        width = misc.get_dicom_value(record['WindowWidth'])

        bits = record['BitsStored']
        pixel = record['PixelRepresentation']

        image = misc.rescale_image(raw, slope, intercept, bits, pixel)

        doctor = misc.apply_window(image, center, width)
        # NOTE(review): 40/80 is a fixed window named "brain" here; confirm it
        # is meaningful for chest X-rays.
        brain = misc.apply_window(image, 40, 80)
        return raw, image, doctor, brain, record
    except Exception:
        # Best-effort fallback: presumably for files lacking rescale/window
        # metadata. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return raw, raw, raw, raw, record
    
    
def create_record(id, dirname):
    """Build a metadata record for one DICOM image, including pixel statistics.

    Calls ``window_imgs_from_dicom`` and extends the record it returns with
    the max, min, mean and max-minus-min of the raw, "doctor" and "brain"
    arrays, plus ``misc.get_windowed_ratio(image, 40, 80)``.

    NOTE: this definition shadows the ``create_record`` imported from
    ``src.preprocess.dicom_to_dataframe`` earlier in the notebook.

    Args:
        id: Image identifier passed through to ``window_imgs_from_dicom``.
        dirname: Directory that contains the DICOM file.

    Returns:
        The record dict with the statistics added.
    """
    raw, image, doctor, brain, record = window_imgs_from_dicom(id, dirname)

    # Compute each extreme once and reuse it for the corresponding *_diff entry.
    raw_hi, raw_lo = raw.max(), raw.min()
    doctor_hi, doctor_lo = doctor.max(), doctor.min()
    brain_hi, brain_lo = brain.max(), brain.min()

    stats = {
        'raw_max': raw_hi,
        'raw_min': raw_lo,
        'raw_mean': raw.mean(),
        'raw_diff': raw_hi - raw_lo,
        'doctor_max': doctor_hi,
        'doctor_min': doctor_lo,
        'doctor_mean': doctor.mean(),
        'doctor_diff': doctor_hi - doctor_lo,
        'brain_max': brain_hi,
        'brain_min': brain_lo,
        'brain_mean': brain.mean(),
        'brain_diff': brain_hi - brain_lo,
        'brain_ratio': misc.get_windowed_ratio(image, 40, 80),
    }
    record.update(stats)
    return record


In [None]:
# Scan has different size

In [None]:
# First of two example scans; print the shape of its raw pixel array.
# ../input/vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom
# ../input/vinbigdata-chest-xray-abnormalities-detection/train/0059d21bef1793fa9522e4ec8cae1a1a.dicom
raw, image, doctor, brain, record = window_imgs_from_dicom('000434271f63a053c4128a0ba6352c7f',TRAIN_DIR)
print(f"shape of individual scan = {raw.shape}")

In [None]:
# Second of two example scans; print the shape of its raw pixel array.
# ../input/vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom
# ../input/vinbigdata-chest-xray-abnormalities-detection/train/0059d21bef1793fa9522e4ec8cae1a1a.dicom
raw, image, doctor, brain, record = window_imgs_from_dicom('0059d21bef1793fa9522e4ec8cae1a1a',TRAIN_DIR)
print(f"shape of individual scan = {raw.shape}")

## more detail about abnormal

In [None]:
abnormal_df.describe()

# Plot sample image of different class

In [None]:
def imshow(img):
    """Un-normalize ``img`` and display it with matplotlib.

    ``img`` must support arithmetic and expose ``.numpy()``. Its axes are
    reordered from (0, 1, 2) to (1, 2, 0) before plotting.
    """
    unnormalized = img / 2 + 0.5  # unnormalize
    # Move axis 0 to the end (presumably the channel axis - confirm).
    reordered = unnormalized.numpy().transpose(1, 2, 0)
    plt.imshow(reordered)
    plt.show()


# get some random training images
# dataiter = iter(trainloader)
# images, labels = dataiter.next()

# show images
# imshow(torchvision.utils.make_grid(images))
# print labels
# print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

In [None]:
record = create_record('000434271f63a053c4128a0ba6352c7f', TRAIN_DIR)

In [None]:
windows_img = window_imgs_from_dicom('50a418190bc3fb1ef1633bf9678929b3', TRAIN_DIR)

In [None]:
def get_sample_images(df):
    """Load one example image, in all window variants, for every class in ``df``.

    For each distinct ``class_name`` the first row with that class is taken
    and its ``image_id`` is loaded from ``TRAIN_DIR`` with
    ``window_imgs_from_dicom``.

    Args:
        df: DataFrame with ``class_name`` and ``image_id`` columns.

    Returns:
        A list of ``id_img_pair`` namedtuples, one per class in order of first
        appearance, with fields ``id`` (the class name), ``image_id`` and
        ``window_images`` (the list ``[raw, image, doctor, brain]``).
    """
    # Define the record type once rather than re-creating it on every iteration.
    id_img_pair = namedtuple('id_img_pair', ['id', 'image_id', 'window_images'])

    # drop_duplicates keeps the first row of each class, in order of first
    # appearance, so the frame is scanned once instead of once per class.
    first_rows = df.drop_duplicates(subset='class_name')

    sample = []
    for class_name, image_id in zip(first_rows['class_name'], first_rows['image_id']):
        raw, image, doctor, brain, _ = window_imgs_from_dicom(image_id, TRAIN_DIR)
        sample.append(id_img_pair(class_name, image_id, [raw, image, doctor, brain]))
    return sample

In [None]:
samples = get_sample_images(train_df)

# Plot a sample image of every class
## (15 classes as 15 rows)
## (4 columns for the 4 window types)

In [None]:
import operator
from functools import reduce #python 3


def imshow(img):
    """Display ``img`` with matplotlib, without un-normalizing it.

    ``img`` must expose ``.numpy()``. Its axes are reordered from (0, 1, 2)
    to (1, 2, 0) before plotting.
    """
    # Move axis 0 to the end (presumably the channel axis - confirm).
    reordered = img.numpy().transpose(1, 2, 0)
    plt.imshow(reordered)
    plt.show()

# imshow(torchvision.utils.make_grid(torch.tensor(images_)))

def plot_img_list(img_list):
    """Show up to four images side by side in a single row.

    Args:
        img_list: Sequence of images accepted by ``Axes.imshow``. Only the
            first four are drawn; if fewer are given, the remaining axes are
            left empty.
    """
    nrows, ncols = 1, 4  # array of sub-plots
    figsize = [6, 8]     # figure size, inches

    fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)

    # zip stops at the shorter of the two, so a short img_list no longer
    # raises IndexError.
    for i, (axi, img) in enumerate(zip(ax.flat, img_list)):
        axi.imshow(img, cmap=plt.cm.bone)
        # Label each sub-plot with its grid position for identification.
        rowid, colid = divmod(i, ncols)
        axi.set_title("Row:"+str(rowid)+", Col:"+str(colid))

    # tight_layout takes keyword-only arguments in current Matplotlib; the
    # former positional `True` was interpreted as pad=1.
    plt.tight_layout(pad=1.0)
    plt.show()

def plot_image_windows(df):
    """Plot one sample image per class, one row per class and one column per window.

    Samples are obtained from ``get_sample_images(df)``. The grid has as many
    rows as there are samples and as many columns as there are window images
    per sample, so the function works for any number of classes (for 15
    classes the figure is 30x30 inches).

    Args:
        df: DataFrame passed through to ``get_sample_images``.

    Returns:
        Flat list of the plotted images in row-major order (all window images
        of the first sample, then the second, ...). Empty if there are no
        samples.

    Raises:
        ValueError: If the samples do not all have the same number of window
            images.
    """
    samples = get_sample_images(df)
    if not samples:
        # Nothing to plot.
        return []

    nrows = len(samples)
    ncols = len(samples[0].window_images)
    if any(len(sample.window_images) != ncols for sample in samples):
        raise ValueError('incorrect number of window images')

    # Flatten the per-sample lists of images into one row-major list.
    img_list = [img for sample in samples for img in sample.window_images]

    # Two inches of height per row.
    figsize = (30, 2 * nrows)

    # squeeze=False always yields a 2-D axes array, so `.flat` is valid for
    # any grid size.
    fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    for axi, img in zip(ax.flat, img_list):
        axi.imshow(img, cmap='gray')

    # tight_layout takes keyword-only arguments in current Matplotlib; the
    # former positional `True` was interpreted as pad=1.
    plt.tight_layout(pad=1.0)
    plt.show()
    return img_list
sample_img_list = plot_image_windows(train_df)

# Upvote if you find this useful XD