In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
from glob import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
import matplotlib
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folder of the competition dataset; every later path (train/*.dicom, train.csv) is built from it.
data_dir = '../input/vinbigdata-chest-xray-abnormalities-detection'

In [None]:
# Count the training DICOM files (glob() already hands back a list).
image_count = len(glob(f'{data_dir}/train/*.dicom'))
print('Image count is : ', image_count, sep='')

In [None]:
#Time to read the csv file
%time
train = pd.read_csv(data_dir+'/train.csv')

In [None]:
# Column dtypes, non-null counts and memory footprint of the annotation table;
# memory_usage="deep" asks pandas to measure object columns instead of estimating them.
train.info(memory_usage="deep")

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as an 8-bit grayscale array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: If True, pass the pixel data through ``apply_voi_lut`` so the
            VOI LUT (when the device stored one) maps the raw values to a
            "human-friendly" view. If False, use the raw pixel array.
        fix_monochrome: If True, invert images whose PhotometricInterpretation
            is "MONOCHROME1", which would otherwise look inverted.

    Returns:
        numpy.ndarray of dtype uint8, min-max scaled to the 0..255 range.
        An image whose pixels are all equal is returned as all zeros.
    """
    # dcmread is the current pydicom reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)

    pixels = dicom.pixel_array
    data = apply_voi_lut(pixels, dicom) if voi_lut else pixels

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to [0, 1]; skip the division for a constant image so we
    # never divide by zero (which would fill the result with NaN).
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Display one image in a new figure.

    Args:
        img: Image data accepted by matplotlib's ``imshow``.
        size: Figure size passed to matplotlib as ``figsize``.
        is_rgb: Accepted for call compatibility; the body does not use it.
        title: Text shown as the figure's suptitle.
        cmap: Colormap handed to ``imshow``.
    """
    figure = plt.figure(figsize=size)
    axes = figure.gca()
    axes.imshow(img, cmap=cmap)
    figure.suptitle(title)
    plt.show()

In [None]:
def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Display a list of images on a grid with ``cols`` columns.

    Args:
        imgs: Sequence of images accepted by ``cv2.resize`` / ``imshow``.
        cols: Number of grid columns.
        size: Edge length given to each grid cell when sizing the figure.
        is_rgb: Accepted for call compatibility; the body does not use it.
        title: Text shown as the figure's suptitle.
        cmap: Colormap handed to ``imshow``.
        img_size: Size passed to ``cv2.resize`` for every image before it is
            shown, or None to show the images unresized.
    """
    # Ceiling division: `len(imgs)//cols + 1` allocated an extra, empty row
    # whenever the image count was an exact multiple of `cols` (e.g. 12 / 4).
    # Keep at least one row so an empty list still produces a valid figure.
    rows = max(1, (len(imgs) + cols - 1) // cols)
    fig = plt.figure(figsize=(cols*size, rows*size))
    for position, img in enumerate(imgs, start=1):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        fig.add_subplot(rows, cols, position)
        plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()

In [None]:
%%time
dicom_paths = glob(f'{data_dir}/train/*.dicom')
imgs = [dicom2array(path) for path in dicom_paths[:12]]

#Without Plot
#12.7 seconds for 12 images

#Time doesn't seem to be increasing exponentially

In [None]:
%%time
dicom_paths = glob(f'{data_dir}/train/*.dicom')
imgs = [dicom2array(path) for path in dicom_paths[:12]]
plot_imgs(imgs)

#With Plot
#5 seconds for 2 images
#11 seconds for 5 images
#18 seconds for 12 images

#Time doesn't seem to be increasing exponentially

###### Performing Histogram Equalization on these images

In [None]:
# Histogram-equalise every preview image, then show the grid again.
# NOTE(review): this rebinds `imgs`, so re-running the cell equalises images
# that were already equalised.
imgs = list(map(exposure.equalize_hist, imgs))
plot_imgs(imgs)

In [None]:
# Annotation table used by the rest of the notebook (the same train.csv that
# was read into `train` earlier).
train_data = pd.read_csv(f'{data_dir}/train.csv')

In [None]:
# Bar chart of how many rows of train_data carry each class_name.
label_counts = train_data['class_name'].value_counts()  # computed once, reused for both axes
x = label_counts.keys()
y = label_counts.values

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.bar(x, y)
# Rotate the existing tick labels instead of calling set_xticklabels(), which
# is only meant to be used after the tick positions have been fixed.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Distribution of the labels')
ax.grid()
plt.show()

In [None]:
import pydicom as dicom

def plot_example(idx_list):
    """Show the images for the given train_data rows side by side.

    One subplot is drawn per index. Each image is titled with the row's
    class_name, and unless that class is 'No finding' the row's bounding box
    (x_min, y_min, x_max, y_max) is outlined in red.

    Args:
        idx_list: Index labels of rows in ``train_data``. Any length is
            accepted; an empty sequence draws nothing.
    """
    n_examples = len(idx_list)
    if n_examples == 0:
        # plt.subplots cannot build a grid with zero columns.
        return

    # squeeze=False always returns a 2-D array of axes, so ravel() works even
    # when there is a single example.
    fig, axs = plt.subplots(1, n_examples, figsize=(15, 10), squeeze=False)
    fig.subplots_adjust(hspace = .1, wspace=.1)

    for ax, idx in zip(axs.ravel(), idx_list):
        row = train_data.loc[idx]  # one lookup per example instead of one per field

        # Use the notebook's dicom2array helper so these previews get the same
        # VOI LUT / MONOCHROME1 handling as the other image plots. It does not
        # resize the image, so the bounding-box coordinates still line up.
        img = dicom2array(data_dir+'/train/'+row['image_id']+'.dicom')
        ax.imshow(img, cmap='gray')
        ax.set_title(row['class_name'])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

        if row['class_name'] != 'No finding':
            x_min, y_min = row['x_min'], row['y_min']
            x_max, y_max = row['x_max'], row['y_max']
            box = matplotlib.patches.Rectangle((x_min, y_min),
                                               x_max - x_min,
                                               y_max - y_min,
                                               ec='r', fc='none', lw=2.)
            ax.add_patch(box)
            
# For each class_id 0..14, plot the first three matching rows of train_data.
for num in range(15):
    idx_list = train_data.loc[train_data['class_id'] == num].head(3).index.values
    plot_example(idx_list)

In [None]:
# Let the notebook render the frame (truncated, rich table) instead of
# printing its plain-text repr.
train_data

In [None]:
import torch
from torch import nn
# NOTE(review): this rebinds the name `glob` from the function imported in the
# first cell (`from glob import glob`) to the module. Earlier cells that call
# glob(...) will raise TypeError if they are re-run after this cell.
import glob
from PIL import Image
# Rebinds `tqdm`, which the first cell imported from tqdm.notebook.
from tqdm.auto import tqdm
import torchvision.transforms as transforms
# Already imported in the first cell; repeated here.
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Report how many rows train_data has, then preview the first five.
print("Train Data Size : {}".format(len(train_data)))
train_data.head(5)

In [None]:
# Keep only columns 0 and 2 of train_data, selected by position. The next cell
# de-duplicates on "image_id", so that column is one of the two.
# NOTE(review): the other is presumably class_id - confirm against the column
# order of train.csv.
class_labels = train_data.iloc[:,[0,2]]
class_labels.head()

In [None]:
# Reduce class_labels to one row per image_id and show the resulting shape.
# drop_duplicates keeps the first row for each image_id (pandas default
# keep='first'), so any further rows for the same image are discarded.
# The former bare `class_labels.iloc[:,0:2]` and mid-cell `class_labels.head()`
# expressions had no effect (only a cell's last expression is displayed) and
# were removed.
class_labels = class_labels.drop_duplicates(subset=["image_id"])
class_labels.shape

In [None]:
# True when PyTorch reports a usable CUDA device, False otherwise.
train_on_gpu = torch.cuda.is_available()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        # convolutional layer (sees 512x512x3 image tensor)
        self.conv1 = nn.Conv2d(1, 4, 3, padding=1)

        # convolutional layer (sees 256x256x4 tensor)
        self.conv2 = nn.Conv2d(4, 8, 3, padding=1)
        
        # max pooling layer
        self.pool = nn.MaxPool2d(2, 2)