## Let's have a quick look...

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data and file processing
import sklearn as sk  # machine learning
import seaborn as sns  # data visualization
import matplotlib.pyplot as plt  # data visualization
import matplotlib
%matplotlib inline

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.densenet import DenseNet121
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras import backend as K
from keras.models import load_model

In [None]:
import os, ast, cv2, random
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from skimage import exposure

In [None]:
# Root folder of the competition data, and the sub-folder with the training DICOMs.
home = "../input/siim-covid19-detection/"
# `home` already ends with '/', so the join yields exactly home + 'train/'.
img_dir = os.path.join(home, 'train/')
# Show what the dataset root contains.
os.listdir(home)

In [None]:
## -- Load the study-level training CSV
df_train_study = pd.read_csv(os.path.join(home, "train_study_level.csv"))

# Report the frame's dimensions, then preview its first rows
print(f'There are {df_train_study.shape[0]} rows and {df_train_study.shape[1]} columns in this data frame')
df_train_study.head()

In [None]:
# Summarize the study-level frame (columns, non-null counts, dtypes).
df_train_study.info()

In [None]:
## -- Load the image-level training CSV
df_train_image = pd.read_csv(os.path.join(home, "train_image_level.csv"))

# Report the frame's dimensions, then preview its first rows
print(f'There are {df_train_image.shape[0]} rows and {df_train_image.shape[1]} columns in this data frame')
df_train_image.head()

In [None]:
# Summarize the image-level frame (columns, non-null counts, dtypes).
df_train_image.info()

In [None]:
## -- Helper function borrowed from @raddar  
## https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixels as an 8-bit grayscale array.

    Parameters
    ----------
    path
        Location of the DICOM file; forwarded to ``pydicom.dcmread``.
    voi_lut : bool
        When true, the raw pixels are passed through ``apply_voi_lut``;
        otherwise ``pixel_array`` is used unchanged.
    fix_monochrome : bool
        When true, images whose ``PhotometricInterpretation`` is
        ``"MONOCHROME1"`` are inverted.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array min-max scaled to the range 0..255. An image whose
        pixels all share one value is returned as all zeros.
    """
    # dcmread is the supported reader; read_file was a deprecated alias
    # that pydicom 3.0 removed.
    dicom = pydicom.dcmread(path)

    # Override kept from the original helper.
    # NOTE(review): presumably works around files with an inconsistent
    # BitsStored header -- confirm it is still required.
    dicom.BitsStored = 16

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM
    # data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the darkest pixel is 0, then scale the brightest to 255.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the peak would produce NaNs, and casting
        # NaN to uint8 is undefined. Return a black image instead.
        return np.zeros(data.shape, dtype=np.uint8)

    data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
# Decode one sample training DICOM and display it in grayscale.
img = read_xray(os.path.join(img_dir, '00086460a852/9e8302230c91/65761e66de9f.dcm'))
plt.figure(figsize=(12, 12))
plt.imshow(img, cmap='gray')

In [None]:
## Histogram equalization, for a more uniform pixel distribution
img = exposure.equalize_hist(
    read_xray(os.path.join(img_dir, '00086460a852/9e8302230c91/65761e66de9f.dcm'))
)
plt.figure(figsize=(12, 12))
plt.imshow(img, cmap='gray')

In [None]:
## CLAHE normalization (adaptive histogram equalization)
img = read_xray(os.path.join(img_dir, '00086460a852/9e8302230c91/65761e66de9f.dcm'))
# Rescale so the brightest pixel is 1 before equalizing.
img = exposure.equalize_adapthist(img / img.max())
plt.figure(figsize=(12, 12))
plt.imshow(img, cmap='gray')

## Merge the training sets

In [None]:
## merge study csv -- borrowed from yujiariyasu
# Strip the '_study' suffix from the study table's 'id' values and store the
# result as 'StudyInstanceUID', the key used for the merge below.
# The guard makes the cell safe to re-run: the first run deletes 'id', so an
# unguarded second run would raise KeyError.
if 'id' in df_train_study.columns:
    df_train_study['StudyInstanceUID'] = df_train_study['id'].apply(lambda x: x.replace('_study', ''))
    del df_train_study['id']
# Default (inner) merge: only rows whose StudyInstanceUID appears in both
# tables are kept.
train = df_train_image.merge(df_train_study, on='StudyInstanceUID')
train.head()

In [None]:
## -- check for class imbalance

classes = ['Negative for Pneumonia', 'Typical Appearance',
           'Indeterminate Appearance', 'Atypical Appearance']

plt.figure(figsize=(12, 6))
# One bar per label column; its height is the sum of that column over all rows.
class_counts = train[classes].values.sum(axis=0)
tick_positions = list(range(1, len(classes) + 1))
plt.bar(tick_positions, class_counts)
plt.xticks(tick_positions, classes)
plt.ylabel('Count')
plt.show()