### Note that all images are stored in paths with the form `set`/`study`/`series`/`image`

We will extract this information into a dataframe. It is needed later when running inference with the models and preparing the submission files!

In [None]:
! conda install -c conda-forge gdcm -y

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
IMG_FORMAT = ".dcm"


def collect_image_records(root, img_format=IMG_FORMAT):
    """Walk *root* and describe every file whose name ends with *img_format*.

    Images are expected at ``.../<set>/<study>/<series>/<image>``, so the three
    directory levels above each file are read back from its path.

    Args:
        root: Directory to walk recursively.
        img_format: File-name suffix (including the dot) that marks an image.

    Returns:
        A list of ``(path, name, image_id, set, series, study)`` tuples, in
        ``os.walk`` order. ``image_id`` is the file name without the suffix.

    Raises:
        IndexError: If a matching file sits fewer than three directories deep.
    """
    records = []
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith(img_format):
                continue
            img_path = os.path.join(dirname, filename)
            parts = img_path.split('/')
            # Slice the suffix off rather than calling str.rstrip(): rstrip
            # treats its argument as a *set of characters*, so an id ending
            # in e.g. "c" or "d" silently lost its trailing characters.
            img_id = filename[:len(filename) - len(img_format)]
            records.append(
                (img_path, filename, img_id, parts[-4], parts[-2], parts[-3])
            )
    return records


_image_records = collect_image_records('/kaggle/input')
IMG_PATHS = [record[0] for record in _image_records]
IMAGE_NAMES = [record[1] for record in _image_records]
IMAGE_IDS = [record[2] for record in _image_records]
SETS = [record[3] for record in _image_records]
SERIES = [record[4] for record in _image_records]
STUDIES = [record[5] for record in _image_records]

In [None]:
# One row per discovered image file; every column comes from the lists that
# were filled while walking the input directory.
df_ext = pd.DataFrame(
    dict(
        Image_Path=IMG_PATHS,
        Image_Name=IMAGE_NAMES,
        Image_ID=IMAGE_IDS,
        Set_Name=SETS,
        Series_Name=SERIES,
        Study_Name=STUDIES,
    )
)
df_ext.head()

In [None]:
# Image-level annotations, ordered by image id.
img_lvl_pth = "../input/siim-covid19-detection/train_image_level.csv"
df_img = pd.read_csv(img_lvl_pth).sort_values(by='id')
df_img.head()

In [None]:
df_img.isna().sum()

In [None]:
# Study-level labels.
std_lvl_pth = "../input/siim-covid19-detection/train_study_level.csv"
df_std = pd.read_csv(std_lvl_pth)
# Remove the literal "_study" marker from the ids. regex=False makes the
# literal match explicit instead of depending on the pandas-version default.
df_std['id'] = df_std['id'].str.replace('_study', "", regex=False)
# Rename so the column lines up with `StudyInstanceUID`, the key used when
# this frame is merged with the image-level frame.
df_std.rename({'id': 'StudyInstanceUID'}, axis=1, inplace=True)
df_std.head()

In [None]:
sub_pth = "../input/siim-covid19-detection/sample_submission.csv"
df_sub = pd.read_csv(sub_pth)
df_sub.head()

In [None]:
# Attach the study-level columns to every image row. Rows are matched on
# StudyInstanceUID; merge() defaults to an inner join, so rows without a
# counterpart on the other side are dropped.
df = df_img.merge(df_std, on='StudyInstanceUID')
df.head(3)

In [None]:
from copy import deepcopy
df_train = deepcopy(df)

In [None]:
# Collapse the four study-level indicator columns into one `study_label`
# column. Order matters: if several indicators were 1 on the same row, the
# entry listed last wins.
_STUDY_LABELS = (
    ('Negative for Pneumonia', 'negative'),
    ('Typical Appearance', 'typical'),
    ('Indeterminate Appearance', 'indeterminate'),
    ('Atypical Appearance', 'atypical'),
)
for _flag_column, _label in _STUDY_LABELS:
    df_train.loc[df_train[_flag_column] == 1, 'study_label'] = _label
df_train.drop(columns=[_flag_column for _flag_column, _ in _STUDY_LABELS], inplace=True)
# Turn "<id>_image" into "<id><IMG_FORMAT>" so ids look like image file names.
df_train['id'] = df_train['id'].str.replace('_image', IMG_FORMAT)

def get_label(string):
    """Return the first whitespace-separated token of *string*.

    Raises IndexError when *string* is empty or contains only whitespace.
    """
    tokens = string.split(None, 1)
    return tokens[0]

df_train['image_label'] = df_train['label'].map(get_label)
df_train.head(3)

In [None]:
df_train.columns

In [None]:
df_train["image_label"].value_counts()

In [None]:
df_ext.head(3)

In [None]:
string = "a29c5a68b07b"
string.zfill(15)

In [None]:
df_train.shape

In [None]:
# Derive the merge key `id` from the image file name, left-padded with zeros
# to a width of 19 characters (str.zfill leaves longer strings unchanged).
df_ext["id"] = df_ext["Image_Name"].str.zfill(19)
df_ext.shape

In [None]:
df_ext.head(3)

In [None]:
df_train.head(3)

### Create a dummy df with NaN values for test 

In [None]:
df_ext["Set_Name"].value_counts()

In [None]:
df_test = df_ext[df_ext["Set_Name"]=="test"]
df_test.head(3)

In [None]:
df_train.head(3)

### Fill df_train_test with nan data

In [None]:
# Work on an independent copy so that df_train itself is not modified when
# the test rows are added below.
df_train_test = deepcopy(df_train)
# Column names of the training frame, used to build the placeholder rows.
COLS = list(df_train_test.columns)
def subtract_lists(x,y):
    """Return the items of x that do not occur in y, keeping x's order and duplicates."""
    kept = []
    for candidate in x:
        if candidate in y:
            continue
        kept.append(candidate)
    return kept
def merge_list_to_dict(test_keys,test_values):
    """Pair each key with the value at the same position and return the mapping.

    Surplus values are ignored; a values list shorter than the keys list
    raises IndexError. A repeated key keeps the value of its last occurrence.
    """
    return {key: test_values[position] for position, key in enumerate(test_keys)}
# Template row with every column set to NaN (kept for any later cell that
# still reads TO_ATTACH).
TO_ATTACH = merge_list_to_dict(COLS, [np.nan] * len(COLS))
# Build all placeholder test rows in one go: only `id` is known, every other
# column stays NaN. This replaces a per-row DataFrame.append() loop, which
# copied the whole frame on each iteration and no longer exists in pandas 2.0.
test_rows = pd.DataFrame({"id": df_test["id"].to_numpy()}).reindex(columns=COLS)
df_train_test = pd.concat([df_train_test, test_rows], ignore_index=True)
df_train_test.head(3)

In [None]:
df_train_test.tail(3)

In [None]:
df_train_test.shape

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
df_train.shape[0] + df_test.shape[0]

In [None]:
df_train_test["id"] = df_train_test["id"].str.zfill(19)
df_train_test.head(3)

In [None]:
df_ext["id"] = df_ext["id"].str.zfill(19)
df_ext.head(3)

In [None]:
df_train_test.shape

In [None]:
df_ext.shape

In [None]:
# Join the path metadata (df_ext) with the labels (df_train_test) on the
# zero-padded `id` column; merge() defaults to an inner join.
df = df_ext.merge(df_train_test, on='id')
df.head(3)

In [None]:
df["Set_Name"].value_counts()

In [None]:
df.shape

### Utility Functions

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')
import shutil 
import tensorflow as tf
%matplotlib inline


import matplotlib.pylab as pylab
import seaborn as sns
import pprint
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import wandb

import PIL
from PIL import Image
from colorama import Fore, Back, Style
viz_counter=0

def create_dir(dir, v=1):
    """
    Make sure a directory exists, creating it (and missing parents) if needed.
    dir : The directory to be created.
    v : Verbosity; any truthy value prints what happened.
    Returns 1 when the directory was created, 0 when the path already existed.
    """
    already_there = os.path.exists(dir)
    if not already_there:
        os.makedirs(dir)
    if v:
        if already_there:
            print("Directory already existed : ", dir)
        else:
            print("Created Directory : ", dir)
    return 0 if already_there else 1

voi_lut=True
fix_monochrome=True

def _dicom_header_to_dict(dataset):
    """Convert the non-pixel elements of a pydicom dataset into a plain dict."""
    header = {}
    for element in dataset.values():
        if element.tag == (0x7fe0, 0x0010):
            # discard pixel data
            continue
        if isinstance(element.value, dicom.dataset.Dataset):
            # Recurse on the already-parsed nested dataset. The previous code
            # re-entered the file-reading entry point with a Dataset object.
            header[element.name] = _dicom_header_to_dict(element.value)
        else:
            header[element.name] = _convert_value(element.value)
    return header


def dicom_dataset_to_dict(filename):
    """Read a DICOM file and return its header dict plus an 8-bit image.

    Credit: https://github.com/pydicom/pydicom/issues/319
            https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        filename: Path of the DICOM file, passed to ``dicom.dcmread``.

    Returns:
        ``(dicom_dict, modified_image_data)``: the header as a dict keyed by
        element name (without pixel data and without 'Pixel Representation'),
        and the pixel array min-max scaled to 0..255 as ``np.uint8``.

    The module-level flags ``voi_lut`` and ``fix_monochrome`` control whether
    the VOI LUT is applied and whether MONOCHROME1 images are inverted.
    """
    dicom_header = dicom.dcmread(filename)

    #====== DICOM FILE DATA ======
    # NOTE(review): the result is discarded; presumably this forces pydicom to
    # decode every element before iterating -- confirm before removing.
    repr(dicom_header)
    dicom_dict = _dicom_header_to_dict(dicom_header)
    # pop() instead of del: files without this tag no longer raise KeyError.
    dicom_dict.pop('Pixel Representation', None)

    #====== DICOM IMAGE DATA ======
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom_header.pixel_array, dicom_header)
    else:
        data = dicom_header.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom_header.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division so the
    # result is an all-zero image instead of NaNs cast to uint8.
    if peak > 0:
        data = data / peak
    modified_image_data = (data * 255).astype(np.uint8)

    return dicom_dict, modified_image_data

def _sanitise_unicode(s):
    """Remove every NUL character from *s*, then trim surrounding whitespace."""
    without_nul = s.replace(u"\u0000", "")
    return without_nul.strip()

def _convert_value(v):
    """Convert a DICOM element value into a plain Python value.

    Dispatch is on the exact type of *v* (subclasses do not match):
    list/int/float pass through unchanged, str and bytes are sanitised
    (bytes are first decoded as ASCII with replacement), pydicom DSfloat and
    IS become float and int, and anything else becomes its repr().
    """
    t = type(v)
    if t in (list, int, float):
        return v
    if t == str:
        return _sanitise_unicode(v)
    if t == bytes:
        return _sanitise_unicode(v.decode('ascii', 'replace'))
    if t == dicom.valuerep.DSfloat:
        return float(v)
    if t == dicom.valuerep.IS:
        return int(v)
    return repr(v)


import os, fnmatch
def find(pattern, path):
    """Return the paths of all files under *path* whose name matches *pattern*.

    *pattern* is a shell-style wildcard (fnmatch), not a regular expression.
    The directory tree is searched recursively with os.walk.
    """
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
        if fnmatch.fnmatch(entry, pattern)
    ]
def props(arr):
    """Print the shape, maximum, minimum and dtype of *arr* on a single line."""
    summary = (
        "Shape :", arr.shape,
        "Maximum :", arr.max(),
        "Minimum :", arr.min(),
        "Data Type :", arr.dtype,
    )
    print(*summary)

In [None]:
path = "/kaggle/input/siim-covid19-detection/test/00188a671292/3eb5a506ccf3/3dcdfc352a06.dcm"
dicom_dict, modified_image_data = dicom_dataset_to_dict(path)
props(modified_image_data)

### Define your Required Reshape Size Here!

### Get Original Image Shapes from Dicom Images

In [None]:
from tqdm import tqdm

In [None]:
"""
Shapes that you wish to resize to
"""

# Target size in pixels. cv2.resize takes its size as (width, height), so
# Shape_Y is the output width and Shape_X the output height.
Shape_X = 512
Shape_Y = 512
image_id = []
dim0 = []
dim1 = []
splits = []
img_paths = []

for split in ['test', 'train']:
    # save_dir = f'/kaggle/tmp/{split}/'
    save_dir = f'/kaggle/working/resized_data/{split}/'
    print(split)
    os.makedirs(save_dir, exist_ok=True)

    for dirname, _, filenames in tqdm(os.walk(f'/kaggle/input/siim-covid19-detection/{split}')):
        for file in filenames:
            stem, extension = os.path.splitext(file)
            # Only DICOM files can be decoded; skip anything else instead of
            # letting the reader fail on it.
            if extension != '.dcm':
                continue
            fpath = os.path.join(dirname, file)
            dicom_dict, modified_image_data = dicom_dataset_to_dict(fpath)
            res = cv2.resize(modified_image_data, (Shape_Y, Shape_X))
            # Swap only the extension; a substring replace of 'dcm' would also
            # rewrite a matching sequence elsewhere in the name.
            save_path = os.path.join(save_dir, stem + '.png')
            cv2.imwrite(save_path, res)
            img_id = stem
            image_id.append(img_id)
            # Original (pre-resize) height and width, needed later to rescale
            # the bounding boxes.
            dim0.append(modified_image_data.shape[0])
            dim1.append(modified_image_data.shape[1])
            img_paths.append(fpath)
            splits.append(split)
# Progress output recorded from an earlier run:
#   2475 iterations,  07:34, 5.38it/s
#   12386 iterations, 36:51, 8.13it/s
print("Generation Complete!")


In [None]:
import os
import zipfile
import shutil

#taken from : https://www.kaggle.com/xhlulu/recursion-2019-load-resize-and-save-images

def zip_and_remove(path):
    """Zip every file under *path* into ``<path>.zip``, then delete *path*.

    Each file is removed right after it has been written to the archive, and
    the emptied directory tree is removed at the end. Archive member names
    are the file paths as produced by walking *path*.

    The archive is opened in a ``with`` block so it is closed (and its
    central directory written) even if adding or removing a file fails.
    """
    with zipfile.ZipFile(f'{path}.zip', 'w', zipfile.ZIP_DEFLATED) as ziph:
        for root, dirs, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                ziph.write(file_path)
                os.remove(file_path)

    shutil.rmtree(path)
save_dir = 'resized_data'
zip_and_remove(save_dir)

In [None]:
# Original image dimensions, keyed by the DICOM path they were read from.
new_df = pd.DataFrame({'Image_Path': img_paths, 'dim0': dim0, 'dim1': dim1})
new_df.head(3)

In [None]:
df.head(3)

In [None]:
df.shape

In [None]:
new_df.shape

In [None]:
# Add the original image dimensions (dim0, dim1) to the label/metadata frame,
# matching rows on the DICOM file path (inner join by default).
final_df = df.merge(new_df,on="Image_Path")
final_df.head()

In [None]:
final_df.shape

In [None]:
final_df.tail()

###  Rescaling Imgs : Bounding Box Rescaling Needed

Remember that if you rescale the images, the bounding boxes need to be rescaled as well!

##### Note : The bounding boxes have shifted due to resizing

A rectangle defined via an anchor point xy and its width and height.

The rectangle extends from xy[0] to xy[0] + width in x-direction and from xy[1] to xy[1] + height in y-direction.

```
:                +------------------+
:                |                  |
:              height               |
:                |                  |
:               (xy)---- width -----+
```

In [None]:
# import cv2
# how to load a string to json
# import ast
# jsonobj = ast.literal_eval(str(hdf))

from ast import literal_eval

In [None]:
n = len(final_df)
# Already defined above : Shape_Y,Shape_X = 512,512
# For every row, scale its bounding boxes from the original image size
# (dim0 = height, dim1 = width) to the resized image (Shape_X high, Shape_Y
# wide). Rows whose `boxes` entry is not a string get an empty string.
NEW_BOXES = []
# Iterate the columns in lockstep rather than via final_df[col][i]: that is a
# label lookup and only works while the index is exactly 0..n-1.
for raw_boxes, orig_height, orig_width in zip(
    final_df['boxes'], final_df['dim0'], final_df['dim1']
):
    if isinstance(raw_boxes, str):
        # Plain Python floats keep str(BIG_BOX) parseable by literal_eval;
        # NumPy 2 scalars would be rendered as "np.float64(...)".
        scale_x = Shape_Y / float(orig_width)
        scale_y = Shape_X / float(orig_height)
        BIG_BOX = [
            {"x": float(box['x'] * scale_x),
             "y": float(box['y'] * scale_y),
             "width": float(box['width'] * scale_x),
             "height": float(box['height'] * scale_y)}
            for box in literal_eval(raw_boxes)
        ]
    else:
        BIG_BOX = ""
    NEW_BOXES.append(str(BIG_BOX))

In [None]:
final_df['corrected_boxes'] = NEW_BOXES

In [None]:
# Correction Factors
# cfy = target width / original width (dim1): the factor applied to box x and
# width values. cfx = target height / original height (dim0): the factor
# applied to box y and height values.
final_df['cfy'] = Shape_Y/final_df['dim1']
final_df['cfx'] = Shape_X/final_df['dim0']

In [None]:
!pip install openpyxl

In [None]:
# Persist the combined table in both formats, without the index column.
# The .xlsx export is presumably why openpyxl is installed in the cell above.
final_df.to_csv('Extracted_Study_Series_Img.csv',index=False)
final_df.to_excel('Extracted_Study_Series_Img.xlsx',index=False)

### Visualize

Incoming in the next Notebook!

In [None]:
"""
subset_df = final_df[final_df["Set_Name"]=="train"].sample(n=20,random_state=2021)
subset_df.head()
for path in subset_dcm_files:
    dicom_dict, modified_image_data = dicom_dataset_to_dict(path)
    res = cv2.bitwise_and(resized_image_data,resized_image_data,mask = pred_img_preprocessed)
    fig, ax = plt.subplots(1, 3, figsize=(20, 12))
    ax[0].imshow(resized_image_data, cmap="viridis")
    ax[0].axis('off')
    ax[1].imshow(pred_img_preprocessed, cmap="viridis")    
    ax[1].axis('off')
    ax[2].imshow(res, cmap="viridis")    
    ax[2].axis('off')
    plt.savefig(str(viz_counter)+".png",dpi=300)
    viz_counter+=1
    cv2.imwrite(str(viz_counter)+".png",res)
    viz_counter+=1
    plt.show()
"""