# siim-covid19-detection preprocessing

The intention is to create a new dataset on Kaggle in which all the .dcm files
have been resized and converted to grayscale .png files.

In addition, the csv files are processed, simplified and saved.

In [None]:
# Some of the images are packed and require the gdcm module.
!pip install python-gdcm

In [None]:
import ast
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
from matplotlib.patches import Rectangle
from PIL import Image
from tqdm.auto import tqdm

In [None]:
# Size of every output image as (height, width) in pixels; this is the value
# handed to resize_image for both the train and the test images.
target_shape = (768, 768)

In [None]:
# Create a dataframe for train data.

root = "/kaggle/input/siim-covid19-detection"

# Read the csv containing the image data
df_train_image = pd.read_csv(f"{root}/train_image_level.csv")

# Handle the boxes column. Rows without a value are given the text of an
# empty list so that every entry can be parsed the same way.
df_train_image["boxes"] = df_train_image["boxes"].fillna("[]")

# Each entry is the text of a Python literal (the later cells iterate it as a
# list of dicts with "x", "y", "width" and "height" keys). ast.literal_eval
# parses literals only, so - unlike eval - nothing stored in the csv can be
# executed as code.
df_train_image["boxes"] = df_train_image["boxes"].apply(ast.literal_eval)

# Add a column containing the filepaths to the .dcm files.
def get_filepath(row):
    """Find the .dcm file that belongs to one row of the image-level csv.

    Parameters
    ----------
    row : mapping
        Must provide "id" (the part before the first underscore is used as
        the file stem) and "StudyInstanceUID" (the study folder to search).

    Returns
    -------
    str
        Path of the first matching file found below
        ``{root}/train/{StudyInstanceUID}``.

    Raises
    ------
    AssertionError
        If no file with the expected name exists below the study folder.
    """
    dcm_name = row["id"].split("_")[0] + ".dcm"
    study_dir = f"{root}/train/{row['StudyInstanceUID']}"

    # Lazily walk the study folder; the first folder holding the file wins.
    matches = (
        f"{folder}/{dcm_name}"
        for folder, _subdirs, names in os.walk(study_dir)
        if dcm_name in names
    )
    found = next(matches, None)
    if found is None:
        raise AssertionError("Could not find the file")
    return found

# Add the path of each .dcm file and an image id without the "_image" suffix.
df_train_image = df_train_image.assign(
    filepath=df_train_image.apply(get_filepath, axis=1),
    image_id=df_train_image["id"].str.replace("_image", ""),
)

# Assign a better name to the study column and keep only the needed columns.
image_columns = ["image_id", "study_id", "boxes", "filepath"]
df_train_image = df_train_image.rename(
    columns={"StudyInstanceUID": "study_id"}
)[image_columns]

# Read the csv containing the study data
study_csv_path = f"{root}/train_study_level.csv"
df_train_study = pd.read_csv(study_csv_path)

# Short label used for each one-hot column of the study-level csv.
column_to_label = {
    "Negative for Pneumonia": "negative",
    "Typical Appearance": "typical",
    "Indeterminate Appearance": "indeterminate",
    "Atypical Appearance": "atypical",
}


def one_hot_to_label(row):
    """Translate a one-hot encoded row into its short label.

    The columns are checked in the order of ``column_to_label`` and the label
    of the first column whose value equals 1 is returned.

    Raises
    ------
    AssertionError
        If none of the checked columns equals 1.
    """
    # A generator keeps the lookup lazy: columns after the first hit are
    # never read.
    hits = (
        label
        for column, label in column_to_label.items()
        if row[column] == 1
    )
    label = next(hits, None)
    if label is None:
        raise AssertionError("Something went wrong")
    return label
        
# Derive the label and the study id (the "id" column without its "_study"
# suffix, matching the column of the image dataframe), then keep only those
# two columns.
df_train_study = df_train_study.assign(
    label=df_train_study.apply(one_hot_to_label, axis=1),
    study_id=df_train_study["id"].str.replace("_study", ""),
)[["study_id", "label"]]

# Combine the two dataframes
df = pd.merge(df_train_image, df_train_study, how="outer", on="study_id")

df

In [None]:
# Create a dataframe for test data.

# Collect one record per file and build the dataframe in a single step.
# Growing a dataframe one row at a time with df.loc[i] = ... gets slower with
# every row that is added.
test_records = []
for dirpath, _dirs, files in os.walk(f"{root}/test"):
    if not files:
        # Ignore folders without files
        continue
    # The study id is taken from the second-to-last path component.
    # NOTE(review): this assumes a test/<study>/<series>/<file> layout - confirm.
    study_id = dirpath.split("/")[-2]
    test_records.extend(
        (filename.split(".")[0], study_id, f"{dirpath}/{filename}")
        for filename in files
    )

df_test = pd.DataFrame(
    test_records, columns=["image_id", "study_id", "filepath"]
)

df_test

In [None]:
def get_image(filepath):
    """Read a .dcm file to memory and convert it to an 8-bit grayscale image.

    The pixel values are stretched linearly so that the smallest value becomes
    0 and the largest 255. MONOCHROME1 images are inverted first so that black
    is always the smallest value.

    Parameters
    ----------
    filepath : str
        Path to the .dcm file.

    Returns
    -------
    PIL.Image
    """
    # dcmread is the supported reader; read_file was only a deprecated alias
    # of it and is no longer available in pydicom 3.
    dicom = pydicom.dcmread(filepath)

    # Work on a signed integer copy so that the subtractions below cannot
    # wrap around the way they would with an unsigned dtype.
    im = dicom.pixel_array.astype(int)

    # Make black 0 and white the largest number.
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        im = im.max() - im

    # Convert the range to integers from 0 to 255
    lowest = im.min()
    value_range = im.max() - lowest
    if value_range == 0:
        # A constant image has no range to stretch; dividing would be 0 / 0.
        im = np.zeros(im.shape, dtype=np.uint8)
    else:
        im = ((im - lowest) / value_range * 255).astype(np.uint8)

    # Convert to PIL image for easier processing later on.
    return Image.fromarray(im)


def resize_image(im, target_shape):
    """Fit an image into a black canvas of the given shape.

    The image is shrunk, keeping its aspect ratio, until it fits inside
    target_shape (an image that already fits is left at its size) and is then
    placed at the center of a zero-filled canvas. The given image itself is
    not modified.

    Parameters
    ----------
    im : PIL.Image
        Single-channel image.
    target_shape : tuple of ints
        (height, width)

    Returns
    -------
    PIL.Image, (x_scale, y_scale), (x_shift, y_shift)
        The new image, the factors by which the content was scaled and the
        offset of the content inside the canvas. A point (x, y) of the input
        ends up at (x * x_scale + x_shift, y * y_scale + y_shift).
    """
    original_width, original_height = im.size

    # thumbnail() resizes in place, so work on a copy to leave the caller's
    # image untouched.
    resized = im.copy()

    # PIL expects sizes as (width, height) whereas target_shape is
    # (height, width). Image.LANCZOS is the current name of the filter that
    # used to be available as Image.ANTIALIAS.
    resized.thumbnail((target_shape[1], target_shape[0]), Image.LANCZOS)

    resized_array = np.array(resized)
    resized_height, resized_width = resized_array.shape[:2]

    # The scales for bounding box calculations
    y_scale = resized_height / original_height
    x_scale = resized_width / original_width

    # Place the resized image at the center of a larger image.
    canvas = np.zeros(target_shape, dtype=np.uint8)
    y0 = (target_shape[0] - resized_height) // 2
    x0 = (target_shape[1] - resized_width) // 2
    canvas[y0:y0 + resized_height, x0:x0 + resized_width] = resized_array

    return Image.fromarray(canvas), (x_scale, y_scale), (x0, y0)

In [None]:
# TESTING
# Show five random training images with their boxes: the top row holds the
# converted originals, the bottom row the resized versions with the boxes
# transformed by the returned scales and shifts.
fig, axs = plt.subplots(2, 5, figsize=(20, 10))

sampled_rows = df.sample(5).reset_index()
for i, row in sampled_rows.iterrows():
    top_ax = axs[0, i]
    bottom_ax = axs[1, i]
    boxes = row["boxes"]

    im = get_image(row["filepath"])
    top_ax.imshow(im, cmap=plt.cm.bone)
    top_ax.set_title(row["label"])
    for box in boxes:
        top_ax.add_patch(
            Rectangle(
                (box["x"], box["y"]),
                box["width"],
                box["height"],
                linewidth=1,
                edgecolor='r',
                facecolor='none',
            )
        )

    im, (x_scale, y_scale), (x0, y0) = resize_image(im, target_shape)
    bottom_ax.imshow(im, cmap=plt.cm.bone)
    for box in boxes:
        corner = (box["x"] * x_scale + x0, box["y"] * y_scale + y0)
        bottom_ax.add_patch(
            Rectangle(
                corner,
                box["width"] * x_scale,
                box["height"] * y_scale,
                linewidth=1,
                edgecolor='r',
                facecolor='none',
            )
        )

In [None]:
# Create the directories
dir_data = "/kaggle/working/data"
dir_train = f"{dir_data}/train"
dir_test = f"{dir_data}/test"

# The train folder gets one sub-folder per label; the test folder is flat.
label_dirs = [f"{dir_train}/{label}" for label in column_to_label.values()]
for directory in [dir_train, *label_dirs, dir_test]:
    os.makedirs(directory, exist_ok=True)

def _convert_and_save(filepath, output_dir):
    """Convert one .dcm file to a resized .png stored in output_dir.

    The .png keeps the part of the original file name before the first dot.

    Returns
    -------
    (x_scale, y_scale), (x_shift, y_shift)
        The values reported by resize_image, needed to transform boxes.
    """
    im, scales, shifts = resize_image(get_image(filepath), target_shape)
    new_filename = os.path.basename(filepath).split(".")[0] + ".png"
    im.save(f"{output_dir}/{new_filename}")
    return scales, shifts


# One bar for train and test together. The with-block closes the bar even if
# a conversion raises.
with tqdm(total=df.shape[0] + df_test.shape[0]) as progress_bar:
    # Go through all the training images
    for index, row in df.iterrows():
        # Training images are stored in the folder of their label.
        (x_scale, y_scale), (x0, y0) = _convert_and_save(
            row["filepath"], f"{dir_train}/{row['label']}"
        )

        # Modify the boxes so that they match the resized image. The dicts
        # are changed in place, which is how the new coordinates reach df.
        # NOTE: running this cell a second time would transform the already
        # transformed boxes again.
        for box in row["boxes"]:
            box["x"] = box["x"] * x_scale + x0
            box["y"] = box["y"] * y_scale + y0
            box["width"] *= x_scale
            box["height"] *= y_scale

        progress_bar.update(1)

    # Go through all the test images.
    for index, row in df_test.iterrows():
        _convert_and_save(row["filepath"], dir_test)
        progress_bar.update(1)

In [None]:
# Save the dataframes as csv. The filepath column points into the Kaggle
# input folder and is left out of the saved files.
train_csv = f"{dir_data}/train.csv"
test_csv = f"{dir_data}/test.csv"
df.drop(columns="filepath").to_csv(train_csv)
df_test.drop(columns="filepath").to_csv(test_csv)

In [None]:
# Zip the images so that it is possible to create a new dataset in Kaggle.
archive_base = "/kaggle/working/data"
shutil.make_archive(base_name=archive_base, format='zip', root_dir=dir_data)

# Remove the data folder so that the zip file appears as the output. For some
# reason Kaggle will otherwise just show the images.
shutil.rmtree(path=dir_data)