In [None]:
! conda install -c conda-forge gdcm -y

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from ast import literal_eval
import json

In [None]:
extract_path = "/kaggle/input/siim-covid-512-reshaped-corrected-bounded-box/Extracted_Study_Series_Img.csv"
df = pd.read_csv(extract_path)
df.head()

In [None]:
df.tail()

# Resized Data Paths

In [None]:
import os

data_dir = "/kaggle/input/siim-covid-512-reshaped-corrected-bounded-box/resized_data/resized_data"

# Each path is <data_dir>/<Set_Name>/<file name>, where the file name is the
# DICOM name with ".dcm" swapped for ".png" and left-padded with zeros to
# 16 characters.
New_Image_Path = [
    os.path.join(data_dir, set_name, dcm_name.replace('.dcm','.png').zfill(16))
    for dcm_name, set_name in zip(df["Image_Name"], df["Set_Name"])
]
df["New_Image_Path"] = New_Image_Path

In [None]:
len("0026720152f5.png")

In [None]:
n = 20  # number of rows to sample; view_data reads this global to decide how many images to draw
SAMPLE_SEED = 42  # fixed seed so a fresh "Restart & Run All" previews the same rows

# reset_index() (without drop) keeps the original row label in an "index"
# column and gives the sample a 0..n-1 index.
train_df = df[df["Set_Name"] == "train"]
sample_train = train_df.sample(n, random_state=SAMPLE_SEED).reset_index()

test_df = df[df["Set_Name"] == "test"]
sample_test = test_df.sample(n, random_state=SAMPLE_SEED).reset_index()

In [None]:
sample_train.head(2)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
%matplotlib inline
import cv2
plt.style.use("dark_background")

In [None]:
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import PIL
from PIL import Image
from colorama import Fore, Back, Style
viz_counter=0

def create_dir(dir, v=1):
    """
    Creates a directory (and any missing parents) without throwing an error
    if the directory already exists.

    dir : The directory to be created.
    v : Verbosity; when truthy, prints whether the directory was created.

    Returns 1 if this call created the directory, 0 if the path already existed.
    """
    try:
        # Create directly instead of testing os.path.exists() first, so nothing
        # can create the path between the check and the makedirs call.
        os.makedirs(dir)
    except FileExistsError:
        if v:
            print("Directory already existed : ", dir)
        return 0
    if v:
        print("Created Directory : ", dir)
    return 1

voi_lut=True
fix_monochrome=True

def _dicom_header_to_dict(dataset):
    """Convert the elements of a pydicom dataset into a plain dict keyed by element name.

    The pixel-data element (tag 7FE0,0010) is skipped. Values that are
    themselves datasets are converted recursively; every other value goes
    through ``_convert_value``.
    """
    header_dict = {}
    for dicom_value in dataset.values():
        if dicom_value.tag == (0x7fe0, 0x0010):
            # discard pixel data
            continue
        if isinstance(dicom_value.value, dicom.dataset.Dataset):
            # Recurse on the nested dataset itself; the public entry point
            # expects a file name and returns a tuple, so it cannot be reused here.
            header_dict[dicom_value.name] = _dicom_header_to_dict(dicom_value.value)
        else:
            header_dict[dicom_value.name] = _convert_value(dicom_value.value)
    return header_dict


def dicom_dataset_to_dict(filename):
    """Read a DICOM file and return its header as a dict plus an 8-bit image.

    Credit: https://github.com/pydicom/pydicom/issues/319
            https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    filename : path handed to ``dicom.dcmread``.

    Returns ``(dicom_dict, modified_image_data)``:
      * ``dicom_dict`` maps element names to converted values, without the
        pixel data and without the 'Pixel Representation' entry.
      * ``modified_image_data`` is the pixel array rescaled to 0-255 and cast
        to ``np.uint8``.

    Behaviour is controlled by the module-level flags ``voi_lut`` and
    ``fix_monochrome``.
    """

    dicom_header = dicom.dcmread(filename)

    #====== DICOM FILE DATA ======
    # NOTE(review): the repr() result is discarded; presumably the call is kept
    # to make pydicom parse lazily-read elements before they are iterated --
    # confirm before removing.
    repr(dicom_header)
    dicom_dict = _dicom_header_to_dict(dicom_header)

    # pop() instead of del: the entry may be absent from some files.
    dicom_dict.pop('Pixel Representation', None)

    #====== DICOM IMAGE DATA ======
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom_header.pixel_array, dicom_header)
    else:
        data = dicom_header.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom_header.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # A constant image has peak == 0 after the shift; skipping the division
        # leaves it all zeros instead of producing NaNs.
        data = data / peak
    modified_image_data = (data * 255).astype(np.uint8)

    return dicom_dict, modified_image_data

def _sanitise_unicode(s):
    return s.replace(u"\u0000", "").strip()

def _convert_value(v):
    t = type(v)
    if t in (list, int, float):
        cv = v
    elif t == str:
        cv = _sanitise_unicode(v)
    elif t == bytes:
        s = v.decode('ascii', 'replace')
        cv = _sanitise_unicode(s)
    elif t == dicom.valuerep.DSfloat:
        cv = float(v)
    elif t == dicom.valuerep.IS:
        cv = int(v)
    else:
        cv = repr(v)
    return cv
"""
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])
"""
def view_data(sample_train,view="original"):
    """Plot up to ``n`` images from ``sample_train`` on a 4x5 grid with their boxes.

    sample_train : DataFrame with a 'study_label' column plus, depending on
        ``view``, either 'New_Image_Path' and 'corrected_boxes' ("resized")
        or 'Image_Path' and 'boxes' ("original").
    view : "resized" reads each image with ``cv2.imread``; "original" decodes
        the DICOM file through ``dicom_dataset_to_dict``. Any other value
        draws nothing.

    The number of images comes from the module-level ``n``; it is capped at the
    number of rows and at the number of grid cells. Rows are taken by position,
    so the frame does not need a 0..n-1 index.
    """
    if view == "resized":
        path_column, box_column = 'New_Image_Path', 'corrected_boxes'
    elif view == "original":
        path_column, box_column = 'Image_Path', 'boxes'
    else:
        return

    fig, axs = plt.subplots(4, 5, figsize=(20,20))
    fig.subplots_adjust(hspace=.2, wspace=.2)
    axs = axs.ravel()

    n_shown = min(n, len(sample_train), len(axs))
    for i in range(n_shown):
        row = sample_train.iloc[i]
        if view == "resized":
            img = cv2.imread(row[path_column])
        else:
            _, img = dicom_dataset_to_dict(row[path_column])
        axs[i].imshow(img,cmap='gray')

        # Boxes are stored as the string form of a list of dicts; a missing
        # value (NaN) is a float, so the str check alone filters it out.
        raw_boxes = row[box_column]
        if isinstance(raw_boxes, str) and raw_boxes != "":
            for box in literal_eval(raw_boxes):
                axs[i].add_patch(Rectangle((box['x'], box['y']), box['width'], box['height'], fill=0, color='y', linewidth=2))
        axs[i].set_title(row['study_label'])
        

# View Original Train Data

In [None]:
view_data(sample_train,view="original")

# View Resized Train Data

In [None]:
view_data(sample_train,view="resized")

# Now go ahead and build some awesome Object Detection Models!

Hurray! Both views display the same data! Next, let's build a mock submission: we take predictions from a random model and use them to create the `submission.csv` file.

In [None]:
sample_test.head(2)

In [None]:
view_data(sample_test,view="resized")

In [None]:
df.tail(2)

In [None]:
!pip install openpyxl

In [None]:
df.to_csv('Resized_MetaData.csv',index=False)
df.to_excel('Resized_MetaData.xlsx',index=False)

In [None]:
def subtract_lists(x,y):
    """Return the items of ``x`` that do not occur in ``y`` (order and duplicates kept)."""
    remaining = []
    for item in x:
        if item in y:
            continue
        remaining.append(item)
    return remaining
def merge_list_to_dict(test_keys,test_values):
    """Pair each key with the value at the same position and return the resulting dict.

    A repeated key keeps the value of its last occurrence. IndexError is raised
    when ``test_values`` is shorter than ``test_keys``.
    """
    merged_dict = {}
    for position, key in enumerate(test_keys):
        merged_dict[key] = test_values[position]
    return merged_dict
# CLASSES = subtract_lists(list(set(df["study_label"])),[np.nan])
CLASSES = ['negative','indeterminate', 'typical',  'atypical'] # keep negative at start

In [None]:
# IMAGE_LABELS = subtract_lists(list(set(df["image_label"])),[np.nan])
IMAGE_LABELS = ['none','opacity']

In [None]:
np.random.seed(42)

# Get Dummy Predictions

In [None]:
sub_pth = "../input/siim-covid19-detection/sample_submission.csv"
df_sub = pd.read_csv(sub_pth)
df_sub.head()

In [None]:
df_sub.tail()

In [None]:
# Split the row labels of df_sub by id suffix: "_image" rows vs. "_study" rows.
idx_img = [int(label) for label, sub_id in df_sub["id"].items() if sub_id.endswith('_image')]
idx_std = [int(label) for label, sub_id in df_sub["id"].items() if sub_id.endswith('_study')]

In [None]:
len(idx_img)

In [None]:
len(idx_std)

In [None]:
df_sub.shape

In [None]:
# Take explicit copies: a column is added to df_sub_img afterwards, and
# assigning into a frame derived from df_sub via indexing triggers pandas'
# SettingWithCopyWarning.
df_sub_img = df_sub.iloc[idx_img].copy()
df_sub_std = df_sub.iloc[idx_std].copy()

In [None]:
df_sub_img.head()

In [None]:
df.head(2)

In [None]:
df_sub_img["Image_ID"] = df_sub_img["id"].str.replace("_image","")

In [None]:
df_img = df_sub_img.merge(df,on="Image_ID")
df_img.head()

In [None]:
df_sub_img.shape

In [None]:
df_img.shape

In [None]:
CLASSES

In [None]:
CLASS_LABELLINGS = merge_list_to_dict(CLASSES,list(range(len(CLASSES))))
CLASS_LABELLINGS

In [None]:
def get_preds(img = np.zeros((512,512))):
    """
    Returns random dummy Classes, BBoxes and Confidences for one image.

    img : array whose first two dimensions are (rows, columns); only its shape
          is used, and extra dimensions (e.g. colour channels) are ignored.

    Returns (CLASSES_IDX, BBOX, CONFS), three lists of equal length (0 to 3 items):
      * CLASSES_IDX : class indices drawn from 1, 2, 3
      * BBOX        : dicts with keys "x", "y", "width", "height"
      * CONFS       : confidences drawn from 0.7, 0.8, 0.9, 1.0

    Draws from the global NumPy random state, so results depend on np.random.seed.
    """
    # ndarray.shape is (rows, columns, ...): columns bound x/width, rows bound y/height.
    Shape_Y, Shape_X = img.shape[:2]
    Height_X = Shape_X/4
    Height_Y = Shape_Y/4
    num_preds = np.random.randint(low=0,high=4)
    CLASSES_IDX = []
    CONFS = []

    BBOX = []
    for i in range(num_preds):
        CLS = np.random.randint(low=1,high=4) # 1,2,3
        conf = np.random.randint(low=7,high=11)/10 # confidence in 0.7 .. 1.0
        # box size: 40-60 % of a quarter of the image extent along each axis
        x = np.random.randint(low=40,high=61) * Height_X/100
        y = np.random.randint(low=40,high=61) * Height_Y/100

        # top-left corner: drawn from the central half of the image;
        # randint is given integer bounds rather than floats
        init_x = np.random.randint(low=int(Height_X),high=int(Shape_X-Height_X))
        init_y = np.random.randint(low=int(Height_Y),high=int(Shape_Y-Height_Y))

        data = {"x":init_x,
                "y":init_y,
                "width":x,
                "height":y}

        BBOX.append(data)
        CLASSES_IDX.append(CLS)
        CONFS.append(conf)

    return CLASSES_IDX,BBOX,CONFS

    
get_preds()

Target:

```
Id,PredictionString
2b95d54e4be65_study,negative 1 0 0 1 1
2b95d54e4be66_study,typical 1 0 0 1 1
2b95d54e4be67_study,indeterminate 1 0 0 1 1 atypical 1 0 0 1 1
2b95d54e4be68_image,none 1 0 0 1 1
2b95d54e4be69_image,opacity 0.5 100 100 200 200 opacity 0.7 10 10 20 20
etc.
```

In [None]:
df_img["confidences"] = str([])
df_img.head(2)

In [None]:
std_value_cnts = df_img["Study_Name"].value_counts()

In [None]:
greater_than_one = std_value_cnts>1
greater_than_one.sum()

### Ignoring studies with more than one image for now.
This will be revisited once the confusion around them is resolved.

See Discussion : https://www.kaggle.com/c/siim-covid19-detection/discussion/244189 for more intricate details.

In [None]:
CLASS_LABELLINGS

In [None]:
CLASS_VALS = {v: k for k, v in CLASS_LABELLINGS.items()}
CLASS_VALS

In [None]:
"""
IMG_PREDS = []
STUDY_PREDS = []
"""
# Give every image a random dummy prediction and store it in df_img.
# Lists are stored in their string form.
for index, row in df_img.iterrows():
    img = cv2.imread(row["New_Image_Path"],0)
    CLASSES_IDX,BBOX,CONFS = get_preds(img)

    if CLASSES_IDX:
        # At least one box was predicted: translate class indices to names.
        PRED_CLASSES = [CLASS_VALS[cls_idx] for cls_idx in CLASSES_IDX]
        df_img.loc[index, "image_label"] = "opacity"
        df_img.loc[index, "study_label"] = ",".join(PRED_CLASSES)
        df_img.loc[index, "confidences"] = str(CONFS)
        df_img.loc[index, "corrected_boxes"] = str(BBOX)
    else:
        # No boxes predicted: mark the image as empty / negative.
        df_img.loc[index, "image_label"] = "none"
        df_img.loc[index, "study_label"] = "negative"
        df_img.loc[index, "confidences"] = str([])
        df_img.loc[index, "corrected_boxes"] = str([])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_img.head()

In [None]:
df_sub.head()

# Upscale the Predictions

Remember that the images were downscaled to a `(512,512)` shape earlier, so the predicted bounding boxes now need to be scaled back to the original image dimensions.

In [None]:
df_img['ecfy'] = 1/df_img['cfy']
df_img['ecfx'] = 1/df_img['cfx']

In [None]:
from ast import literal_eval

# Map each predicted box from the resized image back to original coordinates by
# multiplying with the inverse correction factors computed in the previous cell.
# NOTE(review): x/width are scaled by "ecfy" and y/height by "ecfx"; whether
# that pairing is right depends on how cfx/cfy are defined in the input CSV,
# which is not visible here -- confirm.
for index, row in df_img.iterrows():
    inv_cfy, inv_cfx = row["ecfy"], row["ecfx"]
    OLD_BOXES = [
        {"x":each_box["x"]*inv_cfy,
         "y":each_box["y"]*inv_cfx,
         "width":each_box["width"]*inv_cfy,
         "height":each_box["height"]*inv_cfx}
        for each_box in literal_eval(row["corrected_boxes"])
    ]
    df_img.loc[index, "boxes"] = str(OLD_BOXES)

In [None]:
df_img.head()

# Re-Verification of Scaling

In [None]:
view_data(df_img,view="resized")

In [None]:
view_data(df_img,view="original")

### Hooray! The scaled bounding boxes have been mapped back to the original image size

# Prepare Submission

### Image Level

In [None]:
df_img.head()

In [None]:
df_img["Pred_Img"] = ""
df_img["Pred_Std"] = ""

In [None]:
df_img.tail(2)

```
Id,PredictionString
2b95d54e4be65_study,negative 1 0 0 1 1
2b95d54e4be66_study,typical 1 0 0 1 1
2b95d54e4be67_study,indeterminate 1 0 0 1 1 atypical 1 0 0 1 1
2b95d54e4be68_image,none 1 0 0 1 1
2b95d54e4be69_image,opacity 0.5 100 100 200 200 opacity 0.7 10 10 20 20
etc.
```

In [None]:
# Build the image-level (Pred_Img) and study-level (Pred_Std) prediction strings
# for every row of df_img.
for index, row in df_img.iterrows():

    Pred_Img = ""
    Pred_Std = ""
    study_label = row["study_label"]
    if study_label == "negative":
        # Per the target format: image-level rows use "none", study-level rows
        # use "negative" (these two were previously swapped).
        Pred_Img = "none 1 0 0 1 1"
        Pred_Std = "negative 1 0 0 1 1"
        # maybe insert confidences here (instead of the initial 1)!
    else:
        # split(",") already yields a one-element list when there is no comma
        all_cls = study_label.split(",")
        bboxes =  literal_eval(row["boxes"])
        confs = literal_eval(row["confidences"])
        for each_class,each_box,each_conf in zip(all_cls,bboxes,confs):
            # opacity 0.5 100 100 200 200 opacity 0.7 10 10 20 20
            # NOTE(review): the box is written as "x y width height"; confirm
            # whether the competition expects "xmin ymin xmax ymax" instead.
            Pred_Img += "opacity "+str(round(each_conf,1))+" " + str(round(each_box["x"],2))
            Pred_Img +=  " "+ str(round(each_box["y"],2))
            Pred_Img +=  " " + str(round(each_box["width"],2))+" " + str(round(each_box["height"],2)) + " "
            # indeterminate 1 0 0 1 1 atypical 1 0 0 1 1
            Pred_Std += each_class + " 1 0 0 1 1 "
    df_img.loc[index, "Pred_Img"] = str(Pred_Img)
    df_img.loc[index, "Pred_Std"] = str(Pred_Std)

In [None]:
df_img["Pred_Img"] = df_img["Pred_Img"].str.strip()
df_img["Pred_Std"] = df_img["Pred_Std"].str.strip()

In [None]:
df_img.head(10)

In [None]:
df_img.to_csv('Prediction_Data.csv',index=False)
df_img.to_excel('Prediction_Data.xlsx',index=False)

In [None]:
# Copy the prediction strings from df_img into the submission frame.
# Lookups are built once; setdefault keeps the FIRST row per key, matching the
# previous behaviour of taking the first matching row of df_img.
study_preds = {}
for study_name, pred in zip(df_img["Study_Name"], df_img["Pred_Std"]):
    study_preds.setdefault(study_name, pred)

image_preds = {}
for image_id, pred in zip(df_img["Image_ID"], df_img["Pred_Img"]):
    image_preds.setdefault(image_id, pred)

for index, img_id in df_sub["id"].items():
    if not isinstance(img_id, str):
        # e.g. a missing id: nothing to look up, keep the existing PredictionString
        continue
    if img_id.endswith('_study'):
        lookup, key = study_preds, img_id.replace("_study","")
    elif img_id.endswith('_image'):
        lookup, key = image_preds, img_id.replace("_image","")
    else:
        # Unknown id kind: leave the row untouched rather than writing a stale value.
        continue
    if key not in lookup:
        # No prediction for this id: keep the sample-submission default.
        continue
    df_sub.loc[index, "PredictionString"] = str(lookup[key])

In [None]:
df_sub.head()

In [None]:
df_sub.tail()

In [None]:
df_sub.to_csv('Random_Submission.csv',index=False)
df_sub.to_excel('Random_Submission.xlsx',index=False)