# Pre-processing

In [None]:
# Install pydicom helper packages
# Installs local conda archives from the attached `pydicom-conda-helper`
# input: gdcm, libjpeg-turbo and supporting packages (libgcc-ng, conda,
# certifi, openssl). pydicom itself is not among them.
# NOTE(review): presumably gdcm/libjpeg-turbo provide the decoders pydicom
# needs for compressed pixel data - confirm against the test DICOMs.
!conda install '/kaggle/input/pydicom-conda-helper/libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/libgcc-ng-9.3.0-h2828fa1_19.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/gdcm-2.8.9-py37h500ead1_1.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/conda-4.10.1-py37h89c1867_0.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/certifi-2020.12.5-py37h89c1867_1.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/openssl-1.1.1k-h7f98852_0.tar.bz2' -c conda-forge -y

In [None]:
# Import packages
import os
import glob
import cv2
import pickle
import json
import numpy as np
import pandas as pd

from pathlib import Path
from PIL import Image
from tqdm import tqdm
from tqdm.auto import tqdm as tqdm_auto

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Args
# Preprocessing options
debug = False      # when True, only the first few DICOMs are processed
image_size = 2048  # side length (px) of the square JPEGs that get written

# Input / output locations
save_dir = Path('/kaggle/working/data/siim')
data_dir = Path('/kaggle/input/siim-covid19-detection')
test_dcm_dir = data_dir.joinpath('test')
test_csv_path = data_dir.joinpath('sample_submission.csv')

In [None]:
# Read csv
print(f'Read csv from {test_csv_path}')
sub = pd.read_csv(test_csv_path)
# The sample submission mixes two kinds of rows, told apart by the id suffix.
study_sub = sub[sub.id.str.endswith('study')]  # Study-only dataframe
image_sub = sub[sub.id.str.endswith('image')]  # Image-only dataframe

# Read dicom file list
print(f'Read dcm from {test_dcm_dir}')
# Sorted so the processing order (and the debug subset) is the same on every
# run; the order returned by glob depends on the filesystem.
dcm_paths = sorted(test_dcm_dir.glob('**/*.dcm'))

In [None]:
# Subsample for debugging
if debug:
    # Keep the first 10 DICOMs and only the submission rows that refer to them.
    dcm_paths = dcm_paths[:10]

    # Ids are rebuilt from the path: .../<study>/<folder>/<file>.dcm
    debug_study_ids = {str(p).split('/')[-3] + '_study' for p in dcm_paths}
    debug_image_ids = {str(p).split('/')[-2] + '_image' for p in dcm_paths}
    studies = list(debug_study_ids)
    images = list(debug_image_ids)

    keep_study_rows = study_sub.id.isin(studies)
    keep_image_rows = image_sub.id.isin(images)
    study_sub = study_sub[keep_study_rows]
    image_sub = image_sub[keep_image_rows]

In [None]:
# Make directories to save jpg
# The folder is named after the output resolution, e.g. "<size>x<size>".
jpg_dir = save_dir.joinpath(f'{image_size}x{image_size}')
print(f'Create directory to save jpg: {jpg_dir}')
jpg_dir.mkdir(exist_ok=True, parents=True)

In [None]:
def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Load a DICOM file and return its pixels as an 8-bit array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Location of the DICOM file, handed straight to pydicom.
        voi_lut: If True, pass the pixel data through ``apply_voi_lut``
            (VOI LUT, when provided by the device, maps raw data to a
            "human-friendly" view); otherwise use the raw pixel array.
        fix_monochrome: If True, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``.

    Returns:
        ``np.uint8`` array with the shape of the pixel data, min-max scaled
        to the 0..255 range. A constant image is returned as all zeros.
    """
    # `dcmread` is the current name of the deprecated `read_file` alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0; dividing would yield NaNs that turn
    # into arbitrary values when cast to uint8, so leave it at zero instead.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data


def resize_xray(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image scaled to ``size``.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel array accepted by ``Image.fromarray``.
        size: Target edge length in pixels.
        keep_ratio: If True, shrink with ``thumbnail`` inside a
            ``(size, size)`` box; otherwise resize to exactly
            ``(size, size)``.
        resample: PIL resampling filter passed to the resize call.

    Returns:
        The resulting PIL image.
    """
    target = (size, size)
    picture = Image.fromarray(array)

    if not keep_ratio:
        return picture.resize(target, resample)

    # thumbnail() works in place and returns nothing.
    picture.thumbnail(target, resample)
    return picture

In [None]:
# Convert dcm to jpg
# Every DICOM is decoded even when its JPEG already exists, because the
# original width/height are recorded for each image in test_data.csv.
image_ids = []
folder_ids = []
study_ids = []
widths = []
heights = []
for path in tqdm(dcm_paths, total=len(dcm_paths)):
    # Path layout: .../<study_id>/<folder_id>/<image_name>.dcm
    study_id = path.parent.parent.name
    folder_id = path.parent.name
    # `stem` drops only the final extension (str.replace would also remove
    # any other '.dcm' occurrence inside the name).
    image_name = path.stem

    xray = read_xray(path)
    im_path = jpg_dir / (image_name + '.jpg')
    if not im_path.exists():
        im = resize_xray(xray, size=image_size)
        im.save(im_path)

    image_ids.append(image_name)
    folder_ids.append(folder_id)
    study_ids.append(study_id)
    # Original (pre-resize) dimensions: shape is (rows, cols) = (height, width).
    widths.append(xray.shape[1])
    heights.append(xray.shape[0])

test_df = pd.DataFrame.from_dict({'id': image_ids,
                                  'folder_id': folder_ids,
                                  'StudyInstanceUID': study_ids,
                                  'width': widths,
                                  'height': heights})
test_df.to_csv(save_dir / 'test_data.csv', index=False)

In [None]:
# Convert meta to mmdet format
# Builds a COCO-style annotation file that lists the test JPEGs only
# ("annotations" stays empty).
anno_dir = save_dir / 'mmdet_annos'
anno_path = anno_dir / 'test.json'

mmdet_anno = {
    "info": {
        "description": "SIIM Covid-19",
        "year": 2021,
        "url": None,
        "version": None,
        "contributor": None,
    },
    "licenses": [
        {
            "id": 0,
            "url": None,
            "name": None
        }
    ],
    "categories": [
        {
            "id": 0,
            "supercategory": None,
            "name": "opacity"
        }
    ],
    "type": "instances",
    "annotations": [],
    "images": []
}

for idx, row in test_df.iterrows():
    image = {
        'license': 0,
        'url': None,
        'file_name': row.id + '.jpg',
        # The JPEGs were resized to a square, so the resized size is recorded
        # here rather than the original DICOM size.
        'height': image_size,
        'width': image_size,
        'date_captured': None,
        'id': idx
    }
    mmdet_anno['images'].append(image)

anno_dir.mkdir(parents=True, exist_ok=True)
print(f'Save mmdet anno to {anno_path}')
# Use a context manager so the file is flushed and closed before anything
# else reads it.
with open(anno_path, 'w') as anno_file:
    json.dump(mmdet_anno, anno_file, indent=2)

# Inference

In [None]:
# Install mmdetection packages
# The working directory is switched to the bundled mmdetection checkout; the
# relative `wheels/...` and `requirements/...` paths below resolve against it.
%cd /kaggle/input/mmdetection-swin-env/mmdetection_swin
# `--no-index -f <dir>` installs from the local wheel folders instead of PyPI.
!pip install --no-index -f wheels/mmcv-full/ mmcv-full
!pip install --no-index -f wheels/runtime/ -r requirements/runtime.txt
# Installs the checkout in the current directory with its "runtime" extra.
!pip install ".[runtime]"
!pip install --no-index -f wheels/ensemble-boxes/ ensemble-boxes

# Imported here rather than in the top import cell because the package is
# only installed by the pip command above.
import ensemble_boxes

In [None]:
# Run test
config = Path('configs/siim/swin_kaggle_many.py')
out_dir = Path('/kaggle/working/output')
checkpoints = [
    '/kaggle/input/swin-checkpoints/swin_v3_expt8_fold0/epoch_30.pth',
    '/kaggle/input/swin-checkpoints/swin_v3_expt8_fold1/epoch_20.pth',
    '/kaggle/input/swin-checkpoints/swin_v3_expt8_fold2/epoch_15.pth',
    '/kaggle/input/swin-checkpoints/swin_v3_expt8_fold3/epoch_27.pth',
    '/kaggle/input/swin-checkpoints/swin_v3_expt8_fold4/epoch_22.pth',
]
out_pkls = [out_dir / f'result_{n}.pkl' for n in range(len(checkpoints))]

for ckpt, out_pkl in zip(checkpoints, out_pkls):
    out_dir.mkdir(parents=True, exist_ok=True)
    test_cmd = f'python tools/test.py {config} {ckpt} --out {out_pkl}'
    print(test_cmd)
    !{test_cmd}

In [None]:
# Visualize for debugging
if debug:
    import time
    from IPython.display import clear_output
    import matplotlib.pyplot as plt
    %matplotlib inline

    threshold = 0.3
    visual_size = 512
    colors = [[255, 0, 0], [0,255,0], [0,0,255], [255,0,255], [255,255,0]]
    
    def bbox_to_contour(bbox):
        contour = [
            [bbox[0], bbox[1]],
            [bbox[2], bbox[1]],
            [bbox[2], bbox[3]],
            [bbox[0], bbox[3]],
            [bbox[0], bbox[1]],
        ]
        contour = np.array(contour, dtype=int)
        return contour

    annotations = json.load(open(anno_path, 'r'))
    images = {v['id']: v['file_name'] for v in annotations['images']}
    
    preds = {k: [] for k in images.keys()}
    scores = {k: [] for k in images.keys()}
    for out_pkl in out_pkls:
        predictions = pickle.load(open(out_pkl, 'rb'))
        for i, pred in enumerate(predictions):
            preds[i].append(pred[0][0][:, :4])
            scores[i].append(pred[0][0][:, -1])

    for i in images.keys():
        image_path = os.path.join(jpg_dir, images[i])
        image_org = cv2.imread(image_path)
        image = cv2.resize(image_org, (visual_size, visual_size))
        
        # raw results
        for k, (pred, score) in enumerate(zip(preds[i], scores[i])):
            pred = (np.array(pred, dtype=float) * visual_size / image_size).astype(int)
            score = np.array(score)
            for pr, sc in zip(pred[score > threshold], score[score > threshold]):
                cv2.drawContours(image, [bbox_to_contour(pr)], -1, colors[k], 3)
                cv2.putText(image, f'{sc:.4f}', pr[:2] - np.array([0, 10]), cv2.FONT_HERSHEY_SIMPLEX, 1, colors[k], 2, cv2.LINE_AA)

        clear_output(wait=True)
        plt.imshow(image)
        plt.show()
        
        # ensemble
        iou_thr = 0.5
        skip_box_thr = 0.0001
        
        boxes_list = [x / image_size for x in preds[i]]
        scores_list = [x for x in scores[i]]
        labels_list = [[0] * len(x) for x in scores[i]]
        weights = [1 for x in scores[i]]

        boxes_ens, scores_ens, _ = ensemble_boxes.weighted_boxes_fusion(boxes_list, 
                                                                        scores_list, 
                                                                        labels_list, 
                                                                        weights=weights, 
                                                                        iou_thr=iou_thr, 
                                                                        skip_box_thr=skip_box_thr,
                                                                        conf_type='max')
        image = cv2.resize(image_org, (visual_size, visual_size))
        pred = (np.array(boxes_ens, dtype=float) * visual_size).astype(int)
        score = np.array(scores_ens)

        for pr, sc in zip(pred[score > threshold], score[score > threshold]):
            cv2.drawContours(image, [bbox_to_contour(pr)], -1, colors[0], 3)
            cv2.putText(image, f'{sc:.4f}', pr[:2] - np.array([0, 10]), cv2.FONT_HERSHEY_SIMPLEX, 1, colors[0], 2, cv2.LINE_AA)
        plt.imshow(image)
        plt.show()
        time.sleep(1)

# Post-processing

In [None]:
# Parse result pickles
# Collects, per image name, one box array and one score array per checkpoint.
with open(anno_path, 'r') as anno_file:
    annotations = json.load(anno_file)
# Strip only the extension so the keys match the ids stored in test_df even
# when a file name contains additional dots.
images = {v['id']: os.path.splitext(v['file_name'])[0] for v in annotations['images']}

bboxes = {k: [] for k in images.values()}
scores = {k: [] for k in images.values()}
for out_pkl in out_pkls:
    # These pickles are the result files requested from tools/test.py via
    # --out; only load pickles from a trusted source.
    with open(out_pkl, 'rb') as pkl_file:
        predictions = pickle.load(pkl_file)
    # The i-th prediction is matched to the annotation image with id i.
    for i, pred in enumerate(predictions):
        image_name = images[i]
        # NOTE(review): pred[0][0] is presumably the per-class array of
        # [x1, y1, x2, y2, score] rows - confirm against the model config.
        bboxes[image_name].append(pred[0][0][:, :4])
        scores[image_name].append(pred[0][0][:, -1])

In [None]:
# Ensemble
# Merge the per-checkpoint predictions of each image into one set of boxes.
#   'wbf'    - weighted boxes fusion (ensemble_boxes)
#   'concat' - keep every box from every checkpoint
ensemble = 'wbf'

bboxes_final, scores_final = {}, {}

if ensemble == 'wbf':
    for k in images.values():
        # Boxes are scaled to fractions of image_size for the fusion and
        # scaled back to pixels afterwards.
        bboxes_ = [x / image_size for x in bboxes[k]]
        scores_ = scores[k]
        labels_ = [[0] * len(x) for x in scores[k]]  # single class: 0
        weights_ = [1 for x in scores[k]]            # all checkpoints weighted equally

        bboxes_, scores_, _ = ensemble_boxes.weighted_boxes_fusion(bboxes_,
                                                                   scores_,
                                                                   labels_,
                                                                   weights=weights_,
                                                                   iou_thr=0.5,
                                                                   skip_box_thr=0.0001,
                                                                   )
        bboxes_final[k] = bboxes_ * image_size
        scores_final[k] = scores_

elif ensemble == 'concat':
    for k in images.values():
        bboxes_final[k] = np.concatenate(bboxes[k])
        scores_final[k] = np.concatenate(scores[k])

else:
    # A bare `raise` outside an except block only produces
    # "RuntimeError: No active exception to reraise"; report the real problem.
    raise ValueError(f"Unknown ensemble method {ensemble!r}; expected 'wbf' or 'concat'")

In [None]:
# Make submission file
# One row per image with its "opacity <score> <x1> <y1> <x2> <y2>" predictions,
# followed by one row per study id with an empty prediction string.
submission = []
for idx, row in test_df.iterrows():
    image_id = row['id']
    width = row['width']
    height = row['height']

    # Work on a copy: rescaling the stored array in place would scale the
    # boxes again every time this cell is re-run.
    bbox = bboxes_final[image_id].copy()
    score = scores_final[image_id]

    # Map boxes from the resized (image_size x image_size) JPEG back to the
    # original DICOM resolution.
    bbox[:, 0] = bbox[:, 0] * width / image_size
    bbox[:, 1] = bbox[:, 1] * height / image_size
    bbox[:, 2] = bbox[:, 2] * width / image_size
    bbox[:, 3] = bbox[:, 3] * height / image_size
    bbox = bbox.astype(int)

    pred_str = ' '.join(['opacity {:f} {:d} {:d} {:d} {:d}'.format(s, *b) for s, b in zip(score, bbox)])
    case = {'Id': image_id + '_image',
            'PredictionString': pred_str}
    submission.append(case)

for idx, row in study_sub.iterrows():
    case = {'Id': row['id'],
            'PredictionString': ''}
    submission.append(case)

submission_df = pd.DataFrame(submission)
submission_df

In [None]:
# Switch to /kaggle/working so the relative 'submission.csv' path lands there.
%cd /kaggle/working
submission_df.to_csv('submission.csv', index=False)

In [None]:
!rm -rf data output