In [None]:
# Install conda packages from local archives shipped in the
# `pydicom-conda-helper` input dataset, one archive per command.
# `-y` answers the confirmation prompt automatically so the cell can run
# unattended.
# NOTE(review): gdcm / libjpeg-turbo are presumably needed so pydicom can
# decode compressed pixel data — confirm against the DICOM files in use.
# NOTE(review): the archives are pinned to specific builds (several are py37
# builds); they presumably must match the kernel's Python version — verify.
!conda install '/kaggle/input/pydicom-conda-helper/libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/libgcc-ng-9.3.0-h2828fa1_19.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/gdcm-2.8.9-py37h500ead1_1.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/conda-4.10.1-py37h89c1867_0.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/certifi-2020.12.5-py37h89c1867_1.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/openssl-1.1.1k-h7f98852_0.tar.bz2' -c conda-forge -y

In [None]:
import os
import ast
import cv2
import pandas as pd
import numpy as np
import pydicom as dicom
from tqdm import tqdm
from glob import glob
from PIL import Image
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image scaled to ``size``.

    Args:
        array: Pixel data accepted by ``Image.fromarray``.
        size: Edge length, in pixels, of the square target box.
        keep_ratio: When true, the image is shrunk with ``Image.thumbnail`` so
            that it fits inside ``size`` x ``size`` while keeping its aspect
            ratio. When false, it is resized to exactly ``size`` x ``size``.
        resample: Resampling filter passed through to PIL.

    Returns:
        The resulting ``PIL.Image.Image``.
    """
    # Adapted from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image
    target_box = (size, size)
    image = Image.fromarray(array)

    if not keep_ratio:
        return image.resize(target_box, resample)

    # thumbnail() modifies the image in place and returns None, so the same
    # object is handed back to the caller.
    image.thumbnail(target_box, resample)
    return image


def _scale_to_uint8(data):
    """Robustly rescale an intensity array to ``uint8`` values in 0..255.

    The scaling window is taken from the pixels that lie close to the median,
    so that a few extreme values do not compress the useful intensity range.
    Images without any intensity spread come back as all zeros instead of
    going through a division by zero.
    """
    # Approximate the data lying within 3 standard deviations: the quantiles
    # below sit at about -1 / 0 / +1 sigma of a normal distribution, so
    # (q3 - q1) is about 2 sigma and 1.5 * (q3 - q1) is about 3 sigma.
    q1, q2, q3 = np.quantile(data, [.15865, .5, .84135])
    iqr = q3 - q1
    multiplier = 1.5
    # Original method: http://www.kaggle.com/yukiszk/robust-pixel-array-scaling

    mask = ((q2 - multiplier * iqr) < data) & (data < (q2 + multiplier * iqr))

    if data[mask].size != 0:
        p = .001
        # Shift so the low inlier quantile maps to 0; the high quantile of the
        # shifted inliers is then the width of the scaling window.
        data = data.astype(np.float32) - np.quantile(data[mask], p)
        scale = np.quantile(data[mask], 1 - p)
    else:
        # No inliers (e.g. zero spread around the median): fall back to plain
        # min-max scaling. Work in float64 so that subtracting the minimum
        # cannot wrap around in a narrow integer dtype.
        data = data.astype(np.float64)
        data = data - np.min(data)
        scale = np.max(data)

    if scale > 0:
        data = data / scale
    else:
        # Degenerate window (constant image): dividing would yield NaN/inf and
        # an undefined uint8 cast, so emit a black image instead.
        data = np.zeros(data.shape, dtype=np.float32)

    data = np.clip(data, 0, 1)
    return (data * 255).astype(np.uint8)


def robust_read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM X-ray and return it as a robustly scaled ``uint8`` array.

    Args:
        path: Location of the DICOM file.
        voi_lut: Apply the VOI LUT (if provided by the DICOM device) to turn
            the raw pixel data into a "human-friendly" view.
        fix_monochrome: Invert the intensities when the file declares
            ``PhotometricInterpretation == "MONOCHROME1"``, which would
            otherwise make the X-ray look inverted.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` with values in 0..255 and the
        same shape as the decoded pixel data.
    """
    # Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
    # dcmread is the current name of the reader; read_file is its deprecated alias.
    dicom_ = dicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom_.pixel_array, dicom_)
    else:
        data = dicom_.pixel_array

    if fix_monochrome and dicom_.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    return _scale_to_uint8(data)

In [None]:
# Convert the training DICOMs to fixed-size PNGs, one output folder per split.
splits = ['study', 'image']
# Image ids that are left out of the image-level export.
# NOTE(review): the notebook does not say why these are excluded — presumably
# known-bad or duplicated scans; confirm before reusing the list.
remove_id = [
    '61f3ac249c50',
    'a39667fe9a81',
    '267a250932bc',
    'b97c6b32105e',
    '869476b0763a',
    '49664f078f0e',
    'c3a09e8a600d',
    '35e398a5a431',
    '0bd6cd815ba9',
    '6f54e9cbd180',
    'c4b68b29a072',
    '9872a8a48f23',
    'e738c549fe8e',
    'c636ac67c19a',
    '8d4b3609ed92',
]
# Edge length (pixels) of the square PNGs that are written.
img_size = 1024
input_root = '/kaggle/input/siim-covid19-detection'
# Set for O(1) membership checks inside the row loop.
excluded_ids = set(remove_id)

for split in splits:
    save_dir = f'/kaggle/tmp/{split}/'
    os.makedirs(save_dir, exist_ok=True)
    if split == 'image':
        df = pd.read_csv(f'{input_root}/train_image_level.csv')
        rows = zip(df['StudyInstanceUID'], df['id'])
        for study_id, row_id in tqdm(rows, total=len(df), desc=split):
            # Ids in the CSV carry a suffix after '_'; the part before it is
            # the DICOM file stem.
            image_id = row_id.split('_')[0]
            if image_id in excluded_ids:
                continue
            # Sorted so the chosen file is deterministic if several match.
            img_path = sorted(glob(f'{input_root}/train/{study_id}/*/{image_id}.dcm'))
            if not img_path:
                # Report and keep going rather than aborting a long
                # conversion run with an IndexError.
                print(f'[{split}] no DICOM found for image {image_id} (study {study_id}); skipped')
                continue
            xray = robust_read_xray(img_path[0])
            img = resize(xray, size=img_size)
            img.save(os.path.join(save_dir, f'{image_id}_image.png'))
    elif split == 'study':
        df = pd.read_csv(f'{input_root}/train_study_level.csv')
        for row_id in tqdm(df['id'], total=len(df), desc=split):
            study_id = row_id.split('_')[0]
            img_path = sorted(glob(f'{input_root}/train/{study_id}/*/*.dcm'))
            if not img_path:
                print(f'[{split}] no DICOM found for study {study_id}; skipped')
                continue
            # There is a single output file per study, so when a study holds
            # several images only one can survive. Convert just that one (the
            # last in sorted order) instead of converting every image and
            # overwriting the same PNG in arbitrary glob order.
            xray = robust_read_xray(img_path[-1])
            img = resize(xray, size=img_size)
            img.save(os.path.join(save_dir, f'{study_id}_study.png'))

In [None]:
# Pack everything under /kaggle/tmp/ into a gzip-compressed archive named
# data.tar.gz in the current working directory. `-C` makes tar change into
# /kaggle/tmp/ first, so archive members are stored relative to that folder.
!tar -zcf data.tar.gz -C "/kaggle/tmp/" .