source: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

In [None]:
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
def get_xray(path):
    """Read a DICOM file from disk and return the parsed dataset.

    Args:
        path: Location of the DICOM file to read.

    Returns:
        The dataset object produced by ``pydicom.dcmread``.
    """
    # ``pydicom.read_file`` is a deprecated alias of ``dcmread`` and is no
    # longer available in pydicom 3.x, so call ``dcmread`` directly.
    return pydicom.dcmread(path)

def xray_to_nparray(dicom,voi_lut = True, fix_monochrome = True):
    """Convert the pixel data of a DICOM dataset into an 8-bit array.

    Original from:
    https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        dicom: Dataset exposing ``pixel_array`` and
            ``PhotometricInterpretation``.
        voi_lut: When true, the pixels are passed through ``apply_voi_lut``
            (VOI LUT, if available by the DICOM device, transforms raw data
            to a "human-friendly" view). When false, ``pixel_array`` is used
            unchanged.
        fix_monochrome: When true, images whose photometric interpretation
            is ``"MONOCHROME1"`` are inverted.

    Returns:
        ``np.uint8`` array in which the smallest input value maps to 0 and
        the largest to 255. An image whose pixels are all equal yields an
        all-zero array.
    """
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the minimum is 0, then scale by the remaining maximum.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Every pixel is identical: dividing would be 0/0 and produce NaNs,
        # whose cast to uint8 is undefined. Return a blank image instead.
        return np.zeros(np.shape(data), dtype=np.uint8)
    data = data / peak

    return (data * 255).astype(np.uint8)

def nparray_to_img(array, size = None, keep_ratio=False, resample=Image.LANCZOS):
    """Wrap a NumPy array in a PIL image, optionally resizing it.

    Original from:
    https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel data handed to ``Image.fromarray``.
        size: Edge length of the square target in pixels. ``None`` returns
            the image at its original size.
        keep_ratio: When true, the image is reduced with ``thumbnail`` so it
            fits inside a ``size`` x ``size`` box; when false it is resized
            to exactly ``size`` x ``size``.
        resample: Resampling filter forwarded to PIL.

    Returns:
        The resulting ``PIL.Image.Image``.
    """
    im = Image.fromarray(array)
    # Identity test is the correct way to detect "no size given"; ``== None``
    # would go through the argument's own ``__eq__``.
    if size is None:
        return im
    if keep_ratio:
        # thumbnail() modifies the image in place, hence no reassignment.
        im.thumbnail((size, size), resample)
    else:
        im = im.resize((size, size), resample)

    return im

def xray_to_img(path,size = None,keep_ratio=False,voi_lut = True, fix_monochrome = True, resample=Image.LANCZOS):
    """Load a DICOM file and return it as a PIL image.

    Chains ``get_xray`` -> ``xray_to_nparray`` -> ``nparray_to_img``.

    Args:
        path: Location of the DICOM file.
        size: Forwarded to ``nparray_to_img``.
        keep_ratio: Forwarded to ``nparray_to_img``.
        voi_lut: Forwarded to ``xray_to_nparray``.
        fix_monochrome: Forwarded to ``xray_to_nparray``.
        resample: Forwarded to ``nparray_to_img``.

    Returns:
        The image returned by ``nparray_to_img``.
    """
    pixels = xray_to_nparray(
        get_xray(path),
        voi_lut=voi_lut,
        fix_monochrome=fix_monochrome,
    )
    return nparray_to_img(
        pixels,
        size=size,
        keep_ratio=keep_ratio,
        resample=resample,
    )

In [None]:
# process csv
def split_train_csv(load_path,save_dir):
    """Split a CSV into one file per distinct ``class_id`` value.

    Each output is named ``train_XX.csv`` (class id zero-padded to two
    digits) and written into ``save_dir``, which is created if missing.
    Because the subset is passed through ``reset_index()`` and then written
    with the default ``to_csv`` settings, every output carries both the new
    positional index (unnamed first column) and the original row number
    (``index`` column).

    Args:
        load_path: Path of the CSV to read; it must contain ``class_id``.
        save_dir: Folder that receives the per-class CSV files.

    Returns:
        Tuple ``(save_dir, file_names)`` where ``file_names`` lists the
        written file names in ascending class-id order.
    """
    frame = pd.read_csv(load_path)
    # One output file per distinct class, processed in ascending id order.
    ordered_ids = np.sort(frame['class_id'].unique())
    os.makedirs(save_dir, exist_ok=True)

    written = []
    for cid in tqdm(ordered_ids):
        csv_name = 'train_{:02d}.csv'.format(cid)
        subset = frame[frame['class_id'] == cid].reset_index()
        subset.to_csv(os.path.join(save_dir, csv_name))
        written.append(csv_name)
    return save_dir, written

In [None]:
# Process dicom
def get_raw_data_by_class(df,class_id,load_dir,save_dir = None):
    """Load the raw pixel arrays of every image annotated with a class.

    Args:
        df: DataFrame with ``class_id`` and ``image_id`` columns.
        class_id: Class whose images are loaded.
        load_dir: Folder containing ``<image_id>.dicom`` files.
        save_dir: Optional folder; when given, each array is also stored as
            ``<image_id>.npy`` unless that file already exists.

    Returns:
        List of ``[image_id, pixel_array]`` pairs, one per unique image id.
    """
    file_names = df[df['class_id']==class_id]['image_id'].unique()
    if save_dir is not None:
        # Loop-invariant: create the output folder once, not once per image.
        os.makedirs(save_dir,exist_ok=True)
    data = []
    for file_name in tqdm(file_names):
        path_dicom = os.path.join(load_dir,file_name+'.dicom')
        xray = get_xray(path_dicom).pixel_array
        if save_dir is not None:
            path_npy = os.path.join(save_dir,file_name+'.npy')
            # Existing dumps are left untouched so a re-run does not rewrite them.
            if not os.path.isfile(path_npy):
                np.save(path_npy,xray)
        data.append([file_name,xray])
    return data
def get_png_by_class(df,class_id,load_dir,save_dir = None):
    """Render every image annotated with a class as a PIL image.

    Args:
        df: DataFrame with ``class_id`` and ``image_id`` columns.
        class_id: Class whose images are converted.
        load_dir: Folder containing ``<image_id>.dicom`` files.
        save_dir: Optional folder; when given, each image is also saved as
            ``<image_id>.png`` unless that file already exists.

    Returns:
        List of ``[image_id, image]`` pairs, one per unique image id, where
        ``image`` is whatever ``xray_to_img`` returns with default settings.
    """
    file_names = df[df['class_id']==class_id]['image_id'].unique()
    if save_dir is not None:
        # Loop-invariant: create the output folder once, not once per image.
        os.makedirs(save_dir,exist_ok=True)
    data = []
    for file_name in tqdm(file_names):
        path_dicom = os.path.join(load_dir,file_name+'.dicom')
        img = xray_to_img(path_dicom)
        if save_dir is not None:
            path_png = os.path.join(save_dir,file_name+'.png')
            # Existing files are left untouched so a re-run does not rewrite them.
            if not os.path.isfile(path_png):
                img.save(path_png)
        data.append([file_name,img])
    return data

In [None]:
from shutil import copyfile
def copy_data_by_class(df,class_id,load_dir,save_dir):
    """Copy the DICOM files of every image annotated with a class.

    Args:
        df: DataFrame with ``class_id`` and ``image_id`` columns.
        class_id: Class whose images are copied.
        load_dir: Folder containing ``<image_id>.dicom`` files.
        save_dir: Destination folder; created if it does not exist.

    Returns:
        None. Files are copied with ``shutil.copyfile`` under the same name.
    """
    in_class = df['class_id'] == class_id
    image_ids = df.loc[in_class, 'image_id'].unique()
    os.makedirs(save_dir, exist_ok=True)
    for image_id in tqdm(image_ids):
        dicom_name = image_id + '.dicom'
        copyfile(
            os.path.join(load_dir, dicom_name),
            os.path.join(save_dir, dicom_name),
        )

In [None]:
# Root folder of the competition data (plain string: no placeholders, so no
# f-string prefix is needed).
data_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/'
# Annotation table and folder of training DICOM files inside data_dir.
train_csv_path = os.path.join(data_dir,'train.csv')
train_folder_path = os.path.join(data_dir,'train')
# Output folder for the per-class CSV files.
split_train_csv_path = './train_csv'

In [None]:
# Write one CSV per class into split_train_csv_path and keep the folder plus
# the list of written file names (ordered by ascending class id).
csv_folder, csv_names= split_train_csv(train_csv_path,split_train_csv_path)

In [None]:
# Load the rows of class 14.
# NOTE(review): csv_names is indexed by position, so entry 14 is the class-14
# file only if the class ids present are exactly 0..14 - confirm.
class_14 = pd.read_csv(os.path.join(csv_folder,csv_names[14]))
# Summary statistics, shown as the cell output.
class_14.describe()

In [None]:
# Load the rows of class 12.
# NOTE(review): csv_names is indexed by position, so entry 12 is the class-12
# file only if the class ids present start at 0 without gaps - confirm.
class_12 = pd.read_csv(os.path.join(csv_folder,csv_names[12]))
# Summary statistics, shown as the cell output.
class_12.describe()

In [None]:
# Take as many class-14 images as class 12 has: select the first N unique
# image ids of class_14 (N = number of unique image ids in class_12) and keep
# every class_14 row that belongs to one of those images.
df_class_14 = class_14[class_14['image_id'].isin(class_14['image_id'].unique()[:len(class_12['image_id'].unique())])]

In [None]:
# Store the reduced class-14 table alongside the per-class CSV files.
# NOTE(review): the file name has no ".csv" extension and the folder is
# hard-coded instead of using split_train_csv_path - confirm this is intended.
df_class_14.to_csv('./train_csv/class_14_12')

In [None]:
# Dump the raw pixel arrays of the selected class-14 images as .npy files.
# NOTE(review): the returned list of all loaded arrays is left as the cell
# result; presumably only the files on disk are needed - confirm.
get_raw_data_by_class(df_class_14,14,train_folder_path,'./train/npy_class_14_12')

In [None]:
# Render the selected class-14 images and save them as .png files.
# NOTE(review): the returned list of all images is left as the cell result;
# presumably only the files on disk are needed - confirm.
get_png_by_class(df_class_14,14,train_folder_path,'./train/png_class_14_12')

In [None]:
# Dump the raw pixel arrays of all class-12 images as .npy files.
# NOTE(review): the returned list of all loaded arrays is left as the cell
# result; presumably only the files on disk are needed - confirm.
get_raw_data_by_class(class_12,12,train_folder_path,'./train/npy_class_12')

In [None]:
# Render all class-12 images and save them as .png files.
# NOTE(review): the returned list of all images is left as the cell result;
# presumably only the files on disk are needed - confirm.
get_png_by_class(class_12,12,train_folder_path,'./train/png_class_12')

In [None]:
# Copy the original DICOM files of all class-12 images into ./train_dicom_12.
copy_data_by_class(class_12,12,train_folder_path,'./train_dicom_12')

In [None]:
# Copy the original DICOM files of the selected class-14 images into
# ./train_dicom_14.
copy_data_by_class(df_class_14,14,train_folder_path,'./train_dicom_14')

In [None]:
%%time
# Archive the folder of per-class CSV files.
!zip -r train_csv.zip ./train_csv

In [None]:
# Archive the PNG files of the selected class-14 images.
# NOTE(review): ./train/png_class_12 and ./train/npy_class_12 are written by
# earlier cells but no cell archives them - confirm whether that is intended.
!zip -r png_class_14_12.zip ./train/png_class_14_12

In [None]:
# Archive the .npy pixel dumps of the selected class-14 images.
!zip -r npy_class_14_12.zip ./train/npy_class_14_12

In [None]:
# Archive the copied class-12 DICOM files.
!zip -r dicom_class_12.zip ./train_dicom_12

In [None]:
# Archive the copied DICOM files of the selected class-14 images.
!zip -r dicom_class_14_12.zip ./train_dicom_14