I believe that standardizing the dataset could help increase the accuracy of any training run, so I wrote this script to crop the X-ray images. I also prepared a script that shifts the bounding-box coordinates to match the cropped images.

[Cropped and Resized Images Dataset](https://www.kaggle.com/anhlv2312/vinbigdata-xray-cropped-512)

![](https://i.ibb.co/fk3xT47/Screenshot-from-2021-01-13-00-21-24.png)


In [None]:
import numpy as np
import pandas as pd
import os
import sys
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from skimage import exposure
from skimage import transform
import warnings
from tqdm.notebook import tqdm
import imageio

# Let pandas print wide/long frames in full and silence library warnings
for _option, _setting in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _setting)
warnings.filterwarnings('ignore')

# Multiplier applied to the brightness thresholds below when they are compared
# against the median profiles in calculate_margins
GRAY_SCALE = 1

# Thresholds used by calculate_margins
# HIP_THRESHOLD[0]: a lateral margin is only searched for when the outermost
#                   column's median brightness is below this value
# HIP_THRESHOLD[1]: brightness level whose upward crossing marks the left/right margin
HIP_THRESHOLD = [0.3, 0.4]
# Brightness level whose upward crossing (scanning rows from the top) marks the top margin
NECK_THRESHOLD = 0.2
# Brightness level whose upward crossing (scanning rows downwards) marks the bottom margin
LUNG_THRESHOLD = 0.7
# Side length, in pixels, of the square images written by generate_output
OUTPUT_DIM = 1024

# Root folder of the competition data (note the trailing slash: some cells concatenate onto it)
dataset_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/'

# Read dicom images into numpy array (https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way)
def read_xray(path, voi_lut=True, monochrome=True, normalize=True):
    """Load a DICOM file and return its pixels min-max rescaled to [0, 1].

    Args:
        path: Location of the DICOM file to read.
        voi_lut: When true, pass the raw pixels through ``apply_voi_lut``
            using the metadata stored in the file.
        monochrome: When true, invert the pixel values of files whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``.
        normalize: When true, histogram-equalise the image with
            ``exposure.equalize_hist``.

    Returns:
        A float array in which the smallest value is 0 and the largest is 1.
        A constant image (no brightness range) is returned as all zeros.
    """
    # dcmread is the supported reader; read_file was a deprecated alias of it
    # and no longer exists in pydicom 3.0
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    if monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    if normalize:
        data = exposure.equalize_hist(data)

    # Min-max rescale; guard the division so a constant image yields zeros
    # instead of an array of NaNs
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        return np.zeros(data.shape, dtype=float)

    return data / peak
  

# Find the first candidate index at which a brightness profile rises through a threshold
def _find_crossing(profile, threshold, candidates, step=1):
    """Return the first ``i`` in ``candidates`` with ``profile[i] < threshold < profile[i + step]``.

    Returns ``None`` when no candidate satisfies the condition.
    """
    for i in candidates:
        if profile[i] < threshold < profile[i + step]:
            return i
    return None


# Calculate the margin of the chest
def calculate_margins(data):
    """Locate the chest region of an x-ray and return its margins and the cropped image.

    The margins are derived from the median brightness of each row and each
    column, by looking for the first place where that median rises through
    HIP_THRESHOLD / NECK_THRESHOLD / LUNG_THRESHOLD (each scaled by GRAY_SCALE).
    The thresholds are compared directly against the medians, so ``data`` is
    presumably expected in the [0, 1] range that ``read_xray`` returns.

    Args:
        data: 2-D brightness array indexed as ``[row, column]``.

    Returns:
        A ``(margins, cropped)`` tuple. ``margins`` is
        ``[top, bottom, left, right]`` in pixel indices of ``data``;
        ``cropped`` is ``data[top:bottom, left:right]`` after histogram
        equalisation.
    """
    height = data.shape[0]
    width = data.shape[1]

    # Get the median brightness value of every row (x_scale) and every column (y_scale)
    x_scale = np.median(data, axis=1)
    y_scale = np.median(data, axis=0)

    # Initialise default margin value [top, bottom, left, right]
    margins = [0, height, 0, width]

    hip_level = HIP_THRESHOLD[1] * GRAY_SCALE

    # Calculate the left margin: only searched when the leftmost column is dark,
    # scanning the left third of the image from the edge inwards
    if y_scale[0] < HIP_THRESHOLD[0]:
        found = _find_crossing(y_scale, hip_level, range(width // 3))
        if found is not None:
            margins[2] = found

    # Calculate the right margin: same idea, scanning the right third from the
    # edge inwards (hence the negative step)
    if y_scale[-1] < HIP_THRESHOLD[0]:
        found = _find_crossing(
            y_scale, hip_level, range(width - 1, width // 3 * 2, -1), step=-1
        )
        if found is not None:
            margins[3] = found

    chest_width = margins[3] - margins[2]

    # Calculate the top margin, looping until the median brightness reaches the
    # neck threshold. The scan covers half the chest width, but is capped at
    # height - 1 so that looking one row ahead never runs past the last row of
    # a very wide, short image.
    found = _find_crossing(
        x_scale,
        NECK_THRESHOLD * GRAY_SCALE,
        range(min(chest_width // 2, height - 1)),
    )
    if found is not None:
        margins[0] = found

    # Crop out the left hip and right hip to get the middle part
    middle_part = data[:, margins[2]:margins[3]]

    # Crop out the lungs: columns 1/6..3/6 and 3/6..5/6 of the middle part
    sixth = chest_width // 6
    left_lung = middle_part[:, sixth:sixth * 3]
    right_lung = middle_part[:, sixth * 3:sixth * 5]

    # For each lung: normalise the brightness, take the per-row median and look
    # for the bottom margin starting half a chest width below the top margin.
    # When nothing is found the bottom margin falls back to the top margin.
    lung_level = LUNG_THRESHOLD * GRAY_SCALE
    search_rows = range(margins[0] + chest_width // 2, height - 1)
    bottoms = []
    for lung in (left_lung, right_lung):
        profile = np.median(exposure.equalize_hist(lung), axis=1)
        found = _find_crossing(profile, lung_level, search_rows)
        bottoms.append(margins[0] if found is None else found)

    # Set the bottom margin to the max of left lung and right lung
    margins[1] = max(bottoms)
    # If the bottom margin is above the bottom of the square, crop at the row that makes the image a square
    margins[1] = max(margins[1], margins[0] + chest_width)
    # If the square is taller than the original image, the bottom margin is the height of the original image
    margins[1] = min(margins[1], height)

    # Crop out the chest and normalise the image one last time
    cropped = exposure.equalize_hist(middle_part[margins[0]:margins[1], :])

    return margins, cropped


In [None]:
def show_xray_data_cropped(image_id, data, cropped):
    """Show the original image, the cropped image and both brightness profiles in one row.

    Args:
        image_id: Text used as the figure title.
        data: The original 2-D image.
        cropped: The cropped 2-D image.
    """
    figure = plt.figure(figsize=(15, 3))
    figure.suptitle(image_id, y=0)

    # Panels 1 and 2: the two images
    image_panels = [(data, 'Original'), (cropped, 'Cropped')]
    for position, (image, label) in enumerate(image_panels, start=1):
        axis = figure.add_subplot(1, 4, position)
        axis.imshow(image, cmap=plt.cm.bone)
        axis.set_title(label)

    # Panels 3 and 4: median and mean brightness of the original image,
    # reduced along axis 1 (per row) and axis 0 (per column) respectively
    profile_panels = [(1, 'Brightness Scale by X'), (0, 'Brightness Scale by Y')]
    for position, (reduce_axis, label) in enumerate(profile_panels, start=3):
        axis = figure.add_subplot(1, 4, position)
        axis.set_title(label)
        axis.set_ylim([0, GRAY_SCALE])
        axis.plot(np.median(data, axis=reduce_axis))
        axis.plot(data.mean(reduce_axis))

    plt.show()

In [None]:
# Load the training annotations, drop the rows with class_id 14 and collect
# the ids of the images that remain
df = pd.read_csv(dataset_dir + 'train.csv')
df = df.loc[df['class_id'] != 14]
image_ids = df['image_id'].unique()

# Images to preview. The hand-picked ids below are kept for reference;
# uncomment a line to add those images to the preview.
samples = []
# samples.extend(['363dc405e14ed95659d88707f54730de','414ae85a6ec97db19ed913bde0062b11','ef63342a9d28339d09338c16573066c6'])
# samples.extend(['a398135fec0dd0d8239d5b6d8d24454b','cefc63f9ff49d9da82c49144f05a13cd','2be0cff9073424bcaf946885d1c1adf5'])
# samples.extend(['cb9658d61c84a99ba31665f40cb0788d','4b33db392748079f75a5250a15840b74','1d8f4d5daf11f2b01695b71a862aa813'])
# samples.extend(['d16b67fee07971da41a3d08707ccd864','66396f621903b00a1b7e1f54c8e5e8b3','eebb4b0a4472b69b72b1004b5b4bfcaa'])
# samples.extend(['6c79f2551808438721052023e043ab4d'])
# Ten ids drawn at random (np.random.choice samples with replacement by default)
samples += list(np.random.choice(image_ids, 10))

# Crop every sampled image and display it next to the original
for image_id in samples:
    dicom_path = os.path.join(dataset_dir, 'train', f'{image_id}.dicom')
    data = read_xray(dicom_path)
    margins, cropped = calculate_margins(data)
    show_xray_data_cropped(image_id, data, cropped)
    

In [None]:
def generate_output(image_ids, input_dir, ouput_dir_name):
    """Crop, resize and save every listed x-ray as a PNG, and write a CSV summary.

    Args:
        image_ids: Ids of the images to process; each one is read from
            ``<input_dir>/<image_id>.dicom``.
        input_dir: Folder containing the DICOM files.
        ouput_dir_name: Folder the PNG files are written to (created when
            missing). The summary is written next to it as
            ``<ouput_dir_name>.csv``.

    The CSV has one row per image id with the original array shape
    (``dimensions``), the ``[top, bottom, left, right]`` list returned by
    ``calculate_margins`` (``margins``) and the same four values as separate
    ``top`` / ``bottom`` / ``left`` / ``right`` columns, which are the column
    names ``shift_bboxes`` reads.
    """
    os.makedirs(ouput_dir_name, exist_ok=True)
    records = []

    for image_id in tqdm(image_ids):
        data = read_xray(os.path.join(input_dir, image_id + '.dicom'))
        margins, cropped = calculate_margins(data)

        # Resize the crop to a square and convert to 8-bit for the PNG
        resized = transform.resize(cropped, [OUTPUT_DIM, OUTPUT_DIM])
        output = (resized * 255).astype(np.uint8)
        imageio.imwrite(os.path.join(ouput_dir_name, image_id + '.png'), output)

        top, bottom, left, right = margins
        records.append({
            'image_id': image_id,
            'dimensions': data.shape,
            'margins': margins,
            # Numeric copies of the margins: the list above is stored in the
            # CSV as text, which cannot be used for arithmetic after reading
            'top': top,
            'bottom': bottom,
            'left': left,
            'right': right,
        })

    # Explicit column list so that an empty image_ids still yields a valid frame
    output_df = pd.DataFrame(
        records,
        columns=['image_id', 'dimensions', 'margins', 'top', 'bottom', 'left', 'right'],
    )

    output_df.to_csv(ouput_dir_name + '.csv')
    print(output_df)
    

In [None]:
def shift_bboxes(train_csv=None,
                 cropped_csv='../input/vinbigdata-xray-cropped-512/train.cropped.csv',
                 output_csv='train.shifted.csv'):
    """Translate the bounding boxes of the annotations into cropped-image coordinates.

    Every box is shifted by the ``left`` / ``top`` margin of its image and then
    clipped to the crop, i.e. to ``[0, right - left]`` horizontally and
    ``[0, bottom - top]`` vertically. Rows without coordinates, and rows whose
    image has no margins, keep NaN coordinates. Only a shift is applied here;
    the coordinates are not rescaled to the size of the resized images.

    Args:
        train_csv: Annotation CSV with ``image_id``, ``x_min``, ``y_min``,
            ``x_max`` and ``y_max`` columns. Defaults to ``train.csv`` inside
            ``dataset_dir``.
        cropped_csv: CSV with ``image_id``, ``top``, ``bottom``, ``left`` and
            ``right`` columns describing the crop of every image.
        output_csv: Path the shifted annotations are written to.

    Returns:
        The merged DataFrame with the shifted coordinates (the same data that
        is written to ``output_csv``).
    """
    if train_csv is None:
        train_csv = os.path.join(dataset_dir, 'train.csv')

    train = pd.read_csv(train_csv)
    cropped = pd.read_csv(cropped_csv)
    train = pd.merge(train, cropped, on='image_id', how='left')

    print(train)

    axes = (
        ('x_min', 'x_max', 'left', 'right'),
        ('y_min', 'y_max', 'top', 'bottom'),
    )
    for low, high, origin, limit in axes:
        extent = train[limit] - train[origin]
        shifted_low = train[low] - train[origin]
        shifted_high = train[high] - train[origin]

        # mask() only replaces where the comparison is True, so NaN
        # coordinates (comparison is False) are left untouched
        train[low] = shifted_low.mask(shifted_low < 0, 0)
        train[high] = shifted_high.mask(shifted_high > extent, extent)

    print(train)
    train.to_csv(output_csv, index=False)
    return train

In [None]:
# train_df = pd.read_csv(dataset_dir + 'train.csv')
# train_df = train_df[train_df['class_id'] != 14]
# train_ids = train_df['image_id'].unique()
# generate_output(train_ids, os.path.join(dataset_dir,'train'), 'train_cropped_' + str(OUTPUT_DIM))

In [None]:
# test_df = pd.read_csv(dataset_dir + 'sample_submission.csv')
# test_ids = test_df['image_id'].unique()
# generate_output(test_ids, os.path.join(dataset_dir,'test'), 'test_cropped_' + str(OUTPUT_DIM))