In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pydicom as dicom

In [None]:
# Load the training annotations and collect the sorted distinct values of
# the class_name column.
train_df = pd.read_csv(
    '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train.csv'
)
class_name = np.unique(train_df['class_name']).tolist()
print(f'The total number of classes is {len(class_name)}')
print('Five random rows of training dataframe train_df are shown below. ')
train_df.sample(5)

In [None]:
# Number of annotation rows per (class_name, class_id) pair, ordered by class_id.
class_sample_counts = (
    train_df
    .groupby(['class_name', 'class_id'])
    .size()
    .reset_index(name='count')
    .sort_values(by='class_id')
)
class_sample_counts

In [None]:
# Bar chart of the annotation count per class; the tick labels are rotated
# by 90 degrees so the class names do not overlap.
plt.figure(figsize=(15, 5))
sns.set(font_scale=1.5)
count_axes = sns.barplot(data=class_sample_counts, x='class_name', y='count')
g = count_axes.set_xticklabels(count_axes.get_xticklabels(), rotation=90)

# Preprocessing data

In [None]:
# Keep only the annotation rows whose class_id is not 14
# (presumably the "no finding" class of this dataset -- confirm against the data).
is_abnormal_row = train_df['class_id'].ne(14)
train_df_abnormal = train_df.loc[is_abnormal_row].reset_index(drop=True)
train_df_abnormal

In [None]:
# Bounding-box geometry. x_min/x_max are used as horizontal coordinates when the
# boxes are drawn later in this notebook, so the x extent is the box width and
# the y extent is the box height (the two were previously swapped; the area is
# unaffected because it is their product).
train_df_abnormal['height'] = train_df_abnormal['y_max'] - train_df_abnormal['y_min']
train_df_abnormal['width'] = train_df_abnormal['x_max'] - train_df_abnormal['x_min']
train_df_abnormal['area'] = train_df_abnormal['width'] * train_df_abnormal['height']
train_df_abnormal

In [None]:
# Mean bounding-box area for every (class_name, class_id) pair.
class_wise_area = (
    train_df_abnormal
    .groupby(['class_name', 'class_id'])['area']
    .agg('mean')
    .reset_index()
)
class_wise_area

In [None]:
# Smallest and largest bounding-box area, printed one per line.
box_areas = train_df_abnormal['area']
print(box_areas.min(), box_areas.max(), sep='\n')

In [None]:
# Bar chart of the mean bounding-box area per class; the tick labels are
# rotated by 90 degrees so the class names do not overlap.
plt.figure(figsize=(15, 5))
sns.set(font_scale=1.5)
area_axes = sns.barplot(data=class_wise_area, x='class_name', y='area')
g = area_axes.set_xticklabels(area_axes.get_xticklabels(), rotation=90)

In [None]:
# Display the filtered annotation frame again; at this point it also carries
# the height, width and area columns computed in the earlier cell.
train_df_abnormal

In [None]:
# Treat boxes whose area is 5000 or less as wrong annotations and discard them,
# then drop the columns that are not used by the remaining cells.
has_plausible_area = train_df_abnormal['area'].gt(5000)
train_df_abnormal_pruned = (
    train_df_abnormal.loc[has_plausible_area]
    .reset_index(drop=True)
    .drop(columns=['class_name', 'rad_id', 'height', 'width', 'area'])
)
train_df_abnormal_pruned

# Preprocessing image 

The X-ray images are preprocessed and then cropped to a bounding box detected around the lungs.

In [None]:
from skimage import morphology, io, color, exposure, img_as_float, transform

# Load the trained lung segmentation model from the
# `../input/lungsegmentation-model` directory. The loaded model is kept in the
# module-level name `UNet`, which crop_image_and_box passes to
# find_cropping_area.

from keras.models import load_model
model_name = '../input/lungsegmentation-model/trained_model.hdf5'
UNet = load_model(model_name) 
# Model source: https://github.com/imlab-uiip/lung-segmentation-2d.git

def preprocess_for_segmentation(img,im_shape):
    """Prepare an X-ray image as input for the lung segmentation model.

    The image is resized to ``im_shape``, histogram-equalized, given a
    trailing channel axis and standardized to zero mean and unit standard
    deviation.

    Args:
        img: Image array to preprocess (find_cropping_area passes a 2-D
            grayscale image).
        im_shape: Target shape handed to ``skimage.transform.resize``.

    Returns:
        The standardized array with one extra trailing axis. If the
        equalized image is constant (standard deviation of zero) the
        zero-centred array is returned without scaling, so no NaN values
        are produced.
    """
    resized = transform.resize(img, im_shape)
    equalized = exposure.equalize_hist(resized)
    X = np.expand_dims(equalized, -1)
    X = X - X.mean()
    spread = X.std()
    # A constant image has zero spread; dividing would fill the array with NaN.
    if spread > 0:
        X = X / spread
    return X

def preprocessing_pixel_values(dicom_image):
    """Convert the pixel data of a DICOM dataset to an 8-bit grayscale image.

    Steps: RGB data is converted to grayscale, the values are min-max
    normalized to [0, 1], MONOCHROME1 data is inverted, and the result is
    scaled by 255 and cast to ``uint8`` (fractions are truncated).

    Args:
        dicom_image: Object exposing ``pixel_array`` and a
            ``'PhotometricInterpretation'`` element with a ``value``
            attribute (e.g. the dataset returned by ``pydicom.dcmread``).

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``. A constant image (max == min)
        yields an all-zero image instead of dividing by zero.
    """
    pixel_data = dicom_image.pixel_array
    pi = dicom_image['PhotometricInterpretation'].value
    if pi == 'RGB':
        pixel_data = cv2.cvtColor(pixel_data, cv2.COLOR_RGB2GRAY)

    # Normalize in float64 so that `max - min` cannot overflow narrow integer
    # pixel types such as int16.
    pixel_data = np.asarray(pixel_data, dtype=np.float64)
    lowest = pixel_data.min()
    value_range = pixel_data.max() - lowest
    if value_range > 0:
        pixel_data = (pixel_data - lowest) / value_range
    else:
        # Constant image: avoid 0/0, which would produce NaN pixels.
        pixel_data = np.zeros_like(pixel_data)

    if pi == 'MONOCHROME1':
        # Invert the intensities relative to the normalized maximum.
        pixel_data = np.max(pixel_data) - pixel_data
    xray_image = np.asarray(pixel_data * 255, np.uint8)
    return xray_image

def removal_of_white_text(img):
    """Return a copy of ``img`` in which pixels brighter than 250 are set to 0.

    Uses ``cv2.THRESH_TOZERO_INV``: values of 250 or less are kept unchanged.
    As the function name suggests, this is meant to blank out white text in
    the image.
    """
    _, text_free = cv2.threshold(img, 250, 255, cv2.THRESH_TOZERO_INV)
    return text_free

def find_cropping_area(xray_image,seg_model):
    """Locate a padded crop window around the lungs of an X-ray image.

    The image is run through the segmentation model at 256x256, the
    prediction is thresholded at 0.95 and cleaned with a morphological
    opening, and the union of the bounding rectangles of all detected
    contours is padded (by 1/6 of its width on the left and right, 1/10 of
    its width above and 1/5 of its width below) and clipped to the mask.

    Args:
        xray_image: Preprocessed X-ray image; only its first two dimensions
            (rows, columns) are used for scaling.
        seg_model: Segmentation model exposing ``predict``.

    Returns:
        A tuple ``(pr, (x1, y1), (x2, y2))`` where ``pr`` is the binary mask
        as an RGB ``uint8`` image with the tight and the padded rectangle
        drawn on it, ``(x1, y1)`` is the top-left and ``(x2, y2)`` the
        bottom-right corner of the crop area in original-image coordinates.
        When no contour is found a warning is issued and the whole image is
        returned as the crop area (no rectangles are drawn).
    """
    import warnings

    im_shape = (256, 256)  # input size used for the segmentation model
    # Scaling factors that map mask coordinates back to the original image.
    height_scale = xray_image.shape[0] / im_shape[0]
    width_scale = xray_image.shape[1] / im_shape[1]

    img = preprocess_for_segmentation(xray_image, im_shape)
    inp_shape = img.shape
    X = np.expand_dims(img, axis=0)
    pred = seg_model.predict(X)[..., 0].reshape(inp_shape[:2])  # predicted segmentation

    # Binarize the prediction and remove small speckles before edge detection.
    ret, pr = cv2.threshold(pred, 0.95, 1, cv2.THRESH_BINARY)
    kernel = np.ones((3, 3), np.uint8)
    pr = np.array(pr * 255, dtype=np.uint8)
    pr = cv2.morphologyEx(pr, cv2.MORPH_OPEN, kernel, iterations=3)
    pr_canny = cv2.Canny(pr, 170, 255)
    # findContours returns (image, contours, hierarchy) in OpenCV 3 and
    # (contours, hierarchy) in OpenCV 4; index -2 is the contour list in both.
    contours = cv2.findContours(pr_canny, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    mask_height, mask_width = pr.shape[:2]
    pr = cv2.cvtColor(pr, cv2.COLOR_GRAY2RGB)

    if len(contours) == 0:
        # Nothing was segmented: fall back to the full image instead of
        # failing on max()/min() of an empty list.
        warnings.warn('No lung contour found; using the whole image as crop area.')
        return pr, (0, 0), (xray_image.shape[1], xray_image.shape[0])

    # Collect the x and y extents of every contour's bounding rectangle.
    x_c = []
    y_c = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        x_c.extend((x, x + w))
        y_c.extend((y, y + h))

    w = max(x_c) - min(x_c)
    # x is clipped against the mask width and y against the mask height.
    crp_p1 = (max(min(x_c) - w // 6, 0), max(min(y_c) - w // 10, 0))
    crp_p2 = (min(max(x_c) + w // 6, mask_width), min(max(y_c) + w // 5, mask_height))

    # The crop area is scaled to the original image size.
    x1 = int(crp_p1[0] * width_scale)
    y1 = int(crp_p1[1] * height_scale)
    x2 = int(crp_p2[0] * width_scale)
    y2 = int(crp_p2[1] * height_scale)

    # Tight rectangle first, then the padded crop rectangle.
    cv2.rectangle(pr, (min(x_c), min(y_c)), (max(x_c), max(y_c)), (255, 0, 0), 1)
    cv2.rectangle(pr, crp_p1, crp_p2, (0, 255, 0), 2)
    return pr, (x1, y1), (x2, y2)

def crop_image_and_box(xray_image,pt1,pt2):
    """Crop an X-ray around the lungs and re-express a box in crop coordinates.

    Args:
        xray_image: Input image (output of ``preprocessing_pixel_values``).
        pt1: ``(x_min, y_min)`` top-left corner of the bounding box.
        pt2: ``(x_max, y_max)`` bottom-right corner of the bounding box.

    Returns:
        ``(pr, xray_image_cropped, b_pt1, b_pt2)``: the segmentation
        visualisation from ``find_cropping_area``, the cropped image, and the
        top-left and bottom-right corners of the bounding box relative to the
        crop origin.
    """
    text_free = removal_of_white_text(xray_image)
    pr, lung_top_left, lung_bottom_right = find_cropping_area(text_free, UNet)

    # Enlarge the lung crop window wherever the bounding box sticks out of it,
    # so the final crop always contains the whole box.
    left = min(lung_top_left[0], pt1[0])
    top = min(lung_top_left[1], pt1[1])
    right = max(lung_bottom_right[0], pt2[0])
    bottom = max(lung_bottom_right[1], pt2[1])

    # When the window is wider than tall, extend its bottom edge by the
    # difference (the slice below simply stops at the image border).
    extra_rows = max((right - left) - (bottom - top), 0)
    xray_image_cropped = text_free[top:bottom + extra_rows, left:right]

    # Shift the bounding box into the coordinate frame of the crop.
    b_pt1 = (pt1[0] - left, pt1[1] - top)
    b_pt2 = (pt2[0] - left, pt2[1] - top)

    return pr, xray_image_cropped, b_pt1, b_pt2

## Plot the cropped region of a randomly selected X-ray

In [None]:

# Pick one annotation at random from the pruned frame and show the original
# image, the segmentation output and the resulting crop side by side.
r = np.random.randint(low=0, high=len(train_df_abnormal_pruned), size=1)
train_folder = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train'

# Read the image id and the box from the SAME row of the pruned frame. The
# pruned frame was re-indexed after filtering, so its positions do not line up
# with train_df_abnormal and mixing the two would pair an image with the box
# of a different annotation.
annotation = train_df_abnormal_pruned.iloc[r[0]]
dicom_image = dicom.dcmread(os.path.join(train_folder, annotation['image_id']) + '.dicom')
box_x = int(annotation['x_min'])
box_y = int(annotation['y_min'])
box_x1 = int(annotation['x_max'])
box_y1 = int(annotation['y_max'])
pt1 = (box_x, box_y)
pt2 = (box_x1, box_y1)

# class_name was dropped from the pruned frame; recover it through class_id.
annotation_class_name = train_df.loc[
    train_df['class_id'] == annotation['class_id'], 'class_name'
].iloc[0]

xray_image = preprocessing_pixel_values(dicom_image)
pr1, xray_image_cropped, pt1_new, pt2_new = crop_image_and_box(xray_image, pt1, pt2)

fig, axes = plt.subplots(1, 3, figsize=(20, 5))

axes[0].set_title('Original Image:  ' + annotation_class_name)
cv2.rectangle(xray_image, pt1, pt2, (0, 255, 0), 10)
axes[0].imshow(xray_image, cmap='gray')

axes[1].set_title('Segmentation output')
axes[1].imshow(pr1, cmap='gray')

axes[2].set_title('Cropped image with bounding box')
cv2.rectangle(xray_image_cropped, pt1_new, pt2_new, (0, 0, 0), 10)
g = axes[2].imshow(xray_image_cropped, cmap='gray')
