In [1]:
import pandas as pd
import glob
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
from sklearn import model_selection
from tqdm import tqdm
import shutil
import os

In [2]:
# loading train images & csv file
path_train_images = '/kaggle/input/cxr-512-grayimages/transformed_data/train'

train_df = pd.read_csv('/kaggle/input/cxr-512-grayimages/transformed_data/train/transformed_train.csv')
train_df.head()

# RENAME/SWAP ORIGINAL WIDTH & HEIGHT COLS

Unnamed: 0,image_id,x_min,y_min,x_max,y_max,class_id,class_name,rad_id,original_width,original_height,transformed_width,transformed_height
0,50a418190bc3fb1ef1633bf9678929b3,0.0,0.0,0.219554,0.21938,14,No finding,R11,2580,2332,512,566
1,50a418190bc3fb1ef1633bf9678929b3,0.0,0.0,0.219554,0.21938,14,No finding,R15,2580,2332,512,566
2,50a418190bc3fb1ef1633bf9678929b3,0.0,0.0,0.219554,0.21938,14,No finding,R16,2580,2332,512,566
3,21a10246a5ec7af151081d0cd6d65dc9,0.0,0.0,0.173324,0.173473,14,No finding,R7,3159,2954,512,548
4,21a10246a5ec7af151081d0cd6d65dc9,0.0,0.0,0.173324,0.173473,14,No finding,R13,3159,2954,512,548


In [3]:
# Checking unique image ids in csv & image folder
unique_img_ids = train_df['image_id'].unique()
print(f"Total number of unique image ids in csv: {len(unique_img_ids)}")


jpegCounter = len(glob.glob1(path_train_images,"*.jpeg"))
print(f"Total number of jpeg image in dataset folder: {jpegCounter}")

Total number of unique image ids in csv: 15000
Total number of jpeg image in dataset folder: 15000


In [4]:
# select only those rows which have bounding boxes
finding_df = train_df[train_df['class_name'] != 'No finding']
finding_df.head()

Unnamed: 0,image_id,x_min,y_min,x_max,y_max,class_id,class_name,rad_id,original_width,original_height,transformed_width,transformed_height
6,9a5094b2563a1ef3ff50dc5c7ff71345,170.092308,338.452483,406.892308,450.695634,3,Cardiomegaly,R10,2336,2080,512,575
7,9a5094b2563a1ef3ff50dc5c7ff71345,440.369231,425.588613,461.538462,490.325342,10,Pleural effusion,R9,2336,2080,512,575
8,9a5094b2563a1ef3ff50dc5c7ff71345,440.369231,425.588613,461.538462,490.325342,11,Pleural thickening,R9,2336,2080,512,575
9,9a5094b2563a1ef3ff50dc5c7ff71345,170.338462,338.452483,407.876923,442.818921,3,Cardiomegaly,R9,2336,2080,512,575
10,9a5094b2563a1ef3ff50dc5c7ff71345,169.6,323.191353,410.092308,433.95762,3,Cardiomegaly,R8,2336,2080,512,575


In [5]:
finding_df_img_ids = finding_df['image_id'].unique()
len(finding_df_img_ids)

4394

In [6]:
# checking size of 1st image in df
image = Image.open(f"{path_train_images}/9a5094b2563a1ef3ff50dc5c7ff71345.jpeg")

width, height = image.size

print(f"Width = {width} and hieght = {height} of image 9a5094b2563a1ef3ff50dc5c7ff71345.jpeg")

Width = 512 and hieght = 575 of image 9a5094b2563a1ef3ff50dc5c7ff71345.jpeg


In [7]:
# # creating a toy df containing 100 BBs
# small_df = finding_df.head(100)

# print(small_df['image_id'].unique())     # unique images = 11
# small_df.head()

In [8]:
small_df = finding_df
small_df.head()
len(small_df)

36096

# Plotting before & after

In [9]:
label2color = { 0:("Aortic enlargement","#2a52be"),
                1:("Atelectasis","#ffa812"),
                2:("Calcification","#ff8243"),
                3:("Cardiomegaly","#4682b4"),
                4:("Consolidation","#ddadaf"),
                5:("ILD","#a3c1ad"),
                6:("Infiltration","#008000"),
                7:("Lung Opacity","#004953"),
                8:("Nodule/Mass","#e3a857"),
                9:("Other lesion","#dda0dd"),
               10:("Pleural effusion","#e6e8fa"),
               11:("Pleural thickening","#800020"),
               12:("Pneumothorax","#918151"),
               13:("Pulmonary fibrosis","#e75480"),
               14:("No finding", "#ffffff")
              }

In [10]:
# code to plot image with bounding boxes for fusion comparison

def bounding_box_plotter_before_after(img_as_arr, img_id, bounding_boxes_before, bounding_boxes_after):
        
    fig, (ax1, ax2) = plt.subplots(1, 2)
    
    for bb, img, ax, title in zip([bounding_boxes_before, bounding_boxes_after], 
                                  [img_as_arr, img_as_arr], [ax1, ax2], ["Before", "After"]):
        
        # add the bounding boxes
        for row in bb:
            # each row contains 'x_min', 'y_min', 'x_max', 'y_max', "class_id"
            xmin = row[0]
            xmax = row[2]
            ymin = row[1]
            ymax = row[3]

            width = xmax - xmin
            height = ymax - ymin

            # assign different color to different classes of objects
            edgecolor = label2color[row[4]][1]
            ax.annotate(label2color[row[4]][0], xy=(xmax - 40, ymin + 20))
            
            # add radiologist if Before
            label_bb = str(label2color[row[4]][0])+"::"+str(row[5]) if title=="Before" else str(label2color[row[4]][0])
            
            # add bounding boxes to the image
            rect = patches.Rectangle((xmin, ymin), width, height, edgecolor=edgecolor, facecolor='none', label=label_bb)
            
            ax.add_patch(rect)
            ax.legend()
        
        # plot the image
        ax.imshow(img_as_arr, cmap="gray")
        ax.set_title(title+"::"+img_id)
    
    fig.set_size_inches(33,24)
    plt.show()

In [11]:
def get_bb_info(df, img_id, columns):
    bounding_boxes_info = df.loc[df["image_id"]==img_id, columns]

    bboxes = []
    for _, row in bounding_boxes_info.iterrows():
        bboxes.append(list(row))
    
    return bboxes

In [12]:
# for img_id in small_df['image_id'].unique():    

#     bounding_boxes_info_before = get_bb_info(small_df, img_id, ['x_min', 'y_min', 'x_max', 'y_max', "class_id", "rad_id"])
#     bounding_boxes_info_after = get_bb_info(small_df, img_id, ['x_min', 'y_min', 'x_max', 'y_max', "class_id"])

#     # read image as array
#     im = Image.open(path_train_images+f"/{img_id}.jpeg")
#     bounding_box_plotter_before_after(im, img_id, bounding_boxes_info_before, bounding_boxes_info_after)

# Cloning Yolov5 repo

In [13]:
!pwd

/kaggle/working


In [14]:
!git clone https://github.com/ultralytics/yolov5.git

Cloning into 'yolov5'...
remote: Enumerating objects: 5437, done.[K
remote: Total 5437 (delta 0), reused 0 (delta 0), pack-reused 5437[K
Receiving objects: 100% (5437/5437), 8.05 MiB | 17.22 MiB/s, done.
Resolving deltas: 100% (3718/3718), done.


# Creating dir structure for YOLOv5

In [15]:
# adding BB width & height columns to small_df
small_df['bb_width'] = small_df['x_max'] - small_df['x_min']
small_df['bb_height'] = small_df['y_max'] - small_df['y_min']


# adding a column that contains BB info in list format
small_df['bbox_list']= small_df[['class_id', 'x_min', 'y_min', 'bb_width', 'bb_height']].values.tolist()


# creating a list of lists for all BB per image
small_df_2cols = small_df.groupby('image_id')['bbox_list'].apply(list).reset_index()


# giving col names to newly created df
small_df_2cols.columns =['image_id', 'bboxes']


# adding 2 new cols to df
small_df_2cols["image_width"] = np.nan
small_df_2cols["image_height"] = np.nan

# adding values to image_width & image_height cols from original df
for image_id in small_df_2cols['image_id']:
    small_df_2cols.loc[small_df_2cols.image_id==image_id,["image_width", "image_height"]] \
        =finding_df[finding_df.image_id==image_id].iloc[0]['transformed_width'], finding_df[finding_df.image_id==image_id].iloc[0]['transformed_height']


# splitting data into train & val
df_train, df_valid = model_selection.train_test_split(
    small_df_2cols,
    test_size = 0.1,        # 10% for validation
    random_state = 42,
    shuffle = True
)


# resetting index values in df
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [16]:
small_df_2cols.head()

Unnamed: 0,image_id,bboxes,image_width,image_height
0,0005e8e3701dfb1dd93d53e2ff537b6e,"[[7.0, 150.0, 97.83333333333331, 50.8333333333...",512.0,512.0
1,0007d316f756b3fa0baea2ff514ce945,"[[13.0, 309.3333333333333, 156.0, 28.0, 19.555...",512.0,640.0
2,000d68e42b71d3eac10ccc077aba07c1,"[[9.0, 9.555555555555557, 0.0, 186.44444444444...",512.0,640.0
3,00150343289f317a0ad5629d5b7d9ef9,"[[11.0, 51.80594059405941, 257.4290271132376, ...",515.0,512.0
4,001d127bad87592efe45a5c7678f8b8d,"[[13.0, 340.3333333333333, 105.16666666666669,...",512.0,512.0


In [17]:
!mkdir CXR_data

!mkdir CXR_data/images
!mkdir CXR_data/images/train
!mkdir CXR_data/images/validation

!mkdir CXR_data/labels
!mkdir CXR_data/labels/train
!mkdir CXR_data/labels/validation


In [18]:
# Creating dir structure that is required for YOLO model
OUTPUT_PATH = '/kaggle/working/CXR_data'

def process_data(data, data_type = 'train'):
    for _, row in tqdm(data.iterrows(), total=len(data)):
        image_name = row['image_id']
        bounding_boxes = row['bboxes']
        image_width = row['image_width']
        image_height = row['image_height']

        yolo_data = []
        for bbox in bounding_boxes:
            class_id = bbox[0]
            x = bbox[1]
            y = bbox[2]
            w = bbox[3]
            h = bbox[4]

            x_center = x+w/2
            y_center = y+h/2

            # yolo required all coordinates to lie between 0-1
            x_center /= image_width
            y_center /= image_height
            w /= image_width        # dividing bb width by image width
            h /= image_height

            # add BB info to list
            yolo_data.append([class_id, x_center, y_center, w, h])

        # converting list to np array
        yolo_data = np.array(yolo_data)
        np.savetxt(
            f"{OUTPUT_PATH}/labels/{data_type}/{image_name}.txt",
            yolo_data,
            fmt=["%d", "%f", "%f", "%f", "%f"]      # formatting for class_id, x_center, y_center, width & height of BB
                )

        # copying image files from source to dest
        shutil.copyfile(
            f"{path_train_images}/{image_name}.jpeg",
            f"{OUTPUT_PATH}/images/{data_type}/{image_name}.jpeg"
        )



# ------------------------------------------
process_data(df_train, data_type = 'train')
process_data(df_valid, data_type = 'validation')

# ------------------------------------------
# Once cxr.yaml file is created, we proceed to training the model


100%|██████████| 3954/3954 [00:37<00:00, 104.52it/s]
100%|██████████| 440/440 [00:04<00:00, 96.81it/s] 


# Requirements for yolov5

In [19]:
!pip install -r /kaggle/working/yolov5/requirements.txt

Collecting thop
  Downloading thop-0.0.31.post2005241907-py3-none-any.whl (8.7 kB)
Collecting pycocotools>=2.0
  Downloading pycocotools-2.0.2.tar.gz (23 kB)
Building wheels for collected packages: pycocotools
  Building wheel for pycocotools (setup.py) ... [?25l- \ | / - \ | / done
[?25h  Created wheel for pycocotools: filename=pycocotools-2.0.2-cp37-cp37m-linux_x86_64.whl size=272653 sha256=0e85d707f2f13c7f96faac646c05a4f452157a15c3c62821499882ba83800e2e
  Stored in directory: /root/.cache/pip/wheels/bc/cf/1b/e95c99c5f9d1648be3f500ca55e7ce55f24818b0f48336adaf
Successfully built pycocotools
Installing collected packages: thop, pycocotools
Successfully installed pycocotools-2.0.2 thop-0.0.31.post2005241907


In [20]:
!pwd
os.chdir('/kaggle/working/yolov5')
!pwd

/kaggle/working
/kaggle/working/yolov5


In [21]:
# copying yaml file to yolov5 folder
shutil.copyfile('/kaggle/input/yaml-file/cxr_kaggle.yaml', '/kaggle/working/yolov5/cxr_kaggle.yaml')

'/kaggle/working/yolov5/cxr_kaggle.yaml'

###  


In [22]:
# weights & = dryrun saves the metrics locally so no syncing with cloud required
# no pretrained model used so randomly initialized weights used
!WANDB_MODE="dryrun" python train.py --batch 2 --epochs 10 --data cxr_kaggle.yaml --cfg models/yolov5s.yaml --name cxr_model --weights ''

[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
2021-03-16 19:18:34.578292: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-03-16 19:18:45.494549: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-03-16 19:18:45.497058: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
[34m[1mwandb[0m: Offline run mode, not syncing to the cloud.
[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` to enable cloud syncing.
[34m[1mtrain: [0mScanning '/kaggle/working/CXR_data/labels/train' images and labels... 3954 found, 0 missing, 0 empty, 0 corrupted: 100%|██████████| 3954/3954 [00:01<00:00, 2856.00it/s]
[34m[1mval: [0mScanning '/kaggle/working/CXR_data/labels/validation' images and labels... 440 found, 0 mis

In [23]:
# !python detect.py --weights best.pt --source /kaggle/working/CXR_data/images/validation