In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import argparse
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import scipy.io
import scipy.misc
import PIL
from PIL import ImageFont, ImageDraw, Image
import tensorflow as tf
from tensorflow.python.framework.ops import EagerTensor
import pydicom
from tensorflow.keras.models import load_model
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import pydicom
from tqdm import tqdm
from shutil import copyfile

%matplotlib inline

> **We are participating in this competition on behalf of every family member who has lost loved ones to COVID.** YOLOv5

To develop the complete model, we follow the steps below:

1. Input (DICOM, CSV)
2. Process Raw Input (Standardized Input CSV)
3. Process Raw Image (as required by the YOLO model: image size, epochs, etc. If required, augment the data, since the dataset is small.)
4. Create Model YOLO Input 
5. Configure YOLO
6. Train YOLO Model 

In [None]:
# Competition CSVs: study-level classes and image-level bounding boxes.
train_study_path = '../input/siim-covid19-detection/train_study_level.csv'
train_image_path = '../input/siim-covid19-detection/train_image_level.csv'

# Working directories that follow the YOLO layout: resized images and their
# label files, each split into train and val.
train_resize_images = '/kaggle/working/datasetresize/images/train'
val_resize_images = '/kaggle/working/datasetresize/images/val'
train_resize_labels = '/kaggle/working/datasetresize/labels/train'
val_resize_labels = '/kaggle/working/datasetresize/labels/val'

yolo_dir = '/kaggle/working/yolov5'

# Create every working directory up front; exist_ok keeps the cell re-runnable.
for working_dir in (
    train_resize_images,
    val_resize_images,
    train_resize_labels,
    val_resize_labels,
    yolo_dir,
):
    os.makedirs(working_dir, exist_ok=True)

# YOLO training parameters.
TRAIN_PATH = train_resize_images
IMG_SIZE = 512
BATCH_SIZE = 64
EPOCHS = 150

In [None]:
# Load the study-level labels and shorten the class column names.
train_study_data = pd.read_csv(train_study_path).rename(
    columns={
        'Negative for Pneumonia': 'Negative',
        'Typical Appearance': 'Typical',
        'Indeterminate Appearance': 'Indeterminate',
        'Atypical Appearance': 'Atypical',
    }
)

# Collapse the one-hot class columns into a single 'disease_type' column.
# 'Negative' is the default; later classes in the tuple win if several are set.
train_study_data['disease_type'] = 'Negative'
for positive_class in ('Typical', 'Indeterminate', 'Atypical'):
    train_study_data.loc[train_study_data[positive_class] == 1, 'disease_type'] = positive_class

# Load the image-level labels.
train_image_data = pd.read_csv(train_image_path)

# The study CSV ids carry a '_study' suffix; strip it to obtain the merge key,
# then drop the original id column so it does not clash with the image id.
train_study_data['StudyInstanceUID'] = train_study_data['id'].map(
    lambda study_id: study_id.replace('_study', '')
)
train_study_data = train_study_data.drop(columns='id')
train_image_data = train_image_data.merge(train_study_data, on='StudyInstanceUID')

# Integer-encode the disease type for easier downstream handling.
label_encode = preprocessing.LabelEncoder()
train_image_data['disease_type_id'] = label_encode.fit_transform(train_image_data['disease_type'])

# Keep the original image id (with '_image' suffix) and store the bare id in 'id'.
train_image_data['id_image'] = train_image_data['id']
train_image_data['id'] = train_image_data['id'].map(
    lambda image_id: image_id.replace('_image', '')
)

In [None]:
# Load the metadata of the resized dataset and keep only the training rows.
meta_df = pd.read_csv('../input/siim-covid19-resized-to-512px-jpg/meta.csv')
train_meta_df = meta_df.loc[meta_df.split == 'train'].drop('split', axis=1)
train_meta_df.columns = ['id', 'dim0', 'dim1']

# Attach the original image dimensions and the path of the resized JPEG.
train_image_data = train_image_data.merge(train_meta_df, on='id', how="left")
train_image_data['path'] = train_image_data['id'].map(
    lambda image_id: f'../input/siim-covid19-resized-to-512px-jpg/train/{image_id}.jpg'
)

# The image-level class is the first token of the label string.
train_image_data['image_level'] = train_image_data['label'].map(
    lambda label_text: label_text.split(' ')[0]
)

# Numeric class id: opacity -> 0, none -> 1 (any other value fails the int cast).
train_image_data['image_level_id'] = (
    train_image_data['image_level'].map({'opacity': 0, 'none': 1}).astype(np.int64)
)

# Persist the merged table.
train_image_data.to_csv('train_image_data.csv', index=False)

In [None]:
# Create a stratified 80/20 train/validation split on the image-level label
# so both splits keep the same opacity/none ratio.
train_df, valid_df = train_test_split(
    train_image_data,
    test_size=0.2,
    random_state=42,
    stratify=train_image_data.image_level.values,
)

# assign() returns new frames, which avoids the SettingWithCopyWarning raised
# when writing into the slices returned by train_test_split.
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='valid')

# Single table holding both splits, with a fresh 0..n-1 index.
data_split_df = pd.concat([train_df, valid_df]).reset_index(drop=True)
data_split_df.head(5)

In [None]:
# Report the total number of rows and the size of the train and validation splits.
print(f'Size of Split dataset (Train VS Val): {len(data_split_df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

In [None]:
# Copy every resized image into the working folder of its split, with a progress bar.
# Rows whose split is not 'train' go to the 'val' folder.
split_rows = zip(data_split_df['split'], data_split_df['path'], data_split_df['id'])
for split_name, source_path, image_id in tqdm(split_rows, total=len(data_split_df)):
    subset = 'train' if split_name == 'train' else 'val'
    copyfile(source_path, f'/kaggle/working/datasetresize/images/{subset}/{image_id}.jpg')


In [None]:
# Parse the raw bounding boxes out of the label column of a row.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    """Return the boxes encoded in ``row.label`` as lists of four floats.

    The label is a space-separated string made of groups of six tokens. The
    first two tokens of each group are skipped and the remaining four are
    converted to float. A trailing group with fewer than six tokens is dropped.
    """
    tokens = row.label.split(' ')
    # Keep only the coordinate slots (positions 2..5 of every 6-token group).
    coords = [float(token) for position, token in enumerate(tokens) if position % 6 >= 2]
    # Re-group the flat coordinate list four by four, ignoring an incomplete tail.
    return [coords[start:start + 4] for start in range(0, len(coords) - 3, 4)]

# Scale the bounding boxes according to the size of the resized image.
def scale_bbox(row, bboxes, img_size=None):
    """Rescale boxes from the original image size to the resized square image.

    Args:
        row: object exposing ``dim0`` and ``dim1``, the original image
            dimensions. ``dim1`` divides the x coordinates and ``dim0`` the y
            coordinates (presumably width and height respectively -- confirm
            against meta.csv).
        bboxes: iterable of ``[xmin, ymin, xmax, ymax]`` in original pixels.
        img_size: side length of the resized image. Defaults to the
            module-level ``IMG_SIZE``.

    Returns:
        List of ``[xmin, ymin, xmax, ymax]`` floats rounded to 4 decimals.

    Raises:
        ValueError: if ``dim0`` or ``dim1`` is missing (NaN) or not positive.
    """
    if img_size is None:
        img_size = IMG_SIZE

    # NaN dims (e.g. an id that found no match in the metadata merge) fail this check.
    if not (row.dim0 > 0 and row.dim1 > 0):
        raise ValueError(f'Invalid image dimensions: dim0={row.dim0}, dim1={row.dim1}')

    scale_x = img_size / row.dim1
    scale_y = img_size / row.dim0

    # Keep sub-pixel precision: wrapping the rounded value in int() would
    # truncate toward zero and shift every box edge by up to one pixel.
    return [
        [
            float(np.round(bbox[0] * scale_x, 4)),
            float(np.round(bbox[1] * scale_y, 4)),
            float(np.round(bbox[2] * scale_x, 4)),
            float(np.round(bbox[3] * scale_y, 4)),
        ]  # xmin, ymin, xmax, ymax
        for bbox in bboxes
    ]

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert corner-format boxes to normalised YOLO boxes.

    Args:
        img_w: image width in pixels, used to normalise x values.
        img_h: image height in pixels, used to normalise y values.
        bboxes: iterable of ``[xmin, ymin, xmax, ymax]`` in pixels.

    Returns:
        List of ``[x_center, y_center, width, height]``, each divided by the
        corresponding image dimension.
    """
    yolo_boxes = []
    for xmin, ymin, xmax, ymax in bboxes:
        box_w = xmax - xmin
        box_h = ymax - ymin
        # Use the exact centre: rounding the half-extent to an integer pixel
        # would shift the centre by up to half a pixel before normalising.
        xc = xmin + box_w / 2
        yc = ymin + box_h / 2

        yolo_boxes.append([xc / img_w, yc / img_h, box_w / img_w, box_h / img_h])

    return yolo_boxes

In [None]:
# Write the scaled bounding boxes as YOLO label files (.txt), one file per image,
# into the label folder of the image's split (train or val).
# Images whose image-level label is not 'opacity' get no label file.
for _, row in tqdm(data_split_df.iterrows(), total=len(data_split_df)):
    if row.image_level != 'opacity':
        continue

    # Rows not marked 'train' belong to the validation split.
    labels_dir = train_resize_labels if row.split == 'train' else val_resize_labels
    file_name = os.path.join(labels_dir, f'{row.id}.txt')

    # Parse the raw boxes, rescale them to the resized image, convert to YOLO format.
    bboxes = get_bbox(row)
    scale_bboxes = scale_bbox(row, bboxes)
    yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)

    # One line per box: "<class_id> <x_center> <y_center> <width> <height>".
    with open(file_name, 'w') as f:
        for bbox in yolo_bboxes:
            f.write(' '.join(str(value) for value in [row.image_level_id] + bbox))
            f.write('\n')
        

In [None]:
# Sanity check: display one copied training image (its label file is inspected next).
import matplotlib.image as mpimg

sample_image_path = '/kaggle/working/datasetresize/images/train/000a312787f2.jpg'
image = mpimg.imread(sample_image_path)
plt.imshow(image)

In [None]:
# The label file is headerless and space-separated (one box per line), so the
# separator and column names must be given explicitly; with the defaults the
# first box would be consumed as the header and each line read as one column.
lbdf = pd.read_csv(
    '/kaggle/working/datasetresize/labels/train/000a312787f2.txt',
    sep=' ',
    header=None,
    names=['class_id', 'x_center', 'y_center', 'width', 'height'],
)
print(lbdf)

In [None]:
# Call count() to get the per-column number of non-null entries; printing the
# attribute without calling it only shows the bound-method repr.
print(lbdf.count())

In [None]:
# Fetch YOLOv5 into ./yolov5 (relative to the current directory) and install
# its requirements from inside the repository.
!git clone https://github.com/ultralytics/yolov5  # clone repo
%cd yolov5
# Install dependencies
%pip install -qr requirements.txt  # install dependencies

# Go back to the parent directory so later relative paths resolve from there.
%cd ../
import torch
# Print the torch version and the name of GPU 0, or 'CPU' when CUDA is unavailable.
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

In [None]:
# Install W&B 
!pip install -q --upgrade wandb
# Login 
import wandb
#wandb.login()

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient() 
personal_key_for_api = user_secrets.get_secret("wandbpass")
! wandb login $personal_key_for_api

In [None]:
# Create .yaml file 
import yaml


data_yaml = dict(
    #path = '/kaggle/working/datasetresize',
    train = '../datasetresize/images/train',
    val = '../datasetresize/images/val',
    nc = 1,
    names = ['opacity']
)

# Note that I am creating the file in the yolov5/data/ directory.
with open(f'yolov5/data/datageoai.yaml', 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)
    
%cat yolov5/data/datageoai.yaml

In [None]:
# Enter the yolov5 directory; the training cell invokes train.py by relative path.
# NOTE(review): the path is relative, so re-running this cell without restarting
# would try to enter yolov5/yolov5.
%cd yolov5

In [None]:
IMG_SIZE = 512
BATCH_SIZE = 64
EPOCHS = 150

!python train.py --img {IMG_SIZE} \
                     --batch {BATCH_SIZE} \
                     --epochs {EPOCHS} \
                     --data datageoai.yaml \
                     --weights yolov5s.pt \
                     --save_period 10\
                     --project yolov5-covid19-geoai-output\
                     --name yolov5s-e-150-img-512-btc-64-output