*references*
- [https://www.kaggle.com/ayuraj/train-covid-19-detection-using-yolov5](https://www.kaggle.com/ayuraj/train-covid-19-detection-using-yolov5)

- input 이미지의 bbox 영역을 사용해서 chest의 opacity 영역을 찾아내는 것.

## Import and setup
- YOLOv5는 정해진 directory structure를 필요로 함.

*references*
- [https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data](https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data)

```
/parent_folder
    /dataset
         /images
         /labels
    /yolov5
```

In [None]:
%cd ../
!mkdir tmp1
%cd tmp1

In [None]:
# Download YOLOv5: clone the repository into the current folder and enter it.
!git clone https://github.com/ultralytics/yolov5
%cd yolov5

# Install the dependencies listed in the repository's requirements.txt (quiet mode).
%pip install -qr requirements.txt

# Go back to the parent folder, then report the torch version and the device:
# the name of CUDA device 0 when CUDA is available, otherwise the text 'CPU'.
%cd ../
import torch
print(f"setup complete. Using torch {torch.__version__}({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

In [None]:
# Install (or upgrade) the Weights & Biases client.
!pip install -q --upgrade wandb

# Log in to W&B with a key read through Kaggle's secrets client.
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# "mysecret" is the label the key is looked up under; presumably it must match
# the secret configured for this notebook - verify before running.
secret_value_0 = user_secrets.get_secret("mysecret")

wandb.login(key=secret_value_0)

In [None]:
import os
import gc
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from shutil import copyfile # src인 파일의 내용을(메타 데이터 없이) 이름이 dst인 파일에 복사하고 dst를 반환함
# src, dst는 경로류 객체나 문자열로 지정된 경로 이름
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# iPython writefile Customize
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    """Render a cell as a template and save it to a file.

    Args:
        line: text on the magic line; used as the path of the output file.
        cell: cell body; ``{name}`` placeholders are filled with ``str.format``
            from the module-level globals.
    """
    # The file is opened (and truncated) before the template is rendered.
    with open(line, 'w') as target:
        rendered = cell.format(**globals())
        target.write(rendered)

In [None]:
# Hyper-parameters and paths shared by the cells below.
# Relative folder holding the resized training JPEGs; used to build df['path'].
TRAIN_PATH = 'input/siim-covid19-resized-to-256px-jpg/train/'
# Image side length: used when rescaling boxes and passed to YOLOv5 as --img.
IMG_SIZE = 256
# Passed to train.py as --batch.
BATCH_SIZE = 16
# Passed to train.py as --epochs.
EPOCHS = 10

## Prepare Dataset
- train-valid split 
- /dataset folder 생성
- 모델 훈련을 위한 data.yaml 파일 생성
- yolo format에 요구되는 bbox 생성

In [None]:
#everything is done from /kaggle directory
%cd ../

#load image level csv file
df = pd.read_csv('input/siim-covid19-detection/train_image_level.csv')

#modify values in the id column
df['id'] = df.apply(lambda row: row.id.split('_')[0], axis=1)
# add absolute path
df['path']= df.apply(lambda row: TRAIN_PATH+row.id+'.jpg', axis=1)
# get image level labels
df['image_level'] = df.apply(lambda row: row.label.split(' ')[0], axis=1)

df.head()

In [None]:
# Load meta.csv from the resized-image dataset.
# The original image dimensions are required to scale the bbox coordinates appropriately.
meta_df = pd.read_csv('input/siim-covid19-resized-to-256px-jpg/meta.csv')
meta_df.head()

In [None]:
# The split column holds both test and train rows: keep the train rows only,
# after which the column itself is no longer needed.
train_meta_df = meta_df[meta_df['split'] == 'train'].drop(columns='split')
# Rename the three remaining columns so the first one can serve as the merge key 'id'.
train_meta_df.columns = ['id', 'dim0', 'dim1']

train_meta_df.head()

In [None]:
# Attach dim0/dim1 to the annotation table. A left join on 'id' keeps every
# row of df, whether or not a matching meta row exists.
df = pd.merge(df, train_meta_df, how='left', on='id')
df.head()

# Train-validation split

In [None]:
# Create the train / validation split (80% / 20%).
# Stratifying on image_level keeps the class ratio the same in both parts, so
# neither split ends up dominated by one class.
train_df, valid_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df.image_level.values
)

# Work on explicit copies: assigning a new column to the frames returned by
# train_test_split can otherwise raise pandas' SettingWithCopyWarning.
train_df = train_df.copy()
valid_df = valid_df.copy()
train_df['split'] = 'train'
valid_df['split'] = 'valid'

df = pd.concat([train_df, valid_df])
df.head()

In [None]:
# Recombine both splits and renumber the index from 0 (the old index is dropped).
df = pd.concat([train_df, valid_df]).reset_index(drop=True)
df.head()

In [None]:
# Report the row counts of the combined table and of each split.
print(f'Size of dataset: {len(df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

## Prepare Required Folder Structure

```
/parent_folder
    /dataset
         /images
             /train
             /valid
         /labels
             /train
             /valid
    /yolov5
```

In [None]:
os.makedirs('tmp1/covid19/images/train', exist_ok=True)
os.makedirs('tmp1/covid19/images/valid', exist_ok=True)

os.makedirs('tmp1/covid19/labels/train', exist_ok=True)
os.makedirs('tmp1/covid19/labels/valid', exist_ok=True)

! ls tmp1/covid19/images

In [None]:
# Copy every image into the train/ or valid/ image folder matching its split.
# itertuples avoids one df.loc lookup per row; tqdm shows the progress and is
# given total= because the tuple iterator has no length of its own.
for row in tqdm(df.itertuples(index=False), total=len(df)):
    subset = 'train' if row.split == 'train' else 'valid'
    copyfile(row.path, f'tmp1/covid19/images/{subset}/{row.id}.jpg')
    

## Create .YAML file
- 중요한 점
- 각 이미지의 image-level은 opacity or none이다. 따라서 number of classes는 2로 한다. => nc =2
- bbox 좌표(label 파일)가 없는 이미지는 YOLOv5가 자동으로 background 이미지로 처리함
- NOTE: data.yaml은 yolov5/data directory에 생성됨

In [None]:
# create .yaml file
import yaml

data_yaml= dict(
    # 'optional'download command/URL
    train='../covid19/images/train', # training image경로
    val= '../covid19/images/valid',# validation image경로
    nc=2, # image_level
    names=['none','opacity']
)

# yolov5/data/directory에 만듬
with open('tmp1/yolov5/data/data.yaml','w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)

# cat이란 linux 명령어로 두개이상의 파일을 연결해서 출력할 때 사용 혹은 연결할 파일을 설정하지 않은 경우에는 쉽게 말해서 "이 파일에 무슨 내용이 있는지 보여줘"라는 뜻이다. 
%cat tmp1/yolov5/data/data.yaml

## Prepare bbox coordinates for YOLOv5
- 1개의 object 당 한개의 row
- 각 row는 class (x_center, y_center, width, height) format이다.
- box 좌표는 반드시 normalize된 xywh format이어야 한다(0-1). x_center와 width는 image width로, y_center와 height는 image height로 나누어 normalize한다.
- class number는 0부터 시작한다.

In [None]:
# Display the combined table before the label files are generated.
df

In [None]:
# Parse the raw bounding boxes out of a row's label string.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
#
# Label format, e.g.
#   opacity 1 139.94981 1345.84148 1541.7312000000002 2694.97368 opacity 1 2098.93353 1089.17739 3270.37518 2699.36111
def get_bbox(row):
    """Return the boxes encoded in ``row.label``.

    The label is split on single spaces and read in groups of six tokens. The
    first two tokens of each group are ignored and the remaining four are
    converted to float.

    Args:
        row: object with a ``label`` string attribute.

    Returns:
        List of 4-element float lists, one per complete group. A trailing
        incomplete group is dropped.
    """
    tokens = row.label.split(' ')
    boxes = []
    for start in range(0, len(tokens), 6):
        coords = [float(tok) for tok in tokens[start + 2:start + 6]]
        # Only a full group contributes a box.
        if len(coords) == 4:
            boxes.append(coords)
    return boxes

# Rescale raw boxes to the size of the resized image.
def scale_bbox(row, bboxes):
    """Scale boxes into the IMG_SIZE x IMG_SIZE resized image.

    Args:
        row: object with ``dim0`` and ``dim1`` attributes. ``dim1`` divides
            IMG_SIZE for the x coordinates and ``dim0`` for the y coordinates.
        bboxes: iterable of [xmin, ymin, xmax, ymax] in original-image units.

    Returns:
        List of [xmin, ymin, xmax, ymax] integer lists. Each value is rounded
        to 4 decimals and then truncated by ``int``.
    """
    # Ratio between the resized side length and the original extent per axis.
    ratio_x = IMG_SIZE / row.dim1
    ratio_y = IMG_SIZE / row.dim0

    def _to_pixel(value, ratio):
        return int(np.round(value * ratio, 4))

    return [
        [
            _to_pixel(box[0], ratio_x),
            _to_pixel(box[1], ratio_y),
            _to_pixel(box[2], ratio_x),
            _to_pixel(box[3], ratio_y),
        ]
        for box in bboxes
    ]

# Convert corner-format boxes to the YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert [xmin, ymin, xmax, ymax] boxes to normalised YOLO boxes.

    Args:
        img_w: image width used to normalise x_center and width.
        img_h: image height used to normalise y_center and height.
        bboxes: iterable of [xmin, ymin, xmax, ymax].

    Returns:
        List of [x_center, y_center, width, height], each divided by the
        matching image dimension.
    """
    converted = []
    for xmin, ymin, xmax, ymax in (box[:4] for box in bboxes):
        box_w = xmax - xmin
        box_h = ymax - ymin
        # The centre is the min corner plus half the extent, rounded to an integer.
        center_x = xmin + int(np.round(box_w / 2))
        center_y = ymin + int(np.round(box_h / 2))
        converted.append(
            [center_x / img_w, center_y / img_h, box_w / img_w, box_h / img_h]
        )
    return converted
         

In [None]:
# Write one YOLO label (.txt) file for every image labelled 'opacity'.
# itertuples replaces the per-row df.loc lookup; tqdm is given total= because
# the tuple iterator has no length of its own.
for row in tqdm(df.itertuples(index=False), total=len(df)):
    # Images with any other image-level label get no label file.
    if row.image_level != 'opacity':
        continue

    subset = 'train' if row.split == 'train' else 'valid'
    file_name = f'tmp1/covid19/labels/{subset}/{row.id}.txt'

    # Raw boxes -> boxes in the resized image -> normalised YOLO boxes.
    bboxes = get_bbox(row)
    scale_bboxes = scale_bbox(row, bboxes)
    yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)

    with open(file_name, 'w') as f:
        # One line per box: class id 1 followed by the four normalised values.
        f.writelines(
            ' '.join(map(str, [1, *bbox])) + '\n' for bbox in yolo_bboxes
        )

## train with wandb

In [None]:
# Enter the YOLOv5 checkout; train.py and detect.py are run from here.
%cd tmp1/yolov5/

```
--img {IMG_SIZE} \ # Input image size.
--batch {BATCH_SIZE} \ # Batch size
--epochs {EPOCHS} \ # Number of epochs
--data data.yaml \ # Configuration file
--weights yolov5s.pt \ # Model name
--save_period 1\ # Save model after interval
--project kaggle-siim-covid # W&B project name
```

In [None]:
# Train YOLOv5 starting from the yolov5s.pt weights on the dataset described in
# data.yaml. The {NAME} placeholders are replaced by the Python variables of the
# same name before the command is run.
!python train.py --img {IMG_SIZE} \
                 --batch {BATCH_SIZE} \
                 --epochs {EPOCHS} \
                 --data data.yaml \
                 --weights yolov5s.pt \
                 --save_period 1\
                 --project kaggle-siim-covid

In [None]:
# Folder with the resized test images, passed to detect.py as --source.
TEST_PATH = '/kaggle/input/siim-covid19-resized-to-256px-jpg/test/' # absolute path

In [None]:
# Weights file passed to detect.py as --weights. The path is relative to the
# current folder; presumably it is the best checkpoint written by the training
# run above - confirm the 'exp' run folder name.
MODEL_PATH = 'kaggle-siim-covid/exp/weights/best.pt'

```
--weights {MODEL_PATH} \ # path to the best model.
--source {TEST_PATH} \ # absolute path to the test images.
--img {IMG_SIZE} \ # Size of image
--conf 0.281 \ # Confidence threshold (default is 0.25)
--iou-thres 0.5 \ # IOU threshold (default is 0.45)
--max-det 3 \ # Number of detections per image (default is 1000) 
--save-txt \ # Save predicted bounding box coordinates as txt files
--save-conf # Save the confidence of prediction for each bounding box
```

In [None]:
# Run inference on the test images with the trained weights. --save-txt and
# --save-conf make detect.py write the predicted boxes and their confidences
# to text files.
!python detect.py --weights {MODEL_PATH} \
                  --source {TEST_PATH} \
                  --img {IMG_SIZE} \
                  --conf 0.281 \
                  --iou-thres 0.5 \
                  --max-det 3 \
                  --save-txt \
                  --save-conf

In [None]:
# Folder holding the prediction .txt files; list its content as a check.
# NOTE(review): assumes the detect run wrote to runs/detect/exp - confirm.
PRED_PATH = 'runs/detect/exp/labels'
!ls {PRED_PATH}

In [None]:
# Print the predicted coordinates stored for one test image.
%cat runs/detect/exp/labels/87c51db67bf7.txt

> 첫 번째 값 1은 class id(opacity), 이어지는 네 값은 (x_center, y_center, width, height), 마지막 float은 confidence

In [None]:
# File names found in the prediction folder. The submission loop treats an image
# as 'opacity' when a file named after its id is present in this list.
prediction_files=os.listdir(PRED_PATH)
print('Number of test images predicted as opaque: ', len(prediction_files))

In [None]:
# Submission boxes are xmin, ymin, xmax, ymax;
# YOLOv5 returns x_center, y_center, width, height.
def correct_bbox_format(bboxes, img_size=None):
    """Convert normalised YOLO boxes to integer corner boxes.

    Args:
        bboxes: iterable of [x_center, y_center, width, height], each value a
            fraction of the image side.
        img_size: side length used to de-normalise the values. Defaults to the
            module-level ``IMG_SIZE``.

    Returns:
        List of [xmin, ymin, xmax, ymax] integer lists, one per input box.
    """
    if img_size is None:
        img_size = IMG_SIZE

    # NOTE(review): the result is expressed in the img_size x img_size frame.
    # If the submission expects original-resolution pixels the boxes still
    # need to be scaled back - confirm.
    correct_bboxes = []
    for b in bboxes:
        xc, yc = int(np.round(b[0] * img_size)), int(np.round(b[1] * img_size))
        w, h = int(np.round(b[2] * img_size)), int(np.round(b[3] * img_size))

        half_w = int(np.round(w / 2))
        half_h = int(np.round(h / 2))

        # Corner order matches the submission format. The previous version
        # emitted xmin, xmax, ymin, ymax, which swapped ymin and xmax.
        correct_bboxes.append([xc - half_w, yc - half_h, xc + half_w, yc + half_h])

    return correct_bboxes

# Read a prediction .txt file produced by YOLOv5 during inference and extract
# the confidences and the bounding box coordinates.
def get_conf_bboxes(file_path):
    """Parse one prediction file.

    Each line is split on single spaces and every token is converted to float.
    The last value is the confidence, the first value is skipped, and the
    values in between form the box.

    Args:
        file_path: path of the text file to read.

    Returns:
        Tuple ``(confidence, bboxes)`` of two lists with one entry per line.
    """
    with open(file_path, 'r') as file:
        rows = [
            [float(token) for token in line.strip('\n').split(' ')]
            for line in file
        ]
    confidence = [values[-1] for values in rows]
    bboxes = [values[1:-1] for values in rows]
    return confidence, bboxes

In [None]:
# Read the sample submission file; its rows define which ids need a prediction.
sub_df = pd.read_csv('/kaggle/input/siim-covid19-detection/sample_submission.csv')
sub_df.tail()

In [None]:
# Build one prediction string per row of the sample submission.
predictions = []
# A set gives constant-time membership tests; the list from os.listdir would
# be scanned once per submission row.
predicted_files = set(prediction_files)

for sample_id in tqdm(sub_df['id']):
    parts = sample_id.split('_')
    id_name, id_level = parts[0], parts[-1]

    if id_level == 'study':
        # Study-level classification is not implemented: emit a dummy prediction.
        # Lower case, matching the label spelling of the submission format.
        predictions.append("negative 1 0 0 1 1")

    elif id_level == 'image':
        # A separate image-level classifier could be used here. This example
        # relies on the detector instead: inference has already been run, so
        # an existing prediction file marks the image as opacity.
        # NOTE(review): the class id stored in the file is not checked - confirm
        # the detector never writes boxes of the 'none' class.
        if f'{id_name}.txt' in predicted_files:
            confidence, bboxes = get_conf_bboxes(f'{PRED_PATH}/{id_name}.txt')
            bboxes = correct_bbox_format(bboxes)
            predictions.append(' '.join(
                f'opacity {conf} ' + ' '.join(map(str, box))
                for conf, box in zip(confidence, bboxes)
            ))
        else:
            # Lower case, consistent with the 'none' image-level label used
            # in the training data and in data.yaml.
            predictions.append("none 1 0 0 1 1")

In [None]:
# Switch to /kaggle/working so files written with relative paths land there.
os.chdir("/kaggle/working")

In [None]:
# Store the prediction strings in the submission table and write it to
# submission1.csv in the current folder, without the index column.
sub_df['PredictionString'] = predictions
sub_df.to_csv('submission1.csv', index=False)
sub_df.tail()