In [None]:
%cd ../
!mkdir tmp
%cd tmp

In [None]:
# Download YOLOv5
!git clone https://github.com/ultralytics/yolov5 # clone repo
%cd yolov5
# Install dependencies; requirements.txt describes the packages this environment needs.
%pip install -qr requirements.txt  # install dependencies

%cd ../
# Import PyTorch and report its version plus the device in use (GPU name, or CPU).
import torch
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

In [None]:
# Disabled cell (wrapped in a string literal, so it does not run): optional W&B setup.
# SECURITY: an API key was hard-coded on the login line. It has been removed here;
# treat the old key as compromised and rotate it.
'''
# Install W&B
!pip install -q --upgrade wandb
# Log in to W&B. Do not paste the API key into the notebook: provide it through
# the WANDB_API_KEY environment variable or type it at the interactive prompt.
import wandb
!wandb login
'''

In [None]:
# Necessary/extra dependencies. 
import os
import gc
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from shutil import copyfile
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Customize IPython's writefile magic so that variables can be substituted into the text that is written
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell=None):
    """Write a cell body to a file after filling its ``{name}`` placeholders.

    Args:
        line: Text that follows the magic name; passed unchanged to ``open``
            as the output path.
        cell: Cell body to render. ``str.format`` is applied with this
            module's globals, so literal braces in the body must be doubled.

    Raises:
        ValueError: If called without a cell body (line-magic form).
    """
    # A line/cell magic is invoked with only `line` when used as `%writetemplate`;
    # the default argument turns that case into a clear error instead of a TypeError.
    if cell is None:
        raise ValueError('writetemplate needs a cell body: use %%writetemplate <path>')
    with open(line, 'w') as f:
        f.write(cell.format(**globals()))

In [None]:
# Hyperparameters
# Directory prefix used to build each training image path (relative path).
TRAIN_PATH = 'input/siim-covid19-resized-to-256px-jpg/train/'
# Image size
IMG_SIZE = 256
# Number of samples per batch
BATCH_SIZE = 16
# Number of epochs (one full pass over the training data counts as 1)
EPOCHS = 10

In [None]:
# Everything is done from /kaggle directory.
%cd ../

# Load image level csv file載入檔案
df = pd.read_csv('input/siim-covid19-detection/train_image_level.csv')

# Modify values in the id column分割ID列的ID與image
df['id'] = df.apply(lambda row: row.id.split('_')[0], axis=1)#axis是column
# Add absolute path新增一列path
df['path'] = df.apply(lambda row: TRAIN_PATH+row.id+'.jpg', axis=1)
# Get image level labels新增一列image_level
df['image_level'] = df.apply(lambda row: row.label.split(' ')[0], axis=1)#[0]取分割的第一個
#顯示前五個資料
df.head(5)

In [None]:
# Load the meta.csv file.
# Original dimensions are required to scale the bounding box coordinates appropriately.
meta_df = pd.read_csv('input/siim-covid19-resized-to-256px-jpg/meta.csv')
# Keep the rows whose split is 'train', then drop the split column.
train_rows = meta_df['split'] == 'train'
train_meta_df = meta_df[train_rows].drop(columns='split')
# Rename the remaining columns positionally (the frame must have exactly three).
train_meta_df.columns = ['id', 'dim0', 'dim1']

train_meta_df.head(2)

In [None]:
# Merge both dataframes into one: a left join on id keeps every row of df
# and attaches the matching columns of train_meta_df.
df = pd.merge(df, train_meta_df, how="left", on='id')
df.head(2)

In [None]:
# Create the train/validation split (20% validation), stratified on the image-level label.
train_df, valid_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df.image_level.values
)

# Tag each subset with its split name. assign() returns new frames, which avoids the
# SettingWithCopyWarning that `.loc[:, 'split'] = ...` can raise on these subsets.
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='valid')

# Recombine both subsets into one frame with a fresh 0..n-1 index.
df = pd.concat([train_df, valid_df]).reset_index(drop=True)
df.head()

In [None]:
# Show the total number of rows and the number of training and validation rows.
print(f'Size of dataset: {len(df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

In [None]:
os.makedirs('tmp/covid/images/train', exist_ok=True)#在images底下建立train和valid資料夾
os.makedirs('tmp/covid/images/valid', exist_ok=True)

os.makedirs('tmp/covid/labels/train', exist_ok=True)#在labels底下建立train和valid資料夾
os.makedirs('tmp/covid/labels/valid', exist_ok=True)

! ls tmp/covid/images

In [None]:
# Disabled cell: the image-copy loop below is wrapped in a string literal, so it does not execute.
'''
# Move the images to relevant split folder.把圖像依照剛剛合併的excel檔標籤train or valid分類到兩個資料夾下
for i in tqdm(range(len(df))):
    row = df.loc[i]
    if row.split == 'train':
        copyfile(row.path, f'tmp/covid/images/train/{row.id}.jpg')
    else:
        copyfile(row.path, f'tmp/covid/images/valid/{row.id}.jpg')
df.head()
'''

In [None]:
# Create .yaml file建立.yaml檔
import yaml
#dict是key and value
data_yaml = dict(
    train = '../covid/images/train',
    val = '../covid/images/valid',
    nc = 2,#兩個類別
    names = ['none', 'opacity']#類別分別叫none，opacity
)

# Note that I am creating the file in the yolov5/data/ directory.建立"tmp/yolov5/data/data.yaml"
# 打開檔案w是覆寫
with open('tmp/yolov5/data/data.yaml', 'w') as outfile:
    #dump轉成str寫入json檔
    yaml.dump(data_yaml, outfile, default_flow_style=True)
    
#cat 輸出檔案內容
%cat tmp/yolov5/data/data.yaml

In [None]:
# Get the raw bounding boxes by parsing the label column of a row.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    """Extract the box coordinates from ``row.label``.

    The label is split on single spaces and read in groups of six tokens.
    The first two tokens of every group are skipped and the remaining four
    are converted to float. Only complete groups produce a box.
    """
    tokens = row.label.split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        # Tokens 2..5 of the group hold the four coordinates.
        coords = [float(tok) for tok in tokens[start + 2:start + 6]]
        if len(coords) == 4:
            bboxes.append(coords)
    return bboxes

# Scale the bounding boxes according to the size of the resized image.
def scale_bbox(row, bboxes):
    """Rescale boxes to the IMG_SIZE x IMG_SIZE image.

    Values at positions 0 and 2 (x) are multiplied by IMG_SIZE / row.dim1 and
    values at positions 1 and 3 (y) by IMG_SIZE / row.dim0. Each product is
    rounded to four decimals and then truncated by int().

    Returns a list of [xmin, ymin, xmax, ymax] integer lists.
    """
    scale_x = IMG_SIZE / row.dim1
    scale_y = IMG_SIZE / row.dim0
    # One factor per coordinate position: x, y, x, y.
    factors = (scale_x, scale_y, scale_x, scale_y)

    return [
        [int(np.round(bbox[k] * factors[k], 4)) for k in range(4)]
        for bbox in bboxes
    ]

# Convert the bounding boxes to YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert corner boxes to normalized YOLO boxes.

    Args:
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        bboxes: Iterable of boxes whose first four items are
            xmin, ymin, xmax, ymax.

    Returns:
        List of [x_center, y_center, width, height]; x values are divided by
        img_w and y values by img_h.
    """
    yolo_boxes = []
    for bbox in bboxes:
        xmin, ymin, xmax, ymax = bbox[:4]
        w = xmax - xmin
        h = ymax - ymin
        # Use the exact centre. Rounding the half extent to an integer moved the
        # centre by up to half a pixel (and rounded .5 cases to the even value).
        xc = xmin + w / 2
        yc = ymin + h / 2

        yolo_boxes.append([xc / img_w, yc / img_h, w / img_w, h / img_h])
    return yolo_boxes

In [None]:
# Disabled cells: both blocks below are string literals, so neither the single-row
# test nor the loop that writes the label .txt files executes.
'''
#test
row = df.loc[0]
img_id = row.id
split = row.split
label = row.image_level
print("img_id:"+img_id,"split:"+split,"label:"+label)
if label=='opacity':
    # Get bboxes
    bboxes = get_bbox(row)
    # Scale bounding boxes
    scale_bboxes = scale_bbox(row, bboxes)
    # Format for YOLOv5
    yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)

''' 
'''
# Prepare the txt files for bounding box準備bbox的txt檔
for i in tqdm(range(len(df))):#tqdm進度條
    row = df.loc[i]
    # Get image id
    img_id = row.id
    # Get split
    split = row.split
    # Get image-level label(opacity or none)
    label = row.image_level
    
    if row.split=='train':
        file_name = f'tmp/covid/labels/train/{row.id}.txt'
    else:
        file_name = f'tmp/covid/labels/valid/{row.id}.txt'
        
    
    if label=='opacity':
        # Get bboxes
        bboxes = get_bbox(row)
        # Scale bounding boxes
        scale_bboxes = scale_bbox(row, bboxes)
        # Format for YOLOv5
        yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)
        
        with open(file_name, 'w') as f:
            for bbox in yolo_bboxes:
                bbox = [1]+bbox
                bbox = [str(i) for i in bbox]
                bbox = ' '.join(bbox)
                f.write(bbox)
                f.write('\n')
   '''

In [None]:
# Switch into the cloned YOLOv5 repository; the path is relative to the current working directory.
%cd tmp/yolov5/

In [None]:
# Disabled cell: the flag notes and the train.py command below are inside a string
# literal, so no training is started here.
'''
--img {IMG_SIZE} \ # 輸入圖片大小
--batch {BATCH_SIZE} \ # 每批訓練數量
--epochs {EPOCHS} \ # 歷遍資料次數
--data data.yaml \ # 設定檔
--weights yolov5s.pt \ # 模組名稱
--save_period 1\ # 多久間隔後存檔
--project kaggle-siim-covid # W&B project name


!python train.py --img {IMG_SIZE} \
                 --batch {BATCH_SIZE} \
                 --epochs {EPOCHS} \
                 --data data.yaml \
                 --weights yolov5s.pt \
                 --project kaggle-siim-covid
'''

In [None]:
# Directory passed to detect.py as --source.
TEST_PATH = '/kaggle/input/siim-covid19-resized-to-256px-jpg/test/'
# Weights file passed to detect.py as --weights.
MODEL_PATH = '/kaggle/working/yolov5/kaggle-siim-covid/exp/weights/best.pt'

In [None]:
import shutil

# Copy the stored weights to the location MODEL_PATH points at. The destination
# reuses MODEL_PATH so the copy target and the detect.py --weights path stay in sync.
src = r"/kaggle/input/mymodule/best.pt"
des = MODEL_PATH
os.makedirs(os.path.dirname(des), exist_ok=True)
shutil.copy(src, des)


In [None]:
'''
--weights {MODEL_PATH} \ # 模型路徑
--source {TEST_PATH} \ # 測試集的絕對路徑
--img {IMG_SIZE} \ # 圖片大小
--conf 0.281 \ # Confidence值的最低值 (預設0.25)
--iou-thres 0.5 \ # IOU最低值 (預設0.45)
--max-det 3 \ # 每張圖的檢測次數 (預設1000次) 
--save-txt \ # 把bbox的座標存成txt檔
--save-conf # 保存每個預測bbox的confidence值
'''
!python detect.py --weights {MODEL_PATH} \
                  --source {TEST_PATH} \
                  --img {IMG_SIZE} \
                  --conf 0.281 \
                  --iou-thres 0.5 \
                  --max-det 3 \
                  --save-txt \
                  --save-conf


In [None]:

# Location of the .txt files holding the bounding-box coordinates.
# NOTE(review): presumably produced by the detect.py run with --save-txt; the 'exp'
# run folder is hard-coded, so confirm it matches the run that was actually written.
PRED_PATH = 'runs/detect/exp/labels'
!ls {PRED_PATH}


In [None]:

# Print the predicted coordinates stored for one image (the file name is hard-coded).
%cat runs/detect/exp/labels/ba91d37ee459.txt


In [None]:

# List the .txt files in PRED_PATH (per the original note, all of them are opacity predictions).
prediction_files = os.listdir(PRED_PATH)
print('Number of test images predicted as opaque: ', len(prediction_files))



# Result noted by the author: 774 images ended up predicted as opacity.


In [None]:
# The submission requires xmin, ymin, xmax, ymax format.
# YOLOv5 returns x_center, y_center, width, height
def correct_bbox_format(bboxes, img_size=None):
    """Convert YOLO centre/size boxes to integer pixel corner boxes.

    Args:
        bboxes: Iterable of [x_center, y_center, width, height]; each value is
            multiplied by ``img_size`` to obtain pixels.
        img_size: Side length in pixels used for the conversion. Defaults to
            the global IMG_SIZE when omitted.

    Returns:
        List of [xmin, ymin, xmax, ymax] lists of ints.
    """
    # NOTE(review): the result is in pixels of the img_size-sized image; confirm
    # whether the submission expects coordinates in the original resolution.
    if img_size is None:
        img_size = IMG_SIZE

    correct_bboxes = []
    for b in bboxes:
        xc, yc = int(np.round(b[0]*img_size)), int(np.round(b[1]*img_size))
        w, h = int(np.round(b[2]*img_size)), int(np.round(b[3]*img_size))

        half_w = int(np.round(w/2))
        half_h = int(np.round(h/2))
        xmin, xmax = xc - half_w, xc + half_w
        ymin, ymax = yc - half_h, yc + half_h

        # Order fixed: the list used to be [xmin, xmax, ymin, ymax], which does not
        # match the xmin, ymin, xmax, ymax order required above.
        correct_bboxes.append([xmin, ymin, xmax, ymax])

    return correct_bboxes

# Read the txt file generated by YOLOv5 during inference and extract
# confidence and bounding box coordinates.
def get_conf_bboxes(file_path):
    """Parse a prediction file into confidences and boxes.

    Every line is split on single spaces and converted to floats. The last
    value is the confidence; the values between the first and the last form
    the box.

    Returns:
        Tuple ``(confidence, bboxes)`` of two lists with one entry per line.
    """
    with open(file_path, 'r') as handle:
        rows = [
            [float(token) for token in raw.strip('\n').split(' ')]
            for raw in handle
        ]
    confidence = [values[-1] for values in rows]
    bboxes = [values[1:-1] for values in rows]
    return confidence, bboxes

In [None]:
# Read the sample submission file; its rows are filled with predictions below.
sub_df = pd.read_csv('/kaggle/input/siim-covid19-detection/sample_submission.csv')
sub_df.tail()

In [None]:
# Prediction loop for submission
# NOTE(review): the class names configured for training are lowercase ('none',
# 'opacity'); confirm the casing of "Negative"/"None" against the sample submission.
predictions = []

for idx in tqdm(range(len(sub_df))):
    parts = sub_df.loc[idx, 'id'].split('_')
    id_name, id_level = parts[0], parts[-1]

    if id_level == 'study':
        # Study-level classification would go here; a dummy prediction is used.
        predictions.append("Negative 1 0 0 1 1")
        continue

    if id_level != 'image':
        # Ids with any other suffix add nothing, as before.
        continue

    # Image level: rely on the detector output. Inference already ran, so an
    # image has an opacity prediction exactly when a .txt file exists for it.
    if f'{id_name}.txt' not in prediction_files:
        predictions.append("None 1 0 0 1 1")
        continue

    confidence, bboxes = get_conf_bboxes(f'{PRED_PATH}/{id_name}.txt')
    bboxes = correct_bbox_format(bboxes)
    # One "opacity <conf> <4 coordinates>" group per detection, space separated.
    predictions.append(' '.join(
        f'opacity {conf} ' + ' '.join(map(str, box))
        for conf, box in zip(confidence, bboxes)
    ))

In [None]:
# Attach the prediction strings and write the submission file to the
# current working directory.
sub_df['PredictionString'] = predictions
file = 'submission.csv'
destination = '/kaggle/working'
sub_df.to_csv(file, index=False)
# Copy the written file into /kaggle/working as well.
shutil.copy(file, destination)
sub_df.tail()

In [None]:
# The submission requires xmin, ymin, xmax, ymax format.
# YOLOv5 returns x_center, y_center, width, height
# NOTE: this re-defines a function already defined in an earlier cell; the
# definition executed last is the one in effect.
def correct_bbox_format(bboxes, img_size=None):
    """Convert YOLO centre/size boxes to integer pixel corner boxes.

    Args:
        bboxes: Iterable of [x_center, y_center, width, height]; each value is
            multiplied by ``img_size`` to obtain pixels.
        img_size: Side length in pixels used for the conversion. Defaults to
            the global IMG_SIZE when omitted.

    Returns:
        List of [xmin, ymin, xmax, ymax] lists of ints.
    """
    # NOTE(review): the result is in pixels of the img_size-sized image; confirm
    # whether the submission expects coordinates in the original resolution.
    if img_size is None:
        img_size = IMG_SIZE

    correct_bboxes = []
    for b in bboxes:
        xc, yc = int(np.round(b[0]*img_size)), int(np.round(b[1]*img_size))
        w, h = int(np.round(b[2]*img_size)), int(np.round(b[3]*img_size))

        half_w = int(np.round(w/2))
        half_h = int(np.round(h/2))
        xmin, xmax = xc - half_w, xc + half_w
        ymin, ymax = yc - half_h, yc + half_h

        # Order fixed: the list used to be [xmin, xmax, ymin, ymax], which does not
        # match the xmin, ymin, xmax, ymax order required above.
        correct_bboxes.append([xmin, ymin, xmax, ymax])

    return correct_bboxes

# Read the txt file generated by YOLOv5 during inference and extract
# confidence and bounding box coordinates.
# NOTE: this re-defines a function already defined in an earlier cell.
def get_conf_bboxes(file_path):
    """Parse a prediction file into confidences and boxes.

    Every line is split on single spaces and converted to floats. The last
    value is the confidence; the values between the first and the last form
    the box.

    Returns:
        Tuple ``(confidence, bboxes)`` of two lists with one entry per line.
    """
    with open(file_path, 'r') as handle:
        rows = [
            [float(token) for token in raw.strip('\n').split(' ')]
            for raw in handle
        ]
    confidence = [values[-1] for values in rows]
    bboxes = [values[1:-1] for values in rows]
    return confidence, bboxes

In [None]:
# Read the sample submission file.
# NOTE: this repeats an earlier cell; sub_df is re-read from the same file.
sub_df = pd.read_csv('/kaggle/input/siim-covid19-detection/sample_submission.csv')
sub_df.tail()

In [None]:
# Prediction loop for submission
# NOTE: this repeats an earlier cell; predictions is rebuilt from scratch here.
predictions = []

for idx in tqdm(range(len(sub_df))):
    parts = sub_df.loc[idx, 'id'].split('_')
    id_name, id_level = parts[0], parts[-1]

    if id_level == 'study':
        # Study-level classification would go here; a dummy prediction is used.
        predictions.append("Negative 1 0 0 1 1")
        continue

    if id_level != 'image':
        # Ids with any other suffix add nothing, as before.
        continue

    # Image level: rely on the detector output. Inference already ran, so an
    # image has an opacity prediction exactly when a .txt file exists for it.
    if f'{id_name}.txt' not in prediction_files:
        predictions.append("None 1 0 0 1 1")
        continue

    confidence, bboxes = get_conf_bboxes(f'{PRED_PATH}/{id_name}.txt')
    bboxes = correct_bbox_format(bboxes)
    # One "opacity <conf> <4 coordinates>" group per detection, space separated.
    predictions.append(' '.join(
        f'opacity {conf} ' + ' '.join(map(str, box))
        for conf, box in zip(confidence, bboxes)
    ))

In [None]:
# Store the prediction strings in the submission frame and write submission.csv
# (without the index) to the current working directory.
sub_df['PredictionString'] = predictions
sub_df.to_csv('submission.csv', index=False)
sub_df.tail()