In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!mkdir tmp
%cd tmp

In [None]:
# Install pinned versions of torch/torchvision (CUDA 10.1 wheels), numpy and PyYAML.
# NOTE(review): the YOLOv5 requirements installed in a later cell may upgrade some of these pins — confirm.
!pip install torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install numpy==1.17
!pip install PyYAML==5.3.1
# Install the COCO API from the PythonAPI subdirectory of the upstream repository.
!pip install git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI

In [None]:
# Clone NVIDIA apex, install it with the C++ and CUDA extensions enabled, then delete the checkout.
!git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex

### Cloning the github repository.

In [None]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import json
import urllib
import PIL.Image as Image
import cv2
import torch
import torchvision
from IPython.display import display
from sklearn.model_selection import train_test_split
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
# Render matplotlib figures inline, using the retina figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Global seaborn theme and default figure size for every plot in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10
# Seed NumPy's global random number generator.
np.random.seed(42)

In [None]:
# Fetch YOLOv5 and install its requirements from inside the checkout.
# NOTE(review): the clone is not pinned to a commit or tag, so the CLI flags and file paths
# used by later cells depend on whatever the default branch contains at run time — confirm.
!git clone https://github.com/ultralytics/yolov5  # clone repo
%cd yolov5
# Install dependencies
%pip install -qr requirements.txt  # install dependencies

# Step back out to the directory that contains the yolov5 checkout.
%cd ../
import torch
# NOTE(review): this rebinds `Image`, replacing the `PIL.Image as Image` alias imported in an earlier cell.
from IPython.display import Image, clear_output  # to display images

# Drop the installation log from the cell output, then report the torch version and the device in use.
clear_output()
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

Log in to Weights & Biases (wandb); model artifacts will be stored in your wandb account.

In [None]:
# Upgrade wandb to the latest release (unpinned).
!pip install -q --upgrade wandb
# Login 
import wandb
# Authenticate this session with wandb.
wandb.login()

In [None]:
import os
import gc
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from shutil import copyfile
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#customize iPython writefile so we can write variables
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    """Write the cell body to the file named on the magic line.

    ``{name}`` placeholders in ``cell`` are filled in from this module's
    global variables via ``str.format`` before writing.
    """
    with open(line, 'w') as target:
        rendered = cell.format(**globals())
        target.write(rendered)

Change `TRAIN_PATH` as needed. It is the path to the resized training images.

In [None]:
# Directory holding the resized training JPEGs; image paths are built as TRAIN_PATH + id + '.jpg'.
TRAIN_PATH = '/kaggle/input/siim-covid19-resized-384512-and-640px/SIIM-COVID19-Resized/img_sz_640/train/'
# Image side length: passed to train.py/detect.py as --img and used to rescale bounding boxes.
IMG_SIZE = 640
BATCH_SIZE = 16   # 16 if yolov5x
# NOTE(review): the training cells pass literal epoch counts ({150}, {120}) and never read EPOCHS — confirm which is intended.
EPOCHS = 30

### Modifying train_csv(because images are resized), adding absolute path, splitting data 

loading train_image_level.csv and making some modifications.

Adding absolute path of images, adding image level labels

In [None]:
%cd ../
%cd ../
# Load image level csv file
df = pd.read_csv('/kaggle/input/siim-covid19-detection/train_image_level.csv')

# Modify values in the id column
df['id'] = df.apply(lambda row: row.id.split('_')[0], axis=1)
# Add absolute path
df['path'] = df.apply(lambda row: TRAIN_PATH+row.id+'.jpg', axis=1)
# Get image level labels
df['image_level'] = df.apply(lambda row: row.label.split(' ')[0], axis=1)

df.head(5)

`meta_df` is stored in the resized dataset folder. It records the dimensions (`dim0`, `dim1`) from which the ratio by which each image was shrunk is computed.

In [None]:
# Per-image metadata shipped alongside the resized images.
meta_df = pd.read_csv('/kaggle/input/siim-covid19-resized-384512-and-640px/SIIM-COVID19-Resized/img_sz_640/meta_sz_640.csv')
# Keep the training rows, drop the now-constant split column and give the rest short names.
train_meta_df = meta_df[meta_df['split'] == 'train'].drop(columns='split')
train_meta_df.columns = ['id', 'dim0', 'dim1']

train_meta_df.head(2)

In [None]:
# Attach dim0/dim1 to every annotated image; the left join keeps all rows of df.
df = pd.merge(df, train_meta_df, how='left', on='id')
df.head(2)

In [None]:
# Create train and validation split (85/15), stratified on the image-level label.
train_df, valid_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df.image_level.values)

# assign() returns new frames, so the split tag is never written into a slice of df
# (in-place .loc assignment on these frames triggers pandas' SettingWithCopyWarning).
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='valid')

# Recombine into one frame with a fresh 0..n-1 index; later cells look rows up by position via df.loc[i].
df = pd.concat([train_df, valid_df]).reset_index(drop=True)

In [None]:
# Report how many images ended up in the whole dataset and in each split.
print(f'Size of dataset: {len(df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

In [None]:

os.makedirs('working/tmp/covid/images/train', exist_ok=True)
os.makedirs('working/tmp/covid/images/valid', exist_ok=True)

os.makedirs('working/tmp/covid/labels/train', exist_ok=True)
os.makedirs('working/tmp/covid/labels/valid', exist_ok=True)

! ls working/tmp/covid/images

In [None]:
# Show what the scratch directory contains at this point.
os.listdir('working/tmp')

In [None]:
# Copy every image into the images/ folder of its split (anything not tagged 'train' goes to valid/).
for i in tqdm(range(len(df))):
    row = df.loc[i]
    subset = 'train' if row.split == 'train' else 'valid'
    copyfile(row.path, f'working/tmp/covid/images/{subset}/{row.id}.jpg')

yaml file will be used for training

In [None]:
# Create .yaml file 
import yaml

data_yaml = dict(
    train = '../covid/images/train',
    val = '../covid/images/valid',
    nc = 1,
    names = ['opacity']
)

# Note that I am creating the file in the yolov5/data/ directory.
with open('working/tmp/yolov5/data/data.yaml', 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)
    
%cat working/tmp/yolov5/data/data.yaml

In [None]:
# Get the raw bounding box by parsing the row value of the label column.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    """Parse ``row.label`` into a list of four-float boxes.

    The label is split on single spaces and read in groups of six tokens.
    The first two tokens of each group are skipped and the remaining four
    are converted to ``float``. A trailing group with fewer than six tokens
    contributes no box.

    Raises:
        ValueError: if a token in a coordinate position is not a number.
    """
    tokens = row.label.split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        coords = [float(token) for token in tokens[start + 2:start + 6]]
        if len(coords) == 4:
            bboxes.append(coords)
    return bboxes

# Scale the bounding boxes according to the size of the resized image.
def scale_bbox(row, bboxes):
    """Rescale boxes by IMG_SIZE/row.dim1 horizontally and IMG_SIZE/row.dim0 vertically.

    Each scaled coordinate is rounded to four decimals and then truncated
    to ``int``. Returns one ``[xmin, ymin, xmax, ymax]`` list per input box.
    """
    # Get scaling factor
    scale_x = IMG_SIZE / row.dim1
    scale_y = IMG_SIZE / row.dim0

    def _to_pixel(value, factor):
        return int(np.round(value * factor, 4))

    return [
        [
            _to_pixel(box[0], scale_x),
            _to_pixel(box[1], scale_y),
            _to_pixel(box[2], scale_x),
            _to_pixel(box[3], scale_y),
        ]
        for box in bboxes
    ]

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert ``[xmin, ymin, xmax, ymax]`` boxes to ``[x_center, y_center, width, height]``.

    The centre is the min corner plus half the extent rounded with
    ``np.round``. x values are divided by ``img_w``, y values by ``img_h``.
    """
    def _convert(box):
        xmin, ymin = box[0], box[1]
        width = box[2] - xmin
        height = box[3] - ymin
        x_center = xmin + int(np.round(width / 2))
        y_center = ymin + int(np.round(height / 2))
        return [x_center / img_w, y_center / img_h, width / img_w, height / img_h]

    return [_convert(box) for box in bboxes]

In [None]:
# Prepare the txt files for bounding box
for i in tqdm(range(len(df))):
    row = df.loc[i]

    # Only images labelled 'opacity' get a label file; every other image is left without one.
    if row.image_level != 'opacity':
        continue

    subset = 'train' if row.split == 'train' else 'valid'
    file_name = f'working/tmp/covid/labels/{subset}/{row.id}.txt'

    # Raw boxes -> boxes in the resized image -> normalised YOLO boxes.
    yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bbox(row, get_bbox(row)))

    with open(file_name, 'w') as f:
        for bbox in yolo_bboxes:
            # One line per box: class id 0 followed by the four normalised values.
            f.write(' '.join(str(value) for value in [0] + bbox))
            f.write('\n')

In [None]:
# Run everything from here on inside the YOLOv5 checkout (train.py / detect.py are invoked by relative path).
# %cd working/tmp/yolov5/
%cd /kaggle/working/tmp/yolov5/

In [None]:
# List the hyperparameter files that ship with the checkout.
os.listdir('/kaggle/working/tmp/yolov5/data/hyps')

In [None]:
import yaml

# Read the hyperparameter file into a dict.
# NOTE(review): the name `fruits_list` is kept because later cells refer to it; the value holds hyperparameters.
with open('/kaggle/working/tmp/yolov5/data/hyps/hyp.scratch.yaml') as hyp_file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    fruits_list = yaml.load(hyp_file, Loader=yaml.FullLoader)

print(fruits_list)

In [None]:
# Override selected hyperparameters in the loaded dict (written back to disk by a later cell).
fruits_list.update({
    "lrf": 0.032,
    "box": 0.1,
    "cls": 1.0,
    "cls_pw": 0.5,
    "obj": 2.0,
    "obj_pw": 0.5,
    "anchors": 0,
    "translate": 0.2,
    "scale": 0.6,
    "flipud": 0.2,
    "fliplr": 0.5,
})

In [None]:
# dont change mosaic.
# apply rotation
# apply fliplr

In [None]:
# Overwrite the hyperparameter file with the modified values.
# yaml.dump returns None when it is given a stream, so its result is not kept.
with open("/kaggle/working/tmp/yolov5/data/hyps/hyp.scratch.yaml", 'w') as file:
    yaml.dump(fruits_list, file)

In [None]:
# Read the file back to confirm that the overrides were written.
with open('/kaggle/working/tmp/yolov5/data/hyps/hyp.scratch.yaml') as hyp_file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    fruits_list = yaml.load(hyp_file, Loader=yaml.FullLoader)

print(fruits_list)

## training model and test images prediction

In [None]:
# Train with the yolov5l model config, starting from the yolov5l.pt weights, on the dataset described by data.yaml.
# --img and --batch come from IMG_SIZE and BATCH_SIZE.
# NOTE(review): the epoch count is the literal 150; the EPOCHS constant is not used here — confirm.
!python train.py    --img {IMG_SIZE} \
                    --batch {BATCH_SIZE} \
                    --epochs {150} \
                    --data data.yaml \
                    --weights yolov5l.pt \
                    --cfg models/yolov5l.yaml\
                    --save_period 1 \
                    --project kaggle-siim-covid-yolov5l-clas1-mod8

# here you can choose which model you want. Till now i have observed yolov5x gives best results

In [None]:
# List the contents of the yolov5 runs/ directory.
os.listdir("/kaggle/working/tmp/yolov5/runs")

In [None]:
# Second training run, initialised from a best.pt checkpoint stored under artifacts/.
# NOTE(review): artifacts/ is populated by the wandb download cell further down, so on a fresh
# top-to-bottom run this weights path presumably does not exist yet — confirm the intended order.
# NOTE(review): the epoch count is the literal 120; the EPOCHS constant is not used here — confirm.
!python train.py    --img {IMG_SIZE} \
                    --batch {BATCH_SIZE} \
                    --epochs {120} \
                    --data data.yaml \
                    --weights /kaggle/working/tmp/yolov5/artifacts/run_2yubez04_model:v29/best.pt \
                    --save_period 1 \
                    --project kaggle-siim-covid-yolov5l-clas1-mod7

Your model is now stored as an artifact in your wandb account and also in the yolov5 directory here. The command for predicting on the test data is given further below.

Loading pretrained model artifacts from wandb. You can ignore this if you have just trained model. Just change save path

In [None]:
# Start a wandb run; the next cell uses it to fetch a model artifact.
run = wandb.init()

In [None]:
# Select the model artifact to use; the commented lines are earlier candidates.
# artifact = run.use_artifact("39ajinkya/kaggle-siim-covid-yolov5l-t3-clas1/run_1qerm3x5_model:v24")
# artifact = run.use_artifact("39ajinkya/kaggle-siim-covid-yolov5l-clas1-mod6/run_2yubez04_model:v29")
artifact = run.use_artifact("39ajinkya/kaggle-siim-covid-yolov5l-clas1-mod8/run_3l287lxi_model:v108")

# Download the artifact; the value returned by download() is kept as artifact_dir.
artifact_dir = artifact.download()

In [None]:
# artifact = run.use_artifact("39ajinkya/kaggle-siim-covid-yolov5x-t1-clas1/run_lkm1qq0s_model:v19")
# artifact_dir = artifact.download()

In [None]:
# run_1qerm3x5_model:v24

In [None]:
# Close the wandb run. finish() is the supported call; join() is its deprecated alias.
run.finish()

In [None]:
# Check which artifacts are now present in the yolov5 artifacts/ directory.
os.listdir("/kaggle/working/tmp/yolov5/artifacts")

**Put proper path in cell below and run detect.py to generate results**.

In [None]:
# Weights file handed to detect.py via --weights; the path is relative to the current directory.
# NOTE(review): the active wandb download cell fetches run_3l287lxi_model:v108, not
# run_2yubez04_model:v29 — confirm that this path exists before running detect.py.
# MODEL_PATH = "artifacts/run_2xb4vetk_model:v29/best.pt"
# MODEL_PATH = "artifacts/run_1qerm3x5_model:v24/last.pt"
MODEL_PATH = "artifacts/run_2yubez04_model:v29/best.pt"

In [None]:
# MODEL_PATH = 'kaggle-siim-covid-yolov5l-t3-clas2/exp/weights/best.pt'
# Folder of test images handed to detect.py via --source; the path is relative to the current directory.
# NOTE(review): the dataset name says 512px while IMG_SIZE is 640 — confirm this is the intended test set.
TEST_PATH = '../../../input/siim-covid19-resized-to-512px-png/test/'

In [None]:
# Run detect.py on the test images with the weights at MODEL_PATH.
# The flags set a 0.3 confidence threshold, a 0.5 IoU threshold and at most 3 detections per image,
# and ask for text output that includes confidences (--save-txt, --save-conf).
!python detect.py --weights {MODEL_PATH} \
                  --source {TEST_PATH} \
                  --img {IMG_SIZE} \
                  --conf 0.3 \
                  --iou-thres 0.5 \
                  --max-det 3 \
                  --save-txt \
                  --save-conf

In [None]:
# a = wandb.restore('39ajinkya/kaggle-siim-covid-yolov5x-t1-clas1/lkm1qq0s')
# # 39ajinkya/kaggle-siim-covid-yolov5x-t1-clas1/lkm1qq0s

In [None]:
!python train.py --resume wandb-artifact://39ajinkya/kaggle-siim-covid-yolov5l-clas1-mod6/2yubez04 \
                 --epochs {150}\
# #!python train.py --resume MODEL_PATH                 

In [None]:
# List the artifacts/ directory again.
os.listdir("/kaggle/working/tmp/yolov5/artifacts")

In [None]:
# os.listdir('/kaggle/tmp/yolov5/runs/detect/exp/labels')

In [None]:
# Folder of per-image prediction .txt files that the submission cells read.
# presumably written by the detect.py run (--save-txt) — confirm the exp suffix matches your run
PRED_PATH = 'runs/detect/exp/labels'  # it can be exp/exp2/exp3 depending on your count of running detect.py. First run will save results in exp
!ls {PRED_PATH}

In [None]:
# Visualize predicted coordinates.
%cat runs/detect/exp3/labels/ba91d37ee459.txt

In [None]:
# File names present in PRED_PATH; the submission loop checks '<image id>.txt' against this list.
prediction_files = os.listdir(PRED_PATH)
print('Number of test images predicted as opaque: ', len(prediction_files))

## Store results in submission.csv file  

In [None]:
# The submission requires xmin, ymin, xmax, ymax format.
# YOLOv5 returns x_center, y_center, width, height
def correct_bbox_format(bboxes, img_size=None):
    """Convert normalised centre-format boxes to pixel corner-format boxes.

    Args:
        bboxes: iterable of ``[x_center, y_center, width, height]`` values,
            each expressed as a fraction of the image side.
        img_size: side length in pixels to scale by. Defaults to the
            notebook-level ``IMG_SIZE`` when omitted.

    Returns:
        One ``[xmin, ymin, xmax, ymax]`` list of ints per input box, in the
        order the submission format expects.

    NOTE(review): coordinates are expressed on an img_size x img_size canvas;
    presumably they still need rescaling to each image's original resolution
    before submission — confirm.
    """
    if img_size is None:
        img_size = IMG_SIZE

    correct_bboxes = []
    for b in bboxes:
        xc, yc = int(np.round(b[0]*img_size)), int(np.round(b[1]*img_size))
        w, h = int(np.round(b[2]*img_size)), int(np.round(b[3]*img_size))

        half_w = int(np.round(w/2))
        half_h = int(np.round(h/2))

        xmin, xmax = xc - half_w, xc + half_w
        ymin, ymax = yc - half_h, yc + half_h

        # Order matters: x/y of the min corner first, then x/y of the max corner.
        correct_bboxes.append([xmin, ymin, xmax, ymax])

    return correct_bboxes

# Read the txt file generated by YOLOv5 during inference and extract
# confidence and bounding box coordinates.
def get_conf_bboxes(file_path):
    """Parse a prediction text file into confidences and boxes.

    Each non-blank line is a whitespace-separated list of numbers. The last
    value is taken as the confidence, the first value is skipped (presumably
    the class id — confirm against the detect.py output format), and the
    values in between form the box. Blank lines and extra whitespace are
    tolerated.

    Args:
        file_path: path of the text file to read.

    Returns:
        ``(confidence, bboxes)``: two lists of equal length, one entry per
        parsed line.

    Raises:
        ValueError: if a field on a line is not a number.
    """
    confidence = []
    bboxes = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            preds = [float(field) for field in fields]
            confidence.append(preds[-1])
            bboxes.append(preds[1:-1])
    return confidence, bboxes

In [None]:
# Read the sample submission file; the prediction loop produces one entry per row of it.
sub_df = pd.read_csv('/kaggle/input/siim-covid19-detection/sample_submission.csv')
sub_df.tail()

In [None]:
# Prediction loop for submission
predictions = []

for i in tqdm(range(len(sub_df))):
    id_parts = sub_df.loc[i].id.split('_')
    id_name, id_level = id_parts[0], id_parts[-1]

    if id_level == 'study':
        # do study-level classification
        predictions.append("Negative 1 0 0 1 1") # dummy prediction
        continue

    if id_level != 'image':
        continue

    # we can do image-level classification here.
    # also we can rely on the object detector's classification head.
    # for this example submission we will use YOLO's classification head.
    # since we already ran the inference we know which test images belong to opacity.
    if f'{id_name}.txt' not in prediction_files:
        predictions.append("None 1 0 0 1 1")
        continue

    # opacity label: one "opacity <conf> <box values>" group per detection, space-separated
    confidence, bboxes = get_conf_bboxes(f'{PRED_PATH}/{id_name}.txt')
    bboxes = correct_bbox_format(bboxes)
    predictions.append(' '.join(
        f'opacity {conf} ' + ' '.join(map(str, box))
        for conf, box in zip(confidence, bboxes)
    ))

In [None]:
# Store the prediction strings in the submission frame and write it out without the index column.
sub_df['PredictionString'] = predictions
sub_df.to_csv('/kaggle/working/submission.csv', index=False)
sub_df.tail()

In [None]:
# Rows that received the "None" prediction, i.e. images with no prediction file in PRED_PATH.
sub_df.loc[sub_df['PredictionString'] == "None 1 0 0 1 1"]

In [None]:
# Confirm that submission.csv is present in the output directory.
os.listdir('/kaggle/working')

creating a coco format dataset.

In [None]:
# #creating a coco format dataset.
# df.iloc[0].path

In [None]:
# annotation = dict()

# for i in tqdm(range(len(df))):
#     row = df.loc[i]
#     # Get image id
#     img_id = row.id
#     # Get split
#     split = row.split
#     # Get image-level label
#     label = row.image_level
    
#     file_name = row.path
        
    
#     if label=='opacity':
#         # Get bboxes
#         bboxes = get_bbox(row)
#         # Scale bounding boxes
#         scale_bboxes = scale_bbox(row, bboxes)
#         # Format for YOLOv5
#         yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)
        
#         l = []
#         for bbox in yolo_bboxes:
#             l.append({'bbox':bbox,'label':'opacity'})
#         annotation[file_name] = l

In [None]:
# annotation

In [None]:
# del dataset

In [None]:
# image_path = 'kaggle/input/siim-covid19-resized-to-256px-jpg/train/*'
# glob.glob(image_path)

In [None]:
# import glob
# import fiftyone as fo

# image_path = 'kaggle/input/siim-covid19-resized-to-256px-jpg/train/*'

# # Ex: your custom label format

# # Create dataset
# dataset = fo.Dataset(name="siim-covid-19-6")

# # Persist the dataset on disk in order to 
# # be able to load it in one line in the future
# dataset.persistent = True

# # Add your samples to the dataset
# for filepath in annotation:
#     sample = fo.Sample(filepath=filepath)
#     sample.tags.append('images')
#     # Convert detections to FiftyOne format
#     detections = []
#     for obj in annotation[filepath]:
#         label = obj["label"]

#         # Bounding box coordinates should be relative values
#         # in [0, 1] in the following format:
#         # [top-left-x, top-left-y, width, height]
#         bounding_box = obj["bbox"]
        
#         detections.append(
#             fo.Detection(label=label, bounding_box=bounding_box)
#         )

#     # Store detections in a field name of your choice
#     sample["train"] = fo.Detections(detections=detections)

#     dataset.add_sample(sample)

In [None]:
# view = dataset.match_tags('images')
# for sample in view:
#     print(sample)

In [None]:
# export_dir = "/path/for/coco-detection-dataset-1"
# label_field = "ground_truth"  # for example

# # Export the dataset
# dataset.export(
#     export_dir=export_dir,
#     dataset_type=fo.types.COCODetectionDataset,
#     label_field=label_field,
# )

In [None]:
# !pip install fiftyone