# Version note
- Ver 4: Fold 4 hyper
- Ver 5: Fold 4 Finetune
- Ver 7: Fold 3 hyper
- Ver 9: Fold 3 Finetune
- Ver 10: Fold 2 hyper
- Ver 11: Fold 2 Finetune
- Ver 12: Fold 1 hyper
- Ver 13: Fold 1 Finetune
- Ver 14: Fold 0 hyper
- Ver 15: Fold 0 Finetune
# NOTES

## Step 1: Preprocessing
### 1.1: Download the data and create your own dataset by reading the DICOM images and resizing them
### 1.2: Try to merge the annotations with WBF (Weighted Boxes Fusion)
### 1.3: Load the dataset to Kaggle and create a DataFrame that includes x_mid, y_mid, w and h, following the YOLOv5 bbox annotation format
### 1.4: Create the final training DataFrame

## Step 2: 
### 2.1: Create all the files needed for training, including:
        - Annotation .txt file
        - train.txt and valid.txt and test.txt
        - custom.yaml
### 2.2: Train with the scratch hyperparameters to find the best setting for YOLOv5
        - hyp.scratch.yaml
### 2.3: Training YOLOv5 with Abnormal Images
        - hyp.finetune.yaml
## Step 3: Two-class filter
### 3.1: Train on Abnormal and Normal images to get a classification model
### 3.2: Combine the two-class filter classification model with the YOLOv5 results -> Submission file

# SET UP FOR NOTEBOOK

## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm
import seaborn as sns
import os
from sklearn.model_selection import GroupKFold
import cv2
import PIL.Image as Image


print('Setup Completed')

## Global value

In [None]:
# ===============================
#Conf for WBF
# iou_thr and skip_box_thr are passed to weighted_boxes_fusion in the
# "Merge Annotation" cell.
iou_thr = 0.5
skip_box_thr = 0.0001
# NOTE(review): sigma is not referenced anywhere in the visible cells.
sigma = 0.1
# ===============================
#Fold value
# NOTE(review): dim is not referenced in the visible cells; the image size is
# passed literally (--img 640) on the YOLOv5 command lines.
dim = 640 #512, 256, 'original'
# Index of the fold used for validation; the cross-validation cell builds
# 5 folds, so valid values are 0-4.
fold_num = 4

## Global function

In [None]:
def Preprocessing(input_path, output_path):
    """Enhance the contrast of one image and save the result.

    The image is read with OpenCV, converted to grayscale, equalised with
    CLAHE, expanded back to three channels, min-max normalised to 0-255 and
    written to ``output_path`` through PIL.

    Args:
        input_path: Path of the image file to read.
        output_path: Path the processed image is saved to.

    Raises:
        OSError: If OpenCV cannot read ``input_path``.
    """
    #Read image
    img = cv2.imread(input_path)
    # cv2.imread reports failure by returning None instead of raising; without
    # this check the problem only surfaces as an opaque error in cvtColor.
    if img is None:
        raise OSError(f'cv2.imread could not read image: {input_path}')
    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    #Histogram Equalization
    # create a CLAHE object
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    equalized = clahe.apply(gray_image)
    img_f = cv2.cvtColor(equalized, cv2.COLOR_GRAY2BGR)

    #Normalization
    # dst=None lets OpenCV allocate the output; the previous 800x800 float
    # buffer never matched the 3-channel source and was ignored.
    n_img = cv2.normalize(img_f, None, 0, 255, cv2.NORM_MINMAX)

    # All three channels are identical here, so handing the BGR array to PIL
    # (which assumes RGB) does not change the picture.
    # NOTE(review): `Image` must still be PIL.Image at call time; a later cell
    # rebinds the name with `from IPython.display import Image`.
    final_image = Image.fromarray(n_img)
    final_image.save(output_path)
    
def create_annotation(image_id,output_path,data):
    """Write the YOLO-style label file for one image.

    One line ``class_id x_mid y_mid w h`` is written for every row of
    ``data`` whose ``image_id`` matches. An image without matching rows
    produces an empty file.

    Args:
        image_id: Identifier of the image; also the stem of the output file.
        output_path: Directory in which ``<image_id>.txt`` is created.
        data: DataFrame with the columns ``image_id``, ``class_id``,
            ``x_mid``, ``y_mid``, ``w`` and ``h``.
    """
    save_txt_path = os.path.join(output_path, image_id + ".txt")
    image_data = data.loc[data.image_id == image_id]
    columns = ("class_id", "x_mid", "y_mid", "w", "h")
    # The context manager closes the file even if a write fails part-way.
    with open(save_txt_path, "w") as file:
        # Walk the five columns in lockstep instead of doing five chained
        # label lookups per row.
        for object_label, x_centre, y_centre, w, h in zip(
            *(image_data[column] for column in columns)
        ):
            file.write(f'{object_label} {x_centre} {y_centre} {w} {h}\n')
    


# STEP1: IMPORT DATA + SPLIT DATASET + MERGE ANNOTATION

## Read data

In [None]:
#Define folder path (Custom)
# NOTE: despite the *_DIR names, these point at CSV files, not directories.
TRAIN_DIR = '/kaggle/input/vinbigdata-640pixel/train_vin.csv'
# TEST_DIR is only referenced by a commented-out read in the visible cells.
TEST_DIR = '/kaggle/input/vinbigdata-640pixel/test_vin.csv'
#Define folder path (Origin)
# CSV with the per-radiologist annotations that are merged with WBF below.
Origin_TRAIN_DIR = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train.csv'

## Merge Annotation
We have annotations from many radiologists. They annotated the same finding but drew different bounding boxes, so we merge the overlapping boxes.

In [None]:
# Install the ensemble-boxes package (presumably the source of
# weighted_boxes_fusion, which is not defined anywhere else in the visible cells).
!pip install ensemble-boxes
# NOTE(review): the wildcard import hides which names are used; the visible
# cells only call weighted_boxes_fusion.
from ensemble_boxes import *

print('Setup WBF completed')

In [None]:
# Merge the boxes drawn by different radiologists on the same image with
# Weighted Boxes Fusion (WBF) and collect the fused boxes in Final_df.
df = pd.read_csv(Origin_TRAIN_DIR)
df.fillna(0, inplace=True)
# class_id 14 is the "normal" class (see the Normal/Abnormal split below).
# Its rows get a dummy 0..1 box so every image has a non-zero maximum to scale by.
df.loc[df["class_id"] == 14, ['x_max', 'y_max']] = 1.0

# Coordinates are selected by name rather than by position (previously
# iloc[:, 4:]) so extra or reordered CSV columns cannot leak into the scaling.
# NOTE(review): assumes the CSV's columns from index 4 on are exactly these four.
coord_cols = ["x_min", "y_min", "x_max", "y_max"]

results = []
image_ids = df["image_id"].unique()

# groupby(sort=False) yields the images in order of first appearance (the same
# order as image_ids) in a single pass, instead of re-filtering df per image.
for image_id, data in tqdm(df.groupby("image_id", sort=False), total=len(image_ids)):

    # WBF expects the coordinates in 0-1 range.
    max_value = data[coord_cols].values.max()
    scaled_boxes = (data[coord_cols] / max_value).values.tolist()

    # One entry per radiologist, kept in order of first appearance.
    annotations = {}
    for rad_id, class_id, box in zip(data["rad_id"], data["class_id"], scaled_boxes):
        per_rad = annotations.setdefault(
            rad_id,
            {"boxes_list": [], "scores_list": [], "labels_list": []},
        )
        per_rad["boxes_list"].append(box)
        per_rad["scores_list"].append(1.0)
        per_rad["labels_list"].append(class_id)

    boxes_list = [entry["boxes_list"] for entry in annotations.values()]
    scores_list = [entry["scores_list"] for entry in annotations.values()]
    labels_list = [entry["labels_list"] for entry in annotations.values()]

    # We consider all of the radiologists as equal.
    weights = [1.0] * len(annotations)

    # Calculate WBF
    boxes, scores, labels = weighted_boxes_fusion(
        boxes_list,
        scores_list,
        labels_list,
        weights=weights,
        iou_thr=iou_thr,
        skip_box_thr=skip_box_thr
    )

    # Scale the fused boxes back to the original coordinate range.
    for box, label in zip(boxes, labels):
        results.append({
            "image_id": image_id,
            "class_id": int(label),
            "rad_id": "wbf",
            "x_min": box[0] * max_value,
            "y_min": box[1] * max_value,
            "x_max": box[2] * max_value,
            "y_max": box[3] * max_value,
        })

Final_df = pd.DataFrame(results)
display(df.head())
display(Final_df.head())
print(f'Size of origin Dataframe: {df.shape}')
print(f'Size of WBF Dataframe: {Final_df.shape}')
print(f'Number of images: {len(image_ids)}')

In [None]:
#Custom dataset
train_df = pd.read_csv(TRAIN_DIR)
#test_df = pd.read_csv(TEST_DIR)
display(train_df.head())
print(train_df.shape)

In [None]:
# Build image_id -> width / height lookups from train_df with one indexed
# selection, instead of filtering the whole frame twice per image.
image_dims = (
    train_df.drop_duplicates("image_id")  # first row per image, like unique()[0]
    .set_index("image_id")[["width", "height"]]
)
# Restrict to the annotated images; .loc raises KeyError right here if any of
# them is missing from train_df.
image_dims = image_dims.loc[list(image_ids)]
width = image_dims["width"].to_dict()
height = image_dims["height"].to_dict()

In [None]:
#ADD width and height of image from train_df to Final_df
Final_df['width'] = Final_df.apply(lambda row: width[row.image_id], axis =1)
Final_df['height'] = Final_df.apply(lambda row: height[row.image_id], axis =1)

## Calculate x_mid, y_mid, height_bbox and width_bbox

In [None]:
# Box edges as fractions of the image size.
left = Final_df['x_min'] / Final_df['width']
right = Final_df['x_max'] / Final_df['width']
top = Final_df['y_min'] / Final_df['height']
bottom = Final_df['y_max'] / Final_df['height']

# Box centre and box size, all relative to the image dimensions.
Final_df['x_mid'] = (right + left) / 2
Final_df['y_mid'] = (bottom + top) / 2
Final_df['w'] = right - left
Final_df['h'] = bottom - top

In [None]:
# Preview Final_df, which now also carries the x_mid, y_mid, w and h columns.
display(Final_df.head())

## Split data to Normal and Abnormal

In [None]:
# ===============================
# Rows with class_id 14 are treated as "Normal"; every other class is "Abnormal".
is_normal = Final_df.class_id == 14
# ===============================
#Abnormal
abnormal_train_df = Final_df.loc[~is_normal].reset_index(drop=True)
print('Abnormal: ')
display(abnormal_train_df.head())
print(f'Number of Abnormal value: {len(abnormal_train_df)}')
print(f'Number of Abnormal image: {abnormal_train_df.image_id.nunique(dropna=False)}')
# ===============================
#Normal
normal_train_df = Final_df.loc[is_normal].reset_index(drop=True)
print('\nNormal: ')
display(normal_train_df.head())
print(f'Number of Normal value: {len(normal_train_df)}')
print(f'Number of Normal image: {normal_train_df.image_id.nunique(dropna=False)}')
# ===============================

## List of classes

In [None]:
# ===============================
#List of Disease in Data
# Unique (class_id, class_name) pairs, ordered by class id.
id_name_pairs = sorted(set(zip(df.class_id, df.class_name)), key=lambda pair: pair[0])
class_ids, class_names = zip(*id_name_pairs)
classes = [str(name) for name in class_names]
# Drop the entry with the highest class id so it is not a detection class.
del classes[-1]
classes
# ===============================

# Step 2:
## 2.1: Create all the files needed for training


### Cross validation

In [None]:
# Assign every abnormal annotation to one of n_splits folds, grouped by
# image_id so that all boxes of an image land in the same fold.
n_splits = 5
if not 0 <= fold_num < n_splits:
    # An out-of-range fold would silently produce an empty validation set.
    raise ValueError(f'fold_num must be in [0, {n_splits - 1}], got {fold_num}')

gkf = GroupKFold(n_splits=n_splits)
abnormal_train_df['fold'] = -1
fold_col = abnormal_train_df.columns.get_loc('fold')
for fold, (train_idx, val_idx) in enumerate(
    gkf.split(abnormal_train_df, groups=abnormal_train_df.image_id.tolist())
):
    # split() yields positional indices, so write through iloc; this stays
    # correct even if the frame's index is not 0..n-1.
    abnormal_train_df.iloc[val_idx, fold_col] = fold
display(abnormal_train_df.head())

# Images of the selected fold are used for validation, all others for training.
in_val_fold = abnormal_train_df.fold == fold_num
val_files = list(abnormal_train_df[in_val_fold].image_id.unique())
train_files = list(abnormal_train_df[~in_val_fold].image_id.unique())
print(len(train_files))
print(len(val_files))


### Copy Files from input to working directory (Also apply Preprocessing)

In [None]:
# Build the working dataset: preprocessed images, one label file per image,
# and the train.txt / test.txt image lists.
Dir_origin_image = '/kaggle/input/vinbigdata-640pixel/Train'
# ===============================
train_image_path = '/kaggle/working/custom_data/images/train'
train_labels_path = '/kaggle/working/custom_data/labels/train'
val_image_path = '/kaggle/working/custom_data/images/val'
val_labels_path = '/kaggle/working/custom_data/labels/val'
# ===============================
for directory in (train_image_path, val_image_path, train_labels_path, val_labels_path):
    os.makedirs(directory, exist_ok=True)
# ===============================
#Copy, processing image from input to Working and create annotation file
for split_files, image_dir, labels_dir in (
    (train_files, train_image_path, train_labels_path),
    (val_files, val_image_path, val_labels_path),
):
    for image_index in tqdm(split_files, total=len(split_files)):
        path_origin_images = os.path.join(Dir_origin_image, image_index + ".jpg")
        path_images = os.path.join(image_dir, image_index + ".jpg")
        Preprocessing(path_origin_images, path_images)
        create_annotation(image_index, labels_dir, abnormal_train_df)
# ===============================
#Create train.txt and test.txt
Dir_custom = '/kaggle/working/custom_data'
# NOTE(review): `train` and `test` are not used in the visible cells; they are
# kept in case a later cell relies on them.
train = 'train'
test = 'test'
# ===============================
train_txt_path = os.path.join(Dir_custom, 'train.txt')
# test.txt lists the *validation* images; it is used as `val` in custom.yaml.
test_txt_path = os.path.join(Dir_custom, 'test.txt')
# ===============================
# One image path per line; the context manager closes each list file even if
# a write fails.
for list_path, image_dir, split_files in (
    (train_txt_path, train_image_path, train_files),
    (test_txt_path, val_image_path, val_files),
):
    with open(list_path, "w") as list_file:
        for image_id in split_files:
            file_path = os.path.join(image_dir, image_id + ".jpg")
            list_file.write(f'{file_path}\n')

### Create custom.yaml files

In [None]:
#Create custom.yaml files
from os.path import isfile, join
import yaml
data = dict(
    train =  train_txt_path ,
    val   =  test_txt_path,
    nc    = 14,
    names = classes
)
with open(join( Dir_custom , f'custom.yaml'), 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

f = open(join( Dir_custom , f'custom.yaml'), 'r')
print('\nyaml:')
print(f.read())

# YOLOv5

## Install Environment

In [None]:
#cloning yolov5 model
!git clone https://github.com/ultralytics/yolov5

#cloning NVIDIA/apex to speed up the process
# NOTE(review): apex is only cloned here; no visible cell installs or imports it.
!git clone https://github.com/NVIDIA/apex.git

In [None]:
import torch
# WARNING: this rebinds the name `Image`, which the first cell bound to
# PIL.Image. Preprocessing() calls Image.fromarray, so it will fail if it is
# invoked again after this cell has run.
from IPython.display import Image, clear_output  # to display images
# Report the torch version and the properties of GPU 0, or 'CPU' when CUDA is unavailable.
print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

In [None]:
!mv yolov5/* ./

In [None]:
!pip install -r requirements.txt

In [None]:
!python detect.py --weights yolov5s.pt --img 640 --conf 0.25 --source /kaggle/working/data/images/zidane.jpg

In [None]:

# Display the zidane.jpg written to runs/detect/exp (`Image` here is
# IPython.display.Image, imported a few cells above).
Image(filename='/kaggle/working/runs/detect/exp/zidane.jpg', width=600)

## Hyperparameters for YOLOv5

import shutil
# Copy the train.py from the `trainfile` input dataset into the working
# directory, overwriting any train.py already there.
train_script = 'train.py'
train_input = os.path.join('/kaggle/input/trainfile', train_script)
train_working = os.path.join('/kaggle/working', train_script)
shutil.copy(train_input, train_working)

In [None]:
!WANDB_MODE="dryrun"  python train.py --img 640 --batch 16 --epochs 100 --data /kaggle/working/custom_data/custom.yaml --weights yolov5s.pt --hyp /kaggle/working/data/hyp.scratch.yaml

#!WANDB_MODE="dryrun"  python train.py --batch 16 --epochs 100 --data /kaggle/working/custom_data/custom.yaml --weights /kaggle/input/weight/best_fold0_stratch.pt --hyp /kaggle/working/data/hyp.finetune.yaml



In [None]:
from IPython.display import FileLink
# Show a link to the best.pt weights under runs/train/exp.
# NOTE(review): assumes the training run wrote to `exp`; confirm the run directory.
FileLink(r'runs/train/exp/weights/best.pt')

In [None]:
import matplotlib.pyplot as plt
# Show the results.png found under runs/train/exp at a large size, without axes.
plt.figure(figsize=(30,15))
plt.axis('off')
# The trailing semicolon suppresses the notebook's textual echo of the return value.
plt.imshow(plt.imread('runs/train/exp/results.png'));