# YOLO TRAINING (v5 - v8)

## IMPORTS

In [None]:
import numpy as np
from glob import glob
import shutil, os
from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm
import pandas as pd
import yaml
import torch
import matplotlib.pyplot as plt

## CONFIG VARIABLES

#### Project Config:

- root_dir :
    - train.csv
    - test.csv
    - train_meta.csv
    - kaggle.json

In [None]:
# Base directory: holds the CSV files and receives every generated artifact.
ROOT_DIR = '/kaggle/'

# Folders containing the resized PNG images.
TRAIN_DIR = f'{ROOT_DIR}data/train/'
TEST_DIR = f'{ROOT_DIR}data/test/'

# Annotation and metadata tables.
TRAIN_CSV = f'{ROOT_DIR}train.csv'
TRAIN_META_CSV = f'{ROOT_DIR}train_meta.csv'
TEST_CSV = f'{ROOT_DIR}test.csv'

# YOLO generation used by the setup / train / predict helpers: 'v5' or 'v8'.
MODEL = 'v5'
# MODEL = 'v8'  # uncomment to switch to YOLOv8

## /!\ TO TRIGGER IF RUNNING ON KAGGLE ONLY /!\

In [None]:
def is_kaggle():
    """
    Copy the project files from a Kaggle input dataset into the parent directory.

    Despite its name this function does not detect Kaggle: it unconditionally
    moves one directory up and copies the files below from `input/data-ai/`
    (a private Kaggle dataset of the original author):
    - train.csv
    - test.csv
    - train_meta.csv
    - kaggle.json
    You can use your own data or download from Kaggle, just change the paths.

    WARNING: only run this on Kaggle, and only once per session, because the
    working directory is changed relative to the current one.
    :return: None
    """
    # NOTE(review): presumably the notebook starts in /kaggle/working so that '..' is ROOT_DIR - confirm.
    os.chdir('..')
    !cp input/data-ai/kaggle.json .
    !cp input/data-ai/test.csv .
    !cp input/data-ai/train.csv .
    !cp input/data-ai/train_meta.csv .
    # Show where we ended up and what was copied.
    !pwd
    !ls

is_kaggle()

## FUNCTIONS

#### DOWNLOAD DATA

In [None]:
def download_data(image_size=512):
    """
    Download the resized VinBigData chest X-ray PNGs from Kaggle and unzip them into `data/`.

    Requires `kaggle.json` (API credentials) in the current working directory.
    An unsupported size only prints a message; the file counts are still printed afterwards.
    :param image_size: 256, 512 or 1024
    :return: None
    """
    # Install the Kaggle CLI and register the API token with owner-only permissions.
    !pip install -q kaggle
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    # Each supported size maps to a different Kaggle dataset.
    if image_size == 256:
        !kaggle datasets download -d xhlulu/vinbigdata-chest-xray-resized-png-256x256
        !unzip vinbigdata-chest-xray-resized-png-256x256.zip -d data
    elif image_size == 512:
        !kaggle datasets download -d xhlulu/vinbigdata
        !unzip vinbigdata.zip -d data
    elif image_size == 1024:
        !kaggle datasets download -d xhlulu/vinbigdata-chest-xray-resized-png-1024x1024
        !unzip vinbigdata-chest-xray-resized-png-1024x1024.zip -d data
    else:
        print("Image size not supported")

    # Sanity check: count the PNG files found in TRAIN_DIR and TEST_DIR.
    print(f"Number of train files: {len(glob(TRAIN_DIR+'*.png'))}")
    print(f"Number of test files: {len(glob(TEST_DIR+'*.png'))}")

#### DATA PREPARATION

In [None]:
def add_metadata(df_train, df_meta):
    """
    Attach the original image height and width to every annotation row (in place).

    The lookup is vectorised: one pass over `df_meta` builds an `image_id`
    index, then `Series.map` fills both columns, instead of scanning
    `df_meta` once per annotation row.

    :param df_train: annotations with an `image_id` column; gains float
                     columns `height` and `width`
    :param df_meta: one row per image with columns `image_id`, `dim0`
                    (written to `height`) and `dim1` (written to `width`);
                    if an id is duplicated, the first row wins
    :return: None
    :raises IndexError: if an `image_id` of `df_train` is absent from `df_meta`
    """
    # "First match wins", and a unique index is required by Series.map.
    meta = df_meta.drop_duplicates(subset='image_id').set_index('image_id')
    image_ids = df_train['image_id']

    missing = ~image_ids.isin(meta.index)
    if missing.any():
        examples = list(image_ids[missing].unique()[:5])
        # IndexError is kept so existing callers see the same exception type as before.
        raise IndexError(f"image_id(s) missing from metadata, e.g. {examples}")

    # Cast to float so the column dtype does not depend on the metadata dtype.
    df_train['height'] = image_ids.map(meta['dim0']).astype(float)
    df_train['width'] = image_ids.map(meta['dim1']).astype(float)

In [None]:
def normalize_box(df_train):
    """
    Convert pixel bounding boxes to normalised YOLO quantities (in place).

    `x_min`/`x_max` are divided by `width` and `y_min`/`y_max` by `height`;
    the centre (`x_mid`, `y_mid`), size (`w`, `h`) and `area` columns are
    then derived from the normalised corners.

    Whole-column arithmetic replaces the previous row-wise `apply` calls:
    the values are the same, it is much faster, and it also works on an
    empty frame.

    :param df_train: annotations with columns `x_min`, `y_min`, `x_max`,
                     `y_max`, `width` and `height`
    :return: None
    """
    width = df_train['width']
    height = df_train['height']

    # Normalise the corners by the image dimensions.
    df_train['x_min'] = df_train['x_min'] / width
    df_train['y_min'] = df_train['y_min'] / height
    df_train['x_max'] = df_train['x_max'] / width
    df_train['y_max'] = df_train['y_max'] / height

    # Box centre.
    df_train['x_mid'] = (df_train['x_max'] + df_train['x_min']) / 2
    df_train['y_mid'] = (df_train['y_max'] + df_train['y_min']) / 2

    # Box size and area, all in normalised units.
    df_train['w'] = df_train['x_max'] - df_train['x_min']
    df_train['h'] = df_train['y_max'] - df_train['y_min']
    df_train['area'] = df_train['w'] * df_train['h']

In [None]:
def add_image_path(df_train):
    """
    Add an `image_path` column (in place): TRAIN_DIR + image_id + '.png'.

    :param df_train: annotations with an `image_id` column
    :return: None
    """
    prefixed = TRAIN_DIR + df_train['image_id']
    df_train['image_path'] = prefixed + '.png'

In [None]:
def get_fold_split(df_train, n_splits=5, val_fold=None):
    """
    Assign a GroupKFold fold to every annotation and return one train/val split.

    Rows are grouped by `image_id`, so all boxes of an image land in the same
    fold. A `fold` column is written to `df_train` in place.

    :param df_train: annotations with `image_id` and `image_path` columns
    :param n_splits: number of folds
    :param val_fold: fold used for validation; defaults to the last fold
                     (`n_splits - 1`), which is what this function always used
    :return: (train_files, val_files), two lists of unique image paths
    :raises ValueError: if `val_fold` is outside `[0, n_splits)`
    """
    if val_fold is None:
        val_fold = n_splits - 1
    if not 0 <= val_fold < n_splits:
        raise ValueError(f"val_fold must be in [0, {n_splits}), got {val_fold}")

    gkf = GroupKFold(n_splits=n_splits)
    df_train['fold'] = -1
    fold_col = df_train.columns.get_loc('fold')
    groups = df_train['image_id'].tolist()
    for fold, (_, val_idx) in enumerate(gkf.split(df_train, groups=groups)):
        # split() yields positional indices, so assign with iloc: this stays
        # correct even when df_train does not have a default RangeIndex.
        df_train.iloc[val_idx, fold_col] = fold

    is_val = df_train['fold'] == val_fold
    val_files = list(df_train.loc[is_val, 'image_path'].unique())
    train_files = list(df_train.loc[~is_val, 'image_path'].unique())
    print(f"Train: {len(train_files)} - Val: {len(val_files)}")
    return train_files, val_files

In [None]:
def get_classes(df_train):
    """
    Return the class names ordered by their class id.

    :param df_train: annotations with `class_id` and `class_name` columns
    :return: list of `str`, one entry per distinct (class_id, class_name) pair
    """
    unique_pairs = set(zip(df_train.class_id, df_train.class_name))
    ids, names = zip(*unique_pairs)
    order = np.argsort(ids)
    names_by_position = np.array(names)
    return [str(names_by_position[position]) for position in order]

In [None]:
def get_features(df_train):
    """
    Split the annotations into box features and class-id target.

    :param df_train: annotations containing the normalised box columns and `class_id`
    :return: (X, y), the feature DataFrame and the `class_id` Series
    """
    feature_columns = [
        'x_min', 'y_min', 'x_max', 'y_max',
        'x_mid', 'y_mid',
        'w', 'h', 'area',
    ]
    y = df_train['class_id']
    X = df_train[feature_columns]
    x_shape, y_shape = X.shape, y.shape
    print(f'X shape: {x_shape} - y shape: {y_shape}')
    return X, y

In [None]:
def generate_labels(df_train):
    """
    Write one YOLO label file per image into ROOT_DIR + 'raw_labels/'.

    Each file is named after the image file stem and holds one line per box:
    `class_id x_mid y_mid w h`.

    Files are written once per image in 'w' mode. The previous per-row 'a+'
    append duplicated every label when the cell was run again, and reopened
    the file for every annotation.

    :param df_train: annotations with `image_path`, `class_id`, `x_mid`,
                     `y_mid`, `w` and `h` columns
    :return: None
    """
    os.makedirs(ROOT_DIR + 'raw_labels', exist_ok=True)

    # File stem of each image, e.g. '/x/abc.png' -> 'abc'.
    stems = df_train['image_path'].map(lambda p: p.split('/')[-1].split('.')[0])
    label_columns = ['class_id', 'x_mid', 'y_mid', 'w', 'h']

    for stem, boxes in tqdm(df_train.groupby(stems, sort=False)):
        lines = [
            f"{box.class_id} {box.x_mid} {box.y_mid} {box.w} {box.h}\n"
            for box in boxes[label_columns].itertuples(index=False)
        ]
        with open(ROOT_DIR + 'raw_labels/' + stem + '.txt', 'w') as f:
            f.writelines(lines)

In [None]:
def copy_files(train_files, val_files, label_dir='raw_labels/'):
    """
    Copy images and their label files into the train/val layout under ROOT_DIR.

    Creates `labels/{train,val}` and `images/{train,val}` and, for every image
    path, copies the image plus the label file `<stem>.txt` found in
    ROOT_DIR + label_dir.

    :param train_files: image paths of the training split
    :param val_files: image paths of the validation split
    :param label_dir: folder (relative to ROOT_DIR) holding the label files
    :return: None
    """
    for kind in ('labels', 'images'):
        for split in ('train', 'val'):
            os.makedirs(ROOT_DIR + f'{kind}/{split}', exist_ok=True)

    label_dir = ROOT_DIR + label_dir

    for split, image_files in (('train', train_files), ('val', val_files)):
        image_dst = ROOT_DIR + 'images/' + split
        label_dst = ROOT_DIR + 'labels/' + split
        for image_file in tqdm(image_files):
            shutil.copy(image_file, image_dst)
            stem = image_file.split('/')[-1].split('.')[0]
            shutil.copy(os.path.join(label_dir, stem + '.txt'), label_dst)

In [None]:
def generate_yaml(classes):
    """
    Write train.txt, val.txt and project.yaml (the YOLO dataset config) into ROOT_DIR.

    train.txt / val.txt list the image paths found in ROOT_DIR + 'images/train'
    and ROOT_DIR + 'images/val'. project.yaml points at both lists and declares
    the class count and names.

    Fixes over the previous version: every file is closed before the next one
    is opened (val.txt and the YAML dump used to be nested inside the
    train.txt `with` block), a stray never-closed `open()` is removed, and
    `nc` is derived from `classes` instead of being hard-coded to 14.

    :param classes: class names ordered by class id
    :return: None
    """
    for split in ('train', 'val'):
        with open(ROOT_DIR + split + '.txt', 'w') as listing:
            # Sorted so the listing is reproducible between runs.
            for path in sorted(glob(ROOT_DIR + 'images/' + split + '/*')):
                listing.write(path + '\n')

    data = dict(
        train=ROOT_DIR + 'train.txt',
        val=ROOT_DIR + 'val.txt',
        nc=len(classes),
        names=list(classes),  # plain list so YAML emits a simple sequence
    )

    with open(ROOT_DIR + 'project.yaml', 'w') as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

In [None]:
def data_preparation():
    """
    Run the full data pipeline: CSVs -> YOLO labels, train/val folders and project.yaml.

    Steps, in order: load the annotations and drop class 14 ("No finding"),
    add image dimensions, normalise the boxes, add image paths, split into
    folds, write the label files, copy images and labels into the train/val
    layout, then write project.yaml with train.txt and val.txt.

    :return: None
    """
    annotations = pd.read_csv(TRAIN_CSV)
    # Class 14 is "No finding": it has no box to learn from.
    has_finding = annotations['class_id'] != 14
    annotations = annotations[has_finding].reset_index(drop=True)
    image_meta = pd.read_csv(TRAIN_META_CSV)

    print('Add metadata...')
    add_metadata(annotations, image_meta)

    print('Normalize box coordinates...')
    normalize_box(annotations)
    add_image_path(annotations)

    # Default split: 5 folds.
    train_files, val_files = get_fold_split(annotations)

    print('Generate labels...')
    generate_labels(annotations)

    print('Copy files...')
    copy_files(train_files, val_files)

    class_names = get_classes(annotations)
    generate_yaml(class_names)

#### YOLO

In [None]:
def setup_yolo(version=MODEL):
    """
    Clone the requested YOLO repository, change into it and install its dependencies.

    Side effect: the working directory becomes the cloned repository, so the
    relative `runs/...` paths used by the other helpers resolve inside it.
    Any other `version` value skips the setup and only prints the final message.
    :param version: 'v5' (ultralytics/yolov5) or 'v8' (ultralytics/ultralytics)
    :return: None
    """
    if version == 'v5':
        !git clone https://github.com/ultralytics/yolov5  # clone repo
        os.chdir('yolov5')
        !pip install -qr requirements.txt
    elif version == 'v8':
        !git clone https://github.com/ultralytics/ultralytics  # clone repo
        os.chdir('ultralytics')
        !pip install -qr requirements.txt # install dependencies
        !python setup.py install
    # Report the torch version and the first CUDA device, or 'CPU' when CUDA is unavailable.
    print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

In [None]:
def train(version=MODEL, batch=16, epochs=30):
    """
    Train YOLO at image size 640 on the dataset described by ROOT_DIR/project.yaml.

    Uses the `yolov5x.pt` (v5) or `yolov8x.pt` (v8) weights with caching enabled.
    Any other `version` value does nothing.
    :param version: 'v5' or 'v8'
    :param batch: batch size passed to the training command
    :param epochs: number of epochs passed to the training command
    :return: None
    """
    # $name is IPython shell interpolation of the Python variable `name`.
    if version == 'v5':
        !python train.py --img 640 --batch $batch --epochs $epochs --data $ROOT_DIR/project.yaml --weights yolov5x.pt --cache
    elif version == 'v8':
        !yolo train imgsz=640 batch=$batch epochs=$epochs data=$ROOT_DIR/project.yaml model=yolov8x.pt pretrained=True cache

In [None]:
def predict(version=MODEL, iou=0.4, conf=0.01):
    """
    Run inference on the images in TEST_DIR with the trained `best.pt` weights.

    Predictions are saved as text files including confidences. The weights are
    read from `runs/train/exp/weights/` (v5) or `runs/detect/train/weights/` (v8).
    Any other `version` value does nothing.
    :param version: 'v5' or 'v8'
    :param iou: IoU value passed to the detector
    :param conf: confidence value passed to the detector
    :return: None
    """
    # $name is IPython shell interpolation of the Python variable `name`.
    if version == 'v5':
        !python detect.py --weights runs/train/exp/weights/best.pt --img 640 --conf $conf --iou $iou --source $TEST_DIR --save-txt --save-conf --exist-ok
    elif version == 'v8':
        !yolo predict model='runs/detect/train/weights/best.pt' imgsz=640 conf=$conf iou=$iou source=$TEST_DIR save_txt=True save_conf=True exist_ok=True save=True

#### PLOTS

In [None]:
def plot_class_distribution(path):
    """
    Show `labels_correlogram.jpg` and `labels.jpg` from a training run folder.

    :param path: run folder, with trailing slash
    :return: None
    """
    for name in ('labels_correlogram', 'labels'):
        plt.figure(figsize=(20, 20))
        plt.axis('off')
        plt.title(name)
        image = plt.imread(path + name + '.jpg')
        plt.imshow(image)

In [None]:
def plot_batch_train(path):
    """
    Show the first three training batch mosaics (`train_batch0..2.jpg`).

    :param path: run folder, with trailing slash
    :return: None
    """
    for batch_index in range(3):
        plt.figure(figsize=(15, 15))
        mosaic = plt.imread(path + f'train_batch{batch_index}.jpg')
        plt.imshow(mosaic)

In [None]:
def plot_batch_val(path):
    """
    Show labels (left) next to predictions (right) for validation batches 0-2.

    :param path: run folder, with trailing slash
    :return: None
    """
    n_batches = 3
    # (file suffix, title prefix) for each column of the grid.
    panels = (('labels', 'Label'), ('pred', 'Predict'))
    _, axes = plt.subplots(n_batches, len(panels), figsize=(2 * 5, 3 * 5), constrained_layout=True)

    for batch_index in range(n_batches):
        for column, (suffix, caption) in enumerate(panels):
            panel = axes[batch_index][column]
            panel.imshow(plt.imread(f'{path}val_batch{batch_index}_{suffix}.jpg'))
            panel.set_xticks([])
            panel.set_yticks([])
            panel.set_title(f'{caption} {batch_index}', fontsize=12)

In [None]:
def plot_curves(path):
    """
    Show the F1, PR, P and R curves and the results summary of a training run.

    :param path: run folder, with trailing slash
    :return: None
    """
    curve_files = (
        'F1_curve.png',
        'PR_curve.png',
        'P_curve.png',
        'R_curve.png',
        'results.png',
    )
    for file_name in curve_files:
        plt.figure(figsize=(15, 15))
        plt.imshow(plt.imread(path + file_name))

In [None]:
def plot_confusion_matrix(path):
    """
    Show `confusion_matrix.png` from a training run folder.

    :param path: run folder, with trailing slash
    :return: None
    """
    plt.figure(figsize=(15, 15))
    matrix_image = plt.imread(path + 'confusion_matrix.png')
    plt.imshow(matrix_image)

In [None]:
def plot_results(version=MODEL):
    """
    Display every training diagnostic plot for the given YOLO version.

    Reads from `runs/train/exp/` (v5) or `runs/detect/train/` (v8). Any other
    value prints 'Wrong version' and returns without plotting.

    :param version: 'v5' or 'v8'
    :return: None
    """
    if version not in ('v5', 'v8'):
        print('Wrong version')
        return

    path = 'runs/train/exp/' if version == 'v5' else 'runs/detect/train/'

    plotters = (
        plot_class_distribution,
        plot_batch_train,
        plot_batch_val,
        plot_curves,
        plot_confusion_matrix,
    )
    for plotter in plotters:
        plotter(path)

#### Create submission

In [None]:
def yolo2voc(image_height, image_width, bboxes):
    """
    Convert normalised YOLO boxes to absolute Pascal VOC corner boxes.

    yolo => [xmid, ymid, w, h] (normalized)
    voc  => [x1, y1, x2, y2]

    :param image_height: image height in pixels
    :param image_width: image width in pixels
    :param bboxes: array-like whose last axis is [xmid, ymid, w, h]; ndarrays
                   of any dtype and plain nested lists are accepted, and the
                   input is never modified
    :return: new float ndarray of the same shape holding [x1, y1, x2, y2]
    """
    # One float copy: protects the caller's data and avoids integer truncation.
    boxes = np.array(bboxes, dtype=float)

    # Scale x-like entries by the width and y-like entries by the height.
    boxes[..., 0:4] *= np.array(
        [image_width, image_height, image_width, image_height], dtype=float
    )

    # Centre -> top-left corner, then bottom-right = top-left + size.
    boxes[..., 0:2] -= boxes[..., 2:4] / 2
    boxes[..., 2:4] += boxes[..., 0:2]

    return boxes

In [None]:
def generate_submission(version=MODEL):
    """
    Build the competition CSV (ROOT_DIR + 'yolo.csv') from the saved YOLO predictions.

    Each `labels/*.txt` file in the prediction folder holds rows of
    `class xmid ymid w h conf` (normalised). They are converted to
    `class conf x1 y1 x2 y2` in pixels and joined into one prediction string
    per image. Test images without a prediction file get the placeholder
    "14 1 0 0 1 1".

    Changes over the previous version: label files are closed after reading,
    parsing tolerates repeated whitespace and trailing blank lines, and the
    placeholder is applied to the `PredictionString` column only.

    :param version: 'v5' (reads `runs/detect/exp/`) or 'v8' (reads
                    `runs/detect/predict/`); anything else prints a message
                    and returns
    :return: None
    """
    if version == "v5":
        directory = 'runs/detect/exp/'
    elif version == "v8":
        directory = 'runs/detect/predict/'
    else:
        print("version not found")
        return

    df_test = pd.read_csv(TEST_CSV)
    image_ids = []
    PredictionStrings = []

    for file_path in tqdm(glob(directory + 'labels/*txt')):
        image_id = file_path.split('/')[-1].split('.')[0]
        w, h = df_test.loc[df_test.image_id == image_id, ['width', 'height']].values[0]

        with open(file_path, 'r') as f:
            values = f.read().split()
        data = np.array(values).astype(np.float32).reshape(-1, 6)
        # Reorder [class, x, y, w, h, conf] -> [class, conf, x, y, w, h].
        data = data[:, [0, 5, 1, 2, 3, 4]]

        voc_boxes = np.round(yolo2voc(h, w, data[:, 2:]))
        # NOTE(review): the confidence is rounded to one decimal, as before; this
        # makes many low scores tie - confirm this precision is intended.
        flat = np.round(np.concatenate((data[:, :2], voc_boxes), axis=1).reshape(-1), 1).astype(str)
        # Position 1 of every 6-tuple is the confidence and stays a float
        # string; the class id and coordinates are written as integers.
        tokens = [
            token if idx % 6 == 1 else str(int(float(token)))
            for idx, token in enumerate(flat)
        ]

        image_ids.append(image_id)
        PredictionStrings.append(' '.join(tokens))

    pred_df = pd.DataFrame({'image_id': image_ids,
                            'PredictionString': PredictionStrings})
    merged = pd.merge(df_test, pred_df, on='image_id', how='left')
    sub_df = merged[['image_id', 'PredictionString']].fillna({'PredictionString': "14 1 0 0 1 1"})
    sub_df.to_csv(ROOT_DIR + 'yolo.csv', index=False)
    display(sub_df.head())
    print(f'Prediction saved at {ROOT_DIR}yolo.csv')

In [None]:
def zip_yolo_results(version=MODEL):
    """
    Archive the YOLO run outputs into ROOT_DIR/results.zip.

    For 'v5' the whole `runs/` folder is zipped, for 'v8' only `runs/detect/`.
    Any other value prints 'Wrong version' and returns.
    :param version: 'v5' or 'v8'
    :return: None
    """
    # Paths are relative to the current working directory.
    if version=='v5':
        !zip -r $ROOT_DIR/results.zip runs/
    elif version=='v8':
        !zip -r $ROOT_DIR/results.zip runs/detect/
    else:
        print('Wrong version')
        return

## RUN

#### DOWNLOAD DATA

In [None]:
# Supported sizes are 256, 512 and 1024; 512 downloads the `xhlulu/vinbigdata` dataset into `data/`.
download_data(image_size=512)

#### DATA PREPARATION

In [None]:
# Writes the labels, the images/labels train-val folders and project.yaml under ROOT_DIR.
data_preparation()

#### SETUP YOLO

In [None]:
# Changes the working directory into the cloned repository; the relative `runs/` paths used later depend on it.
setup_yolo(version=MODEL)

#### TRAIN YOLO ON DATA

In [None]:
# Trains at image size 640 using ROOT_DIR/project.yaml.
train(version=MODEL, batch=16, epochs=30)

#### PLOT RESULTS OF TRAINING

In [None]:
# Reads the plots from `runs/train/exp/` (v5) or `runs/detect/train/` (v8).
plot_results(version=MODEL)

#### PREDICTION ON TEST DATA

In [None]:
# Runs the trained `best.pt` weights on TEST_DIR and saves the predictions as text files with confidences.
predict(version=MODEL, iou=0.4, conf=0.01)

#### GENERATE SUBMISSION FOR COMPETITION

In [None]:
# Converts the saved prediction files into ROOT_DIR + 'yolo.csv'.
generate_submission(version=MODEL)

#### ZIP RESULTS

In [None]:
# Archives the run outputs into ROOT_DIR/results.zip.
zip_yolo_results(version=MODEL)