In [None]:
# Fetch the YOLOv5 source tree into ./yolov5; later cells cd into it and run
# its train.py / detect.py scripts.
!git clone https://github.com/ultralytics/yolov5.git

In [None]:
import pandas as pd
import os
import numpy as np
import shutil
import yaml
import matplotlib.pyplot as plt
import random
import cv2

from sklearn import model_selection
from tqdm import tqdm
from glob import glob

In [None]:
# Side length used in the name of the resized-image dataset under ../input.
size = 512

# Working directories that receive the copied labels / images.
TRAIN_LABELS_PATH = './vinbigdata/labels/train'
VAL_LABELS_PATH = './vinbigdata/labels/val'
TRAIN_IMAGES_PATH = './vinbigdata/images/train'  # author's note: 12000
VAL_IMAGES_PATH = './vinbigdata/images/val'  # author's note: 3000
External_DIR = f'../input/vinbigdata-{size}-image-dataset/vinbigdata/train'  # author's note: 15000

# Create every output directory up front; exist_ok keeps the cell re-runnable.
for _output_dir in (TRAIN_LABELS_PATH, VAL_LABELS_PATH, TRAIN_IMAGES_PATH, VAL_IMAGES_PATH):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Compare the number of annotation rows, raw images and pre-made label files.
original_df = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
number_of_imageids = original_df['image_id'].shape[0]
print(f'Total number of image_ids (train + validation) {number_of_imageids}')

number_of_images = len(
    os.listdir('../input/vinbigdata-chest-xray-abnormalities-detection/train')
)
print(f'Total number of images (train + validation) {number_of_images}')

number_of_labels = len(
    os.listdir('../input/vinbigdata-yolo-labels-dataset/labels')
)
print(f'Total number of labels (train + validation) {number_of_labels}')

In [None]:
# Load the annotation table that ships with the resized-image dataset.
df = pd.read_csv(f'../input/vinbigdata-{size}-image-dataset/vinbigdata/train.csv')
number_of_images = len(df)
print(f'Total number of image ids (train + validation) {number_of_images}')

# Rows with class_id 14 are discarded (the message below calls them "normal images").
df = df.loc[df['class_id'] != 14].reset_index(drop=True)
number_of_images = len(df)
print(f'Total number of image ids after dropping normal images (train + validation) {number_of_images}')

df.head()

In [None]:
# Only the image ids are needed from here on; the YOLO label files are pre-made.
df = df.drop(
    ['class_name', 'rad_id', 'x_min', 'x_max', 'y_min', 'y_max', 'width', 'height', 'class_id'],
    axis='columns',
)
df.head()

In [None]:
# Split on *unique* image ids. `df` holds one row per annotation, so the same
# image_id appears many times; splitting the raw rows can put the same image in
# both the training and the validation set (leakage), because the copy step
# later works on `image_id.unique()` of each part.
df_train, df_valid = model_selection.train_test_split(
    df.drop_duplicates(subset='image_id'),
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

In [None]:
# Row counts of each split (one row per entry in the image_id column).
number_of_images = len(df_train)
print(f'Total number of training image_ids {number_of_images}')

number_of_images = len(df_valid)
print(f'Total number of validation image_ids {number_of_images}')


In [None]:
# Image ids can repeat (one row per annotation), so count unique ids: the number of label files should equal the number of unique image ids.

In [None]:
# Number of distinct images in each split.
print(f"Total number of training images {df_train['image_id'].unique().shape[0]}")
print(f"Total number of validation images {df_valid['image_id'].unique().shape[0]}")

In [None]:
def preproccess_data(df, labels_path, images_path, labels_src=None, images_src=None):
    """Copy the label file and PNG image of every unique image id in ``df``.

    Args:
        df: DataFrame with an ``image_id`` column; duplicates are ignored.
        labels_path: destination directory for the ``<image_id>.txt`` label files.
        images_path: destination directory for the ``<image_id>.png`` images.
        labels_src: directory holding the pre-made label files. Defaults to the
            ``vinbigdata-yolo-labels-dataset`` input directory.
        images_src: directory holding the resized PNG images. Defaults to the
            ``vinbigdata-{size}-image-dataset`` train directory, resolved at call
            time from the notebook-level ``size``.

    Raises:
        FileNotFoundError: if a label or image file for an id is missing.
    """
    if labels_src is None:
        labels_src = '../input/vinbigdata-yolo-labels-dataset/labels'
    if images_src is None:
        images_src = f'/kaggle/input/vinbigdata-{size}-image-dataset/vinbigdata/train'

    # shutil.copy treats a non-existent destination as a *file name*, so without
    # these directories every copy would silently overwrite one single file.
    os.makedirs(labels_path, exist_ok=True)
    os.makedirs(images_path, exist_ok=True)

    for img_id in tqdm(df.image_id.unique()):
        shutil.copy(os.path.join(labels_src, f"{img_id}.txt"), labels_path)
        shutil.copy(os.path.join(images_src, f"{img_id}.png"), images_path)

In [None]:
# Populate the working train / val directories with the labels and images of each split.
preproccess_data(df_train, TRAIN_LABELS_PATH, TRAIN_IMAGES_PATH)
preproccess_data(df_valid, VAL_LABELS_PATH, VAL_IMAGES_PATH)

In [None]:
# Sanity check: print the file count of each output directory
# (train labels, train images, val labels, val images — in that order).
for _checked_dir in (TRAIN_LABELS_PATH, TRAIN_IMAGES_PATH, VAL_LABELS_PATH, VAL_IMAGES_PATH):
    print(len(os.listdir(_checked_dir)))

In [None]:
# credit / source https://www.kaggle.com/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-train
# Class names, in the order used for the `names` entry of the dataset YAML.
classes = [
    'Aortic enlargement',
    'Atelectasis',
    'Calcification',
    'Cardiomegaly',
    'Consolidation',
    'ILD',
    'Infiltration',
    'Lung Opacity',
    'Nodule/Mass',
    'Other lesion',
    'Pleural effusion',
    'Pleural thickening',
    'Pneumothorax',
    'Pulmonary fibrosis',
]

# Dataset description handed to train.py via --data. The image paths are
# relative: later cells cd into ./yolov5 before training, hence the leading '../'.
data = {
    'train': '../vinbigdata/images/train',
    'val': '../vinbigdata/images/val',
    'nc': len(classes),  # 14 classes
    'names': classes,
}

with open('./yolov5/vinbigdata.yaml', 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# To inspect the result, read './yolov5/vinbigdata.yaml' back and print it.

In [None]:
%cd ./yolov5
!pip install -q -U -r requirements.txt
!pip install -q pycocotools>=2.0 seaborn>=0.11.0 thop

In [None]:
# Alternative configuration (disabled): 1024 px input, batch 2, 25 epochs, with --cache.
# !WANDB_MODE="dryrun" python train.py --img 1024 --batch 2 --epochs 25 --data ./vinbigdata.yaml --cfg models/yolov5x.yaml --weights yolov5x.pt --cache
# Author's note: setting --cache makes the run go out of memory; the maximum batch size is 4 at 1024 px.
# Active run: yolov5x at 640 px, batch 16, 30 epochs, using the vinbigdata.yaml written earlier.
!WANDB_MODE="dryrun" python train.py --img 640 --batch 16 --epochs 30 --data ./vinbigdata.yaml --cfg models/yolov5x.yaml --weights yolov5x.pt

In [None]:
# Test-set metadata; the submission step later reads its image_id, width and height columns.
test_df = pd.read_csv(f'/kaggle/input/vinbigdata-{size}-image-dataset/vinbigdata/test.csv')

In [None]:
# Inputs for detect.py: the test image directory and the trained checkpoint.
# Despite its name, weights_dir is the path of the best.pt file, not a directory.
test_dir = f'/kaggle/input/vinbigdata-{size}-image-dataset/vinbigdata/test'
weights_dir = './runs/train/exp/weights/best.pt'
# List the weights directory to confirm which checkpoints are present.
os.listdir('./runs/train/exp/weights')

In [None]:
# Run YOLOv5 inference on the test images with the trained weights.
# --save-txt / --save-conf write per-image label files, which are read back from
# runs/detect/exp/labels later in the notebook.
# NOTE(review): --exist-ok presumably reuses runs/detect/exp instead of creating
# exp2, exp3, ... on re-runs — confirm against detect.py.
!python detect.py --weights $weights_dir\
--img 640\
--conf 0.005\
--iou 0.45\
--source $test_dir\
--save-txt --save-conf --exist-ok

In [None]:
# credit / source https://www.kaggle.com/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-infer
def yolo2voc(image_height, image_width, bboxes):
    """Convert normalized YOLO boxes to absolute corner (VOC-style) boxes.

    yolo => [xmid, ymid, w, h] (normalized to the image size)
    voc  => [x1, y1, x2, y2]   (same units as image_width / image_height)

    Args:
        image_height: image height used to scale the y centre and box height.
        image_width: image width used to scale the x centre and box width.
        bboxes: numpy array whose last axis holds [xmid, ymid, w, h].

    Returns:
        A new float array of the same shape; the input array is not modified.
    """
    # Work on a float copy: an integer input array would otherwise truncate
    # the scaled coordinates.
    boxes = bboxes.copy().astype(float)

    # De-normalize: x centre and width by the image width, y centre and height
    # by the image height.
    boxes[..., [0, 2]] *= image_width
    boxes[..., [1, 3]] *= image_height

    # Top-left corner = centre - half the box size.
    half_extent = boxes[..., [2, 3]] / 2
    boxes[..., [0, 1]] -= half_extent
    # Bottom-right corner = top-left corner + box size.
    boxes[..., [2, 3]] += boxes[..., [0, 1]]

    return boxes

In [None]:
# Number of label files written by detect.py, i.e. files matching runs/detect/exp/labels/*txt.
len(glob('runs/detect/exp/labels/*txt'))

In [None]:
# credit / source https://www.kaggle.com/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-infer
# Module-level accumulators, refilled by every call to process_submission().
image_ids = []
PredictionStrings = []

def process_submission():
    """Build the submission CSV from the detect.py label files.

    Every ``runs/detect/exp/labels/*txt`` file is parsed as rows of six numbers.
    The sixth value is moved next to the first, the four box values are converted
    with ``yolo2voc`` using the image's width/height from ``test_df``, and the
    rows are flattened into one space-separated prediction string. Images without
    a label file get the default string ``"14 1 0 0 1 1"``.

    Side effects:
        Refills the module-level ``image_ids`` / ``PredictionStrings`` lists and
        writes ``/kaggle/working/submission_3.csv``.

    Returns:
        The submission DataFrame with columns ``image_id`` and ``PredictionString``.
    """
    # Start from a clean slate: without this, re-running the cell appends a second
    # copy of every prediction and the merge below yields duplicated rows.
    image_ids.clear()
    PredictionStrings.clear()

    for file_path in tqdm(glob('runs/detect/exp/labels/*txt')):
        image_id = file_path.split('/')[-1].split('.')[0]  # file name without extension
        # Width and height of this image, as recorded in the test dataframe.
        w, h = test_df.loc[test_df.image_id == image_id, ['width', 'height']].values[0]

        # Context manager so the handle is closed even if parsing fails.
        with open(file_path, 'r') as f:
            tokens = f.read().split()
        if not tokens:
            # Empty label file: leave the image to the default string below.
            continue

        # One row per detection, six values each.
        # NOTE(review): presumably [class, xmid, ymid, w, h, conf] as written by
        # detect.py --save-txt --save-conf — confirm against detect.py.
        data = np.array(tokens).astype(np.float32).reshape(-1, 6)
        data = data[:, [0, 5, 1, 2, 3, 4]]  # move the sixth value next to the first

        corners = np.round(yolo2voc(h, w, data[:, 2:]))
        flat = np.round(np.concatenate((data[:, :2], corners), axis=1).reshape(-1), 1)
        # Position 1 of every group of six keeps its (one-decimal) float text;
        # every other value is written as an integer.
        bboxes = [
            value if idx % 6 == 1 else str(int(float(value)))
            for idx, value in enumerate(flat.astype(str))
        ]

        image_ids.append(image_id)
        PredictionStrings.append(' '.join(bboxes))

    # credit / source: https://www.kaggle.com/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-infer
    pred_df = pd.DataFrame({'image_id': image_ids,
                            'PredictionString': PredictionStrings})
    sub_df = pd.merge(test_df, pred_df, on='image_id', how='left').fillna("14 1 0 0 1 1")
    sub_df = sub_df[['image_id', 'PredictionString']]
    sub_df.to_csv('/kaggle/working/submission_3.csv', index=False)
    # Return the frame so the calling cell can display it (a bare `.tail()`
    # inside a function is discarded).
    return sub_df

In [None]:
# !pip uninstall pandas
# NOTE(review): pandas is already imported at the top of the notebook, so this pin
# presumably only takes effect after a kernel restart — confirm it is still required.
!pip install -q pandas==1.1.5

In [None]:
# Build /kaggle/working/submission_3.csv from the detect.py label files.
process_submission()

In [None]:
# Images found directly in the detect.py output directory.
files = glob('runs/detect/exp/*png')
from mpl_toolkits.axes_grid1 import ImageGrid

def plot_sample_images():
    """Show three figures, each a 4x4 grid of randomly sampled result images.

    Images are drawn from the module-level ``files`` list. When fewer than 16
    images are available, every figure shows as many as exist; when there are
    none, a message is printed and nothing is drawn.
    """
    row = 4
    col = 4

    if not files:
        print('No images found in runs/detect/exp - nothing to plot')
        return

    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(len(files), row * col)

    for _ in range(3):
        grid_files = random.sample(files, sample_size)
        images = []
        for image_path in tqdm(grid_files):
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread returns None for unreadable files; skip instead of
                # crashing inside cvtColor.
                continue
            images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        fig = plt.figure(figsize=(col * 5, row * 5))
        grid = ImageGrid(fig, 111,  # similar to subplot(111)
                         nrows_ncols=(row, col),  # (rows, columns) grid of axes
                         axes_pad=0.05,  # pad between axes in inch.
                         )

        # Strip the ticks from every cell so unused cells stay clean too.
        for ax in grid:
            ax.set_xticks([])
            ax.set_yticks([])
        # Iterating over the grid returns the Axes.
        for ax, im in zip(grid, images):
            ax.imshow(im)
        plt.show()

In [None]:
# NOTE(review): these installs run after cv2 and matplotlib were already imported
# at the top of the notebook, and they pin a very old Pillow release. `PIL` and
# `image` are presumably not the intended packages (Pillow itself provides the
# `PIL` module) — confirm whether this cell is needed at all.
!pip install -q Pillow==4.0.0
!pip install -q PIL
!pip install -q image

In [None]:
# Display random grids of the images found in runs/detect/exp.
plot_sample_images()

In [None]:
# Show the confusion-matrix image from the training run directory.
fig, ax = plt.subplots(figsize=(30, 15))
ax.axis('off')
ax.imshow(plt.imread('runs/train/exp/confusion_matrix.png'));

In [None]:
# Show the results image from the training run directory.
fig, ax = plt.subplots(figsize=(30, 15))
ax.axis('off')
ax.imshow(plt.imread('runs/train/exp/results.png'));

In [None]:
# Print the current working directory; the notebook changed into ./yolov5 earlier via %cd,
# and the relative '../' paths used below depend on it.
!pwd

In [None]:
# Load the YOLO submission written earlier and the 2-class classifier
# predictions, then join them on image_id.
yolo = pd.read_csv('../submission_3.csv')
effnetb6 = pd.read_csv('/kaggle/input/vinbigdata-2class-prediction/2-cls test pred.csv')  # author's note: AUC:0.98
pred = yolo.merge(effnetb6, on='image_id', how='left')

# Thresholds on the classifier's `target` value used by filter_2cls.
low_thr = 0.08
high_thr = 0.95

In [None]:
def filter_2cls(row, low_thr=low_thr, high_thr=high_thr):
    """Adjust one row's PredictionString using the 2-class value in ``row['target']``.

    * target < low_thr: replace the string with ``'14 1 0 0 1 1'``.
    * low_thr <= target < high_thr: keep the string and append a class-14 entry
      carrying the target value.
    * target >= high_thr: keep the string as it is.

    Args:
        row: mapping / Series with ``target`` and ``PredictionString`` entries;
            it is updated in place.
        low_thr: lower threshold (defaults to the notebook-level ``low_thr``).
        high_thr: upper threshold (defaults to the notebook-level ``high_thr``).

    Returns:
        The same ``row`` object.

    Raises:
        ValueError: if ``target`` matches none of the ranges (e.g. NaN).
    """
    prob = row['target']
    if prob < low_thr:
        # Low chance of any finding: use the class-14 string only.
        updated = '14 1 0 0 1 1'
    elif low_thr <= prob < high_thr:
        # Uncertain: keep the detections and add class 14 with the target value.
        updated = row['PredictionString'] + f' 14 {prob} 0 0 1 1'
    elif high_thr <= prob:
        # High chance of a finding: trust the object-detection output.
        updated = row['PredictionString']
    else:
        raise ValueError('Prediction must be from [0-1]')
    row['PredictionString'] = updated
    return row

In [None]:
# Apply the 2-class filter row by row and write the final submission file.
sub = pred.apply(filter_2cls, axis='columns')
sub.loc[:, ['image_id', 'PredictionString']].to_csv('../submission.csv', index=False)