# VinBigData Abnormalities Detection [EDA+Infer]
- < Reference Code > <br>
    - [Visual In-Depth EDA – VinBigData Competition Data](https://www.kaggle.com/dschettler8845/visual-in-depth-eda-vinbigdata-competition-data/notebook)<br>
    - [VinBigData-CXR-AD YOLOv5 14 Class [infer]](https://www.kaggle.com/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-infer)<br>
    - [Quick data analysis with YOLOv5 at a glance](https://www.kaggle.com/jamsilkaggle/quick-data-analysis-with-yolov5-at-a-glance)
- If this helps, please upvote this notebook and the original ones 👍🏼
- The goal of this notebook is to perform EDA on the VinBigData dataset and to analyze the validation set with YOLOv5

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

from tqdm.notebook import tqdm
from glob import glob
import shutil, os
import random
import cv2

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import plotly.express as px
import seaborn as sns

import torch
from IPython.display import Image, clear_output

# Import Dataset

In [None]:
# Input locations: the training image folder and the YOLOv5 checkpoint
# that the inference cell passes to detect.py.
train_dir = '/kaggle/input/vinbigdata-512-image-dataset/vinbigdata/train'
weights_dir = '/kaggle/input/vinbigdata-cxr-ad-yolov5-14-class-train/yolov5/runs/train/exp/weights/best.pt'

# Load the annotation table and preview its first three rows.
train_csv_path = '../input/vinbigdata-512-image-dataset/vinbigdata/train.csv'
train_df = pd.read_csv(train_csv_path)
display(train_df.head(3))

# Check Train csv Length

In [None]:
# Row counts vs. distinct image_id counts, for the full table and for the
# table with class_id 14 ("No finding") rows removed.
n_rows = len(train_df)
n_images = len(train_df.image_id.unique())

abnormal_rows = train_df[train_df.class_id != 14]
n_abnormal_rows = len(abnormal_rows)
n_abnormal_images = len(abnormal_rows.image_id.unique())

print('train data total length : ', n_rows)
print('train data unique length : ', n_images)
print('The average number of disease per patient :', n_rows / n_images)
print('The average number of disease per patient (without healty people):', n_abnormal_rows / n_abnormal_images)

# Image Number / Unique Abnormalities per Patient

In [None]:
# 15 viridis colours converted to plotly "rgb(r, g, b)" strings; reused by
# the plotly figures in the following cells.
color_palette = [
    px.colors.label_rgb(px.colors.convert_to_RGB_255(rgb))
    for rgb in sns.color_palette("viridis", 15)
]

# Histogram of the number of annotation rows recorded for each image_id.
rows_per_image = train_df.image_id.value_counts()
fig = px.histogram(
    rows_per_image,
    title="<b>Distribution of Image Number per Patient",
    color_discrete_sequence=color_palette,
    log_y=True,
)
fig.update_layout(
    showlegend=False,
    xaxis_title="<b>Number of Images</b>",
    yaxis_title="<b>Count of Unique Patient",
)
fig.show()

# Histogram of the number of distinct class names annotated on each image_id.
classes_per_image = train_df.groupby('image_id')["class_name"].unique().apply(len)
fig = px.histogram(
    classes_per_image,
    color_discrete_sequence=color_palette,
    title="<b>Distribution of Unique Abnormalities per Patient",
    log_y=True,
)
fig.update_layout(
    showlegend=False,
    xaxis_title="<b>Number of Unique Abnormalities</b>",
    yaxis_title="<b>Count of Unique Patients</b>",
)
fig.show()

# Plot Label Distribution

In [None]:
# Bar chart of annotation counts per class name, classes in index order.
all_label_counts = train_df.class_name.value_counts().sort_index()
fig = px.bar(
    all_label_counts,
    color=all_label_counts.index,
    color_discrete_sequence=color_palette,
    title="<b>Label Distribution</b>",
)
fig.update_layout(legend_title=None, xaxis_title="", yaxis_title="")
fig.show()

# Same chart after dropping class_id 14 rows, so the remaining classes
# are not dwarfed by it.
finding_label_counts = (
    train_df[train_df.class_id != 14].class_name.value_counts().sort_index()
)
fig = px.bar(
    finding_label_counts,
    color=finding_label_counts.index,
    color_discrete_sequence=color_palette,
    title="<b>Label Distribution except No Finding</b>",
)
fig.update_layout(legend_title=None, xaxis_title="", yaxis_title="")
fig.show()

# Heatmap Showing Bounding Box Placement

In [None]:
# Keep only rows that carry a bounding box (class_id 14 is excluded) and
# express each box corner as a fraction of the image width / height so that
# boxes from differently sized images become comparable.
bbox_df = train_df[train_df.class_id != 14].reset_index(drop=True)

# Vectorised column arithmetic: same values as a row-wise apply(axis=1),
# without a Python-level call per row.
bbox_df['frac_x_min'] = bbox_df.x_min / bbox_df.width
bbox_df['frac_y_min'] = bbox_df.y_min / bbox_df.height
bbox_df['frac_x_max'] = bbox_df.x_max / bbox_df.width
bbox_df['frac_y_max'] = bbox_df.y_max / bbox_df.height
bbox_df.head()

In [None]:
# Build one occupancy heatmap per class: every bounding box is rescaled onto
# a common canvas (the mean image size) and each pixel it covers is
# incremented in that class's channel.
avg_width = int(np.mean(bbox_df.width))
avg_height = int(np.mean(bbox_df.height))

# The array is indexed [row (y), column (x), class] below and by imshow, so
# the height must be the first dimension. (width, height) here would clip or
# misplace boxes whenever the canvas is not square.
heatmap_size = (avg_height, avg_width, 14)
heatmap = np.zeros(heatmap_size, dtype=np.int16)

# Columns: class_id, x_min, x_max, y_min, y_max, all in canvas pixels after scaling.
bbox_np = bbox_df[["class_id", "frac_x_min", "frac_x_max", "frac_y_min", "frac_y_max"]].to_numpy()
bbox_np[:, 1:3] *= avg_width
bbox_np[:, 3:5] *= avg_height
bbox_np = np.floor(bbox_np).astype(np.int16)

# class_id -> class_name lookup, taken from the first row of each class.
# Also used by the prediction-parsing cell.
label_dic = {i: train_df[train_df["class_id"] == i].iloc[0]["class_name"] for i in range(15)}

# One black -> class colour -> near-white colormap per palette entry.
custom_cmaps = [
    matplotlib.colors.LinearSegmentedColormap.from_list(
        colors=[(0., 0., 0.), c, (0.95, 0.95, 0.95)], name=f"custom_{i}")
    for i, c in enumerate(sns.color_palette("Spectral", 15))
]
custom_cmaps.pop(8)  # Remove No-Finding

for row in tqdm(bbox_np, total=bbox_np.shape[0]):
    # Slices past the canvas edge are clamped by NumPy, so a box touching
    # the right/bottom border is safe.
    heatmap[row[3]:row[4] + 1, row[1]:row[2] + 1, row[0]] += 1

fig = plt.figure(figsize=(20, 25))
plt.suptitle("Heatmaps Showing Bounding Box Placement\n ", fontweight="bold", fontsize=16)
for i in range(15):
    plt.subplot(4, 4, i + 1)
    if i == 0:
        # First panel: mean over all 14 class channels.
        plt.imshow(heatmap.mean(axis=-1), cmap="bone")
        plt.title("Average of All Classes", fontweight="bold")
    else:
        # Panel i shows class i-1, so the displayed id must be i-1 as well.
        class_id = i - 1
        plt.imshow(heatmap[:, :, class_id], cmap=custom_cmaps[class_id])
        plt.title(f"{label_dic[class_id]} – id : {class_id}", fontweight="bold")

    plt.axis(False)
fig.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.show()

# Bounding Box Area Percentage of Image

In [None]:
# Box area as a fraction of the image area: fractional width times
# fractional height.
frac_box_width = bbox_df["frac_x_max"] - bbox_df["frac_x_min"]
frac_box_height = bbox_df["frac_y_max"] - bbox_df["frac_y_min"]
bbox_df["frac_bbox_area"] = frac_box_width * frac_box_height

# One notched box per class, classes ordered by name.
bbox_by_class = bbox_df.sort_values(by="class_name")
fig = px.box(
    bbox_by_class,
    x="class_name",
    y="frac_bbox_area",
    color="class_name",
    color_discrete_sequence=color_palette,
    notched=True,
    labels={"class_id_as_str":"Class Name", "frac_bbox_area":"BBox Area (%)"},
    title="<b>Bounding Box Area % of Image",
)

# Fixed y-range; boxes above 0.35 of the image area fall outside the view.
fig.update_layout(
    showlegend=True,
    yaxis_range=[-0.025,0.35],
    legend_title_text=None,
    xaxis_title="",
    yaxis_title="",
)
fig.show()

---

# Make Valid Set with Group K Fold (except No Finding)

In [None]:
# Reload the annotations, drop class_id 14 rows and assign every row to one
# of 5 folds, grouped by image_id so that all boxes of an image share a fold.
train_df = pd.read_csv('../input/vinbigdata-512-image-dataset/vinbigdata/train.csv')
train_df = train_df[train_df.class_id != 14].reset_index(drop=True)

val_fold = 4  # fold held out as the validation set
train_df['fold'] = -1
group_kfold = GroupKFold(n_splits=5)
split = group_kfold.split(train_df, groups=train_df.image_id.tolist())
for fold, (train_idx, val_idx) in enumerate(split):
    train_df.loc[val_idx, 'fold'] = fold
val_df = train_df[train_df['fold'] == val_fold]
display(val_df.head(3))

train_df['image_path'] = '/kaggle/input/vinbigdata-512-image-dataset/vinbigdata/train/' + train_df.image_id + '.png'

# Select on val_fold, not on the leaked loop variable `fold`: the two only
# coincide because the last fold index happens to equal val_fold.
in_val_fold = train_df.fold == val_fold
train_files = list(train_df[~in_val_fold].image_path.unique())
val_files = list(train_df[in_val_fold].image_path.unique())
print('train file length : {}, valid file length : {}'.format(len(train_files), len(val_files)))

In [None]:
# Create the labels/{train,val} and images/{train,val} folders under the
# working directory.
working_root = '/kaggle/working/vinbigdata'
for sub_dir in ('labels/train', 'labels/val', 'images/train', 'images/val'):
    os.makedirs(os.path.join(working_root, sub_dir), exist_ok = True)
label_dir = '/kaggle/input/vinbigdata-yolo-labels-dataset/labels'

# For each split, copy every image together with the label file that has
# the same base name (<image_id>.txt).
for split_name, split_files in (('train', train_files), ('val', val_files)):
    image_dst = working_root + '/images/' + split_name
    label_dst = working_root + '/labels/' + split_name
    for file in split_files:
        shutil.copy(file, image_dst)
        filename = file.split('/')[-1].split('.')[0]
        shutil.copy(os.path.join(label_dir, filename + '.txt'), label_dst)

# Folder handed to detect.py as --source in the inference cell.
val_dir = '/kaggle/working/vinbigdata/images/val'

# YOLO v5 Setup

In [None]:
# Copy the YOLOv5 sources into the writable working directory and make it
# the current directory, so the relative paths used later (detect.py,
# runs/detect/...) resolve inside it.
yolo_src = '/kaggle/input/yolov5-official-v31-dataset/yolov5'
yolo_dst = '/kaggle/working/yolo5'

# shutil.copytree raises FileExistsError when the destination already
# exists, so skip the copy when this cell is re-run.
if not os.path.isdir(yolo_dst):
    shutil.copytree(yolo_src, yolo_dst)
os.chdir(yolo_dst)

clear_output()
print('Setup complete. Using torch %s' % (torch.__version__))
print(torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU')

# Yolo v5 Inference

In [None]:
# Run YOLOv5's detect.py on the validation images via an IPython shell escape;
# $weights_dir and $val_dir are expanded from the notebook's Python variables.
# NOTE(review): --save-txt / --save-conf presumably produce the per-image
# runs/detect/exp/labels/*.txt files (6 numbers per detection) that the
# prediction-parsing cell reads, and --exist-ok presumably reuses the same
# output folder on re-runs; confirm against the bundled detect.py.
!python detect.py --weights $weights_dir\
--img 640\
--conf 0.15\
--iou 0.4\
--source $val_dir\
--save-txt --save-conf --exist-ok

# Plot Inference Results

In [None]:
# Show a random sample of the images written by detect.py in a row x col grid.
files = glob('runs/detect/exp/*png')
row = 6; col = 4

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample at the number of available images.
grid_files = random.sample(files, min(len(files), row * col))
images = []
for image_path in tqdm(grid_files):
    # OpenCV loads BGR; matplotlib expects RGB.
    img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    images.append(img)

fig = plt.figure(figsize=(col * 5, row * 5))
grid = ImageGrid(fig, 111,  # similar to subplot(111)
                 nrows_ncols=(row, col),  # creates a row x col grid of axes
                 axes_pad=0.05,  # pad between axes in inch.
                 )

# Iterating over the grid returns the Axes; zip stops at the shorter of the
# two, so any surplus axes simply stay empty.
for ax, im in zip(grid, images):
    ax.imshow(im)
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()

In [None]:
def yolo2voc(image_height, image_width, bboxes):
    """Convert normalised centre-format boxes to absolute corner-format boxes.

    The last axis of ``bboxes`` is read as ``[x_centre, y_centre, width,
    height]`` in fractions of the image size; the result holds
    ``[x_min, y_min, x_max, y_max]`` in pixels at the same positions.

    Args:
        image_height: Image height in pixels (scales the y / height columns).
        image_width: Image width in pixels (scales the x / width columns).
        bboxes: NumPy array whose last axis has at least four entries.

    Returns:
        A new float array of the same shape; the input is left untouched.
    """
    voc = bboxes.astype(float)  # astype hands back a fresh array

    # Scale the centre and the size to pixels.
    centre_x = voc[..., 0] * image_width
    centre_y = voc[..., 1] * image_height
    box_w = voc[..., 2] * image_width
    box_h = voc[..., 3] * image_height

    # Top-left corner is centre minus half the size; the bottom-right corner
    # is that top-left corner plus the full size.
    voc[..., 0] = centre_x - box_w / 2
    voc[..., 1] = centre_y - box_h / 2
    voc[..., 2] = voc[..., 0] + box_w
    voc[..., 3] = voc[..., 1] + box_h
    return voc

In [None]:
# Parse the label files written by detect.py into one row per detection.
# Each file is read as rows of 6 numbers: class, x_centre, y_centre, width,
# height (normalised) and confidence.
image_ids = []; PredictionStrings = []; classes = []; scores = []
x_min = []; y_min = []; x_max = []; y_max = []

for file_path in glob('runs/detect/exp/labels/*txt'):
    image_id = file_path.split('/')[-1].split('.')[0]
    # Image size recorded in the annotation table; boxes are scaled to it.
    w, h = val_df[val_df.image_id == image_id][['width', 'height']].values[0]

    # `with` closes the handle; split() on any whitespace also tolerates
    # empty files and repeated separators.
    with open(file_path, 'r') as f:
        data = np.array(f.read().split()).astype(np.float32).reshape(-1, 6)

    # Normalised centre boxes -> rounded absolute corner boxes.
    voc_boxes = np.round(yolo2voc(h, w, data[:, 1:5])).astype(int)

    image_ids.extend([image_id] * len(data))
    classes.extend(data[:, 0].astype(int).tolist())
    # Confidences are fractions in [0, 1]; keep them as floats, because an
    # int() cast truncates every score to 0.
    scores.extend(data[:, 5].astype(float).tolist())
    x_min.extend(voc_boxes[:, 0].tolist())
    y_min.extend(voc_boxes[:, 1].tolist())
    x_max.extend(voc_boxes[:, 2].tolist())
    y_max.extend(voc_boxes[:, 3].tolist())

pred_df = pd.DataFrame({'image_id':image_ids,'classes':classes,'scores':scores,'x_min':x_min,'y_min':y_min,'x_max':x_max,'y_max':y_max})
# Map predicted class ids to names with the lookup built in the heatmap cell.
pred_df['class_name'] = pred_df.classes.apply(lambda x : label_dic[x])

# Plot Actual Distribution vs Predicted Distribution

In [None]:
# Compare how often each class appears in the validation annotations with
# how often the model predicted it.
actual_counts = val_df.class_name.value_counts().sort_index()
pred_counts = pred_df.class_name.value_counts().sort_index()

fig = px.bar(actual_counts,
             color=actual_counts.index,
             color_discrete_sequence=color_palette,
             title="<b>Actual Distribution of Validation Dataset</b>")
fig.update_layout(legend_title=None,
                  xaxis_title="",
                  yaxis_title="")
fig.show()

fig = px.bar(pred_counts,
             color=pred_counts.index,
             color_discrete_sequence=color_palette,
             title="<b>Predicted Distribution of Validation Dataset</b>")
fig.update_layout(legend_title=None,
                  xaxis_title="",
                  yaxis_title="")
fig.show()

# Subtraction aligns on class name. fill_value=0 treats a class missing on
# one side as a count of zero instead of producing NaN, which would drop
# that bar from the chart.
diff = actual_counts.sub(pred_counts, fill_value=0)
fig = px.bar(diff,
             color=diff.index,
             color_discrete_sequence=color_palette,
             title="<b>The Difference between Actual Distribution and Predicted Distribution</b>")
fig.update_layout(legend_title=None,
                  xaxis_title="",
                  yaxis_title="")
fig.show()