In [None]:
# PIP Installs
!/opt/conda/bin/python3.7 -m pip install -q --upgrade pip      # Upgrade PIP
!pip install -q pylibjpeg pylibjpeg-libjpeg pylibjpeg-openjpeg # Install/Upgrade PyDicom Dependencies

# Machine Learning and Data Science Imports
import tensorflow_probability as tfp
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
import tensorflow_hub as hub
from skimage import exposure
import pandas as pd; pd.options.mode.chained_assignment = None
import numpy as np
import scipy

# Built In Imports
from datetime import datetime
from glob import glob
import warnings
import IPython
import urllib
import zipfile
import pickle
import shutil
import string
import math
import tqdm
import time
import os
import gc
import re

# Visualization Imports
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from PIL import Image
import matplotlib
import plotly
import PIL
import cv2

# PRESETS
# Shared font settings (family / size / color) for figures — NOTE(review): not referenced
# elsewhere in the visible cells; confirm where it is consumed.
FIG_FONT = dict(family="Helvetica, Arial", size=14, color="#7f7f7f")
# 15 colors sampled from seaborn's "Spectral" palette, converted to 0-255 RGB labels via plotly helpers
LABEL_COLORS = [px.colors.label_rgb(px.colors.convert_to_RGB_255(x)) for x in sns.color_palette("Spectral", 15)]
# Same list with the entry at index 8 removed (14 colors)
# NOTE(review): presumably index 8 is the "no finding" slot when class names are sorted — verify.
LABEL_COLORS_WOUT_NO_FINDING = LABEL_COLORS[:8]+LABEL_COLORS[9:]

# Other Imports
from pydicom.pixel_data_handlers.util import apply_voi_lut
from tqdm.notebook import tqdm
import pydicom

print("\n... IMPORTS COMPLETE ...\n")

In [None]:
# Root directory of the competition data
DATA_DIR = "/kaggle/input/vinbigdata-chest-xray-abnormalities-detection"

# Folders holding the training and testing DICOM files
TRAIN_DIR, TEST_DIR = (os.path.join(DATA_DIR, split) for split in ("train", "test"))

# Full path of every DICOM file found in each folder
TRAIN_DICOM_PATHS = [os.path.join(TRAIN_DIR, entry) for entry in os.listdir(TRAIN_DIR)]
TEST_DICOM_PATHS = [os.path.join(TEST_DIR, entry) for entry in os.listdir(TEST_DIR)]
print(f"\n... The number of training files is {len(TRAIN_DICOM_PATHS)} ...")
print(f"... The number of testing files is {len(TEST_DICOM_PATHS)} ...")

# Locations of train.csv and the sample submission file
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
SS_CSV = os.path.join(DATA_DIR, "sample_submission.csv")

# Load both csv files into dataframes
train_df = pd.read_csv(TRAIN_CSV)
ss_df = pd.read_csv(SS_CSV)

# Preview the first rows of each dataframe
print("\n\nTRAIN DATAFRAME\n\n")
display(train_df.head(3))

print("\n\nSAMPLE SUBMISSION DATAFRAME\n\n")
display(ss_df.head(3))

In [None]:
# Convert a dicom file to a numpy array
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """ Convert dicom file to an 8-bit numpy array

    Args:
        path (str): Path to the dicom file to be converted
        voi_lut (bool): Whether or not to apply the VOI LUT transform to the pixel data
        fix_monochrome (bool): Whether or not to invert images whose
            PhotometricInterpretation is "MONOCHROME1"

    Returns:
        Numpy uint8 array with the pixel data rescaled to the range [0, 255]

    """
    # `dcmread` is the current pydicom reader (`read_file` is its deprecated alias)
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    pixels = dicom.pixel_array
    data = apply_voi_lut(pixels, dicom) if voi_lut else pixels

    # The XRAY may look inverted
    #   - If we want to fix this we can
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the minimum is 0, then scale so the maximum is 1
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # A constant image has peak == 0; skipping the division keeps it all-zero
        # instead of producing NaNs
        data = data / peak

    data = (data * 255).astype(np.uint8)
    return data

# Plot an image
def plot_image(img, title="", figsize=(8,8), cmap=None):
    """ Show a single image in its own matplotlib figure

    Args:
        img: Image data accepted by `plt.imshow`
        title (str, optional): Figure title (rendered in bold)
        figsize (tuple, optional): Figure size passed to `plt.figure`
        cmap (optional): Colormap passed to `plt.imshow`; any falsy value
            means "use matplotlib's default"

    Returns:
        None
    """
    plt.figure(figsize=figsize)

    # `cmap=None` is imshow's default, so one call covers both cases
    plt.imshow(img, cmap=cmap if cmap else None)

    plt.title(title, fontweight="bold")
    plt.axis(False)
    plt.show()

# Get the image id
def get_image_id(path):
    """ Return the image-id (file name without its last extension) from a path

    Args:
        path (str): "/"-separated path, or a bare file name

    Returns:
        str: The text after the last "/" with everything from the final "." removed
    """
    # [-1] (rather than [1]) also works when the path contains no "/" at all
    return path.rsplit("/", 1)[-1].rsplit(".", 1)[0]

# Compute fractional coordinates
def create_fractional_bbox_coordinates(row):
    """ Return bbox coordinates as fractions of the image size from a DF row

    Args:
        row: Mapping with "x_min", "x_max", "y_min", "y_max", "img_width"
            and "img_height" entries

    Returns:
        tuple: (frac_x_min, frac_x_max, frac_y_min, frac_y_max)
    """
    # x values are divided by the width, y values by the height
    coordinate_and_extent = (
        ("x_min", "img_width"),
        ("x_max", "img_width"),
        ("y_min", "img_height"),
        ("y_max", "img_height"),
    )
    return tuple(row[coordinate]/row[extent] for coordinate, extent in coordinate_and_extent)

# Draw a bounding box
def draw_bboxes(img, tl, br, rgb, label="", label_location="tl", opacity=0.1, line_thickness=0):
    """ Draw one tinted bounding box, with an optional outline and text label

    The box interior is blended with `rgb` directly inside `img` (the input
    array is modified in place); the same image is also returned.

    Args:
        img (np.ndarray): Image indexed as [row, column, channel] with 3 channels
            (expected to be uint8 so it can be blended with the uint8 fill)
        tl (sequence of 2 numbers): (x, y) of the top-left corner
        br (sequence of 2 numbers): (x, y) of the bottom-right corner
        rgb (sequence of 3 numbers): Color used for the fill, outline and label
        label (str or int, optional): Strings are upper-cased with spaces replaced
            by underscores; anything else is rendered as "CLASS_XX".
            "" or None draws no label
        label_location (str, optional): Corner the label is anchored to –
            one of "tl", "tr", "bl", "br"
        opacity (float, optional): Weight of the color fill in the blend
        line_thickness (int, optional): Outline thickness; 0 disables the outline

    Returns:
        The image with the box drawn on it
    """
    # Plain Python ints are accepted both for slicing and for OpenCV point arguments
    x0, y0 = int(tl[0]), int(tl[1])
    x1, y1 = int(br[0]), int(br[1])

    # A degenerate box has no pixels to blend, so the fill is skipped
    if x1 > x0 and y1 > y0:
        rect = np.uint8(np.ones((y1-y0, x1-x0, 3))*rgb)
        sub_combo = cv2.addWeighted(img[y0:y1, x0:x1, :], 1-opacity, rect, opacity, 1.0)
        img[y0:y1, x0:x1, :] = sub_combo

    if line_thickness>0:
        img = cv2.rectangle(img, (x0, y0), (x1, y1), rgb, line_thickness)

    # Explicit check so an integer label of 0 is still drawn
    if label is not None and label != "":
        # DEFAULTS
        FONT = cv2.FONT_HERSHEY_SIMPLEX
        FONT_SCALE = 1.666
        FONT_THICKNESS = 3
        FONT_LINE_TYPE = cv2.LINE_AA

        if isinstance(label, str):
            LABEL = label.upper().replace(" ", "_")
        else:
            LABEL = f"CLASS_{label:02}"

        text_width, text_height = cv2.getTextSize(LABEL, FONT, FONT_SCALE, FONT_THICKNESS)[0]

        # Anchor corner plus an offset that keeps the text just outside the box edge
        anchor_x, anchor_y = {
            "tl":(x0, y0), "br":(x1, y1), "tr":(x1, y0), "bl":(x0, y1)
        }[label_location]
        offset_x, offset_y = {
            "tl":(0, -10), "br":(-text_width, text_height+10),
            "tr":(-text_width, -10), "bl":(0, text_height+10)
        }[label_location]
        img = cv2.putText(img, LABEL, (anchor_x+offset_x, anchor_y+offset_y),
                          FONT, FONT_SCALE, rgb, FONT_THICKNESS, FONT_LINE_TYPE)

    return img

In [None]:
# Create dictionary mappings
# (replace the disease names with integer ids)
# The guard lets this cell be re-run after `class_name` has already been dropped;
# in that case the `int_2_str` built on the first run is reused.
if "class_name" in train_df.columns:
    # First class_name seen for every class_id, looked up by id
    class_names = train_df.drop_duplicates(subset="class_id").set_index("class_id")["class_name"]
    int_2_str = {i:class_names[i] for i in range(15)}
str_2_int = {v:k for k,v in int_2_str.items()}
# Colors are assigned in alphabetical order of the class names
int_2_clr = {str_2_int[k]:LABEL_COLORS[i] for i,k in enumerate(sorted(str_2_int.keys()))}

print("\n... Dictionary Mapping Class Integer to Class String Representation [int_2_str]...\n")
display(int_2_str)

print("\n... Dictionary Mapping Class String to Class Integer Representation [str_2_int]...\n")
display(str_2_int)

print("\n... Dictionary Mapping Class Integer to Color Representation [int_2_clr]...\n")
display(int_2_clr)

print("\n... Head of Train Dataframe After Dropping The Class Name Column...\n")
# The rest of the notebook reads train_df columns by position, so class_name must go
train_df = train_df.drop(columns=["class_name"], errors="ignore")
display(train_df.head(5))

In [None]:
# Split the dataframe into per-column numpy arrays (one entry per annotation row).
# `[:, k]` selects column k; a plain `[k]` would select row k instead.
tmp_numpy = train_df.to_numpy()
image_ids = tmp_numpy[:, 0]
class_ids = tmp_numpy[:, 1]
rad_ids = tmp_numpy[:, 2]
bboxes = tmp_numpy[:, 3:]

In [None]:
bboxes

In [None]:
# TrainData is a custom helper class bundling annotation lookup and plotting

class TrainData():
    """ Annotation lookup and visualization helper built around a bbox dataframe

    The dataframe is filtered through its `image_id`, `class_id` and `rad_id`
    columns and otherwise read positionally: column 0 is the image id, column 1
    the class id, column 2 the radiologist id (one leading character followed by
    an integer, e.g. "R8") and every remaining column is treated as bbox data.
    """

    def __init__(self, df, train_dir, cmap="Spectral"):
        """ Store the inputs, build the color palette and index every annotation

        Args:
            df (pd.DataFrame): Annotation dataframe (layout described on the class)
            train_dir (str): Directory containing the "<image_id>.dicom" files
            cmap (str, optional): Seaborn palette name used for the box colors
        """
        # Initialize
        self.df = df
        self.train_dir = train_dir

        # Visualization: 15 palette colors as 0-255 int tuples, minus the entry at index 8
        self.cmap = cmap
        self.pal = [tuple([int(x) for x in np.array(c)*(255,255,255)]) for c in sns.color_palette(cmap, 15)]
        self.pal.pop(8)

        # Annotations for every image, keyed by image id
        self.img_annotations = self.get_annotations(get_all=True)

    # Get annotations
    def get_annotations(self, get_all=False, image_ids=None, class_ids=None, rad_ids=None, index=None):
        """ Collect annotations, optionally filtered, grouped by image id

        All filters that are given are applied together (logical AND).

        Args:
            get_all (bool, optional): Ignore every filter and return all annotations
            image_ids (list of strs, optional): Keep rows whose image_id is listed
            class_ids (list of ints, optional): Keep rows whose class_id is listed
            rad_ids (list of strs, optional): Keep rows whose rad_id is listed
            index (int, list or slice, optional): Positional row selection applied
                after the other filters

        Returns:
            dict mapping image_id -> list of dicts with the keys
            `img_path`, `image_id`, `class_id`, `rad_id` and `bbox`

        Raises:
            ValueError: If no selection argument is passed
        """
        if not get_all and image_ids is None and class_ids is None and rad_ids is None and index is None:
            raise ValueError("Expected one of the following arguments to be passed:" \
                             "\n\t\t– `get_all`, `image_ids`, `class_ids`, `rad_ids`, or `index`")

        # Filtering always produces new frames, so the source frame is never modified
        tmp_df = self.df

        if not get_all:
            if image_ids is not None:
                tmp_df = tmp_df[tmp_df.image_id.isin(image_ids)]
            if class_ids is not None:
                tmp_df = tmp_df[tmp_df.class_id.isin(class_ids)]
            if rad_ids is not None:
                tmp_df = tmp_df[tmp_df.rad_id.isin(rad_ids)]
            if index is not None:
                # A scalar position would make `iloc` return a Series; wrap it so the
                # result stays a one-row dataframe
                if isinstance(index, (int, np.integer)):
                    index = [index]
                tmp_df = tmp_df.iloc[index]

        annotations = {image_id:[] for image_id in tmp_df.image_id.to_list()}
        for row in tmp_df.to_numpy():
            image_id = row[0]
            annotation = dict(
                img_path=os.path.join(self.train_dir, image_id+".dicom"),
                image_id=image_id,
                class_id=int(row[1]),
                rad_id=int(row[2][1:]),
            )

            # Class 14 rows keep their raw bbox values; every other class is cast to int
            if row[1]==14:
                annotation["bbox"] = row[3:]
            else:
                annotation["bbox"] = row[3:].astype(np.int32)

            annotations[image_id].append(annotation)
        return annotations

    # Get the annotated image
    def get_annotated_image(self, image_id, annots=None, plot=False, plot_size=(18,25), plot_title=""):
        """ Load an image and draw its annotations on it

        Args:
            image_id (str): Image to draw (used as the key when `annots` is a dict)
            annots (dict or list, optional): Either a dict as returned by
                `get_annotations` or the annotation list of this image;
                defaults to all annotations indexed at construction
            plot (bool, optional): Also show the result with `plot_image`
            plot_size (tuple, optional): Figure size used when plotting
            plot_title (str, optional): Title used when plotting

        Returns:
            RGB image array with every non-class-14 bbox drawn
        """
        if annots is None:
            annots = self.img_annotations

        image_annots = annots if isinstance(annots, list) else annots[image_id]

        img = cv2.cvtColor(dicom2array(image_annots[0]["img_path"]),cv2.COLOR_GRAY2RGB)
        for ann in image_annots:
            if ann["class_id"] != 14:
                img = draw_bboxes(img,
                                ann["bbox"][:2], ann["bbox"][-2:],
                                rgb=self.pal[ann["class_id"]],
                                label=int_2_str[ann["class_id"]],
                                opacity=0.08, line_thickness=4)
        if plot:
            plot_image(img, title=plot_title, figsize=plot_size)

        return img

    def _plot_annotation_grid(self, annotations, n, height_multiplier, verbose):
        """ Plot up to `n` annotated images from `annotations` in a two-column grid """
        if n <= 0:
            # Nothing to plot
            return

        # Round up so an odd `n` still gets a row for its last image
        n_rows = math.ceil(n/2)

        plt.figure(figsize=(20, height_multiplier*n))
        for i, (image_id, annots) in enumerate(annotations.items()):
            if i >= n:
                break
            if verbose:
                print(".", end="")
            plt.subplot(n_rows,2,i+1)
            plt.imshow(self.get_annotated_image(image_id, annots))
            plt.axis(False)
            plt.title(f"Image ID – {image_id}")
        plt.tight_layout(rect=[0, 0.03, 1, 0.97])
        plt.show()

    # Plot annotated images by image id
    def plot_image_ids(self, image_id_list, height_multiplier=6, verbose=True):
        """ Plot the annotated image of every id in `image_id_list`

        Args:
            image_id_list (list of strs): Image ids to plot
            height_multiplier (number, optional): Figure height per requested image
            verbose (bool, optional): Print a "." for every plotted image
        """
        annotations = self.get_annotations(image_ids=image_id_list)
        self._plot_annotation_grid(annotations, len(image_id_list), height_multiplier, verbose)

    # Plot annotated images by class
    def plot_classes(self, class_list, n=4, height_multiplier=6, verbose=True):
        """ Plot up to `n` images having annotations of the classes in `class_list`

        Only the boxes of the requested classes are drawn.

        Args:
            class_list (list of ints): Class ids to select
            n (int, optional): Maximum number of images to plot
            height_multiplier (number, optional): Figure height per requested image
            verbose (bool, optional): Print a "." for every plotted image
        """
        annotations = self.get_annotations(class_ids=class_list)
        self._plot_annotation_grid(annotations, n, height_multiplier, verbose)

    # Plot annotated images by radiologist
    def plot_radiologists(self, rad_id_list, n=4, height_multiplier=6, verbose=True):
        """ Plot up to `n` images annotated by the radiologists in `rad_id_list`

        Only the boxes drawn by the requested radiologists are shown.

        Args:
            rad_id_list (list of strs): Radiologist ids to select, e.g. ["R8"]
            n (int, optional): Maximum number of images to plot
            height_multiplier (number, optional): Figure height per requested image
            verbose (bool, optional): Print a "." for every plotted image
        """
        annotations = self.get_annotations(rad_ids=rad_id_list)
        self._plot_annotation_grid(annotations, n, height_multiplier, verbose)

    # Plot annotated images from a list of image id strings
    def plot_str_image_ids(self, image_id_list, height_multiplier=6, verbose=True):
        """ Same behavior as `plot_image_ids`; kept as a separate entry point for existing callers

        Args:
            image_id_list (list of strs): Image ids to plot
            height_multiplier (number, optional): Figure height per requested image
            verbose (bool, optional): Print a "." for every plotted image
        """
        annotations = self.get_annotations(image_ids=image_id_list)
        self._plot_annotation_grid(annotations, len(image_id_list), height_multiplier, verbose)

# Helper for the training annotations; the constructor indexes every row of train_df up front
train_data = TrainData(train_df, TRAIN_DIR)

In [None]:
train_df.columns

In [None]:
train_df

In [None]:
# 利用id画出标记
IMAGE_ID_LIST=['4b56bc6d22b192f075f13231419dfcc8', '051132a778e61a86eb147c7c6f564dfe']
train_data.plot_str_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)

In [None]:
#######

In [None]:
df_raw_high=pd.read_csv('../input/subtrain/submission_train.csv')

In [None]:
df_raw_high

In [None]:
import csv

# Convert a submission-style CSV (one PredictionString per image) into the
# one-row-per-box layout of train.csv, so it can be loaded with TrainData.
df_raw_high=pd.read_csv('../input/subtrain-suit/submission_train.csv')

content_list=[]
for image_id_str, prediction_str in zip(df_raw_high['image_id'], df_raw_high['PredictionString']):
    list_prediction_str=prediction_str.split()
    # Predictions come in groups of 6 tokens: token 0 is the class id, tokens 2-5 the box
    # (token 1 is dropped — presumably the confidence score; verify against the submission format)
    for j in range(len(list_prediction_str)//6):
        class_id_str, _score_str, x_min, y_min, x_max, y_max = list_prediction_str[j*6:j*6+6]
        # Class 14 entries carry no box and are skipped
        if int(class_id_str)==14:
            continue
        # 'R0' is a placeholder radiologist id for model predictions
        content_list.append([image_id_str, class_id_str, 'R0', x_min, y_min, x_max, y_max])

# newline="" is what the csv module expects for files handed to csv.writer
with open("model_high.csv","w", newline="") as csvfile:
    writer = csv.writer(csvfile)

    # Header first, then all rows at once
    writer.writerow(['image_id', 'class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max'])
    writer.writerows(content_list)

df_model_high=pd.read_csv('model_high.csv')
df_model_high

In [None]:
# df_raw_high=pd.read_csv('../input/csv-store/highscore.csv')
# df_raw_low=pd.read_csv('../input/csv-store/sub_3.18.csv')



# # import csv

# # with open("test.csv","w") as csvfile: 
# #     writer = csv.writer(csvfile)

# #     #先写入columns_name
# #     writer.writerow(['image_id', 'class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max'])
# #     #写入多行用writerows
# #     writer.writerows([[0,1,3],[1,2,3],[2,3,4]])
    
# # writer

# # dsaigh=pd.read_csv('test.csv')
# # dsaigh



# import csv

# with open("model_high.csv","w") as csvfile: 
#     writer = csv.writer(csvfile)

#     #先写入columns_name
#     writer.writerow(['image_id', 'class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max'])
#     content_list=[]
#     for i in range(df_raw_high.shape[0]):
#         image_id_str=df_raw_high.iloc[i]['image_id']
#         prediction_str=df_raw_high.iloc[i]['PredictionString']
#         list_prediction_str=prediction_str.split(" ")
#         print(image_id_str,len(list_prediction_str))
#         for j in range(int(len(list_prediction_str)/6)):
#             if int(list_prediction_str[j*6+0])==14:
# #                 print('true')
#                 continue
#             content_list.append([image_id_str,list_prediction_str[j*6+0],'R0',list_prediction_str[j*6+2],list_prediction_str[j*6+3],list_prediction_str[j*6+4],list_prediction_str[j*6+5]])
# #             if list_prediction_str[j*6+0]==14:
# #                 print('class_id',list_prediction_str[j*6+0])
# #             print('class_id',list_prediction_str[j*6+0])
                
            
    
#     #写入多行用writerows
#     writer.writerows(content_list)

# df_model_high=pd.read_csv('model_high.csv')
# df_model_high

# import csv

# with open("model_low.csv","w") as csvfile: 
#     writer = csv.writer(csvfile)

#     #先写入columns_name
#     writer.writerow(['image_id', 'class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max'])
#     content_list=[]
#     for i in range(df_raw_low.shape[0]):
#         image_id_str=df_raw_low.iloc[i]['image_id']
#         prediction_str=df_raw_low.iloc[i]['PredictionString']
#         list_prediction_str=prediction_str.split(" ")
#         print(image_id_str,len(list_prediction_str)/6)
#         for j in range(int(len(list_prediction_str)/6)):
#             if int(list_prediction_str[j*6+0])==14:
# #                 print('true')
#                 continue
#             content_list.append([image_id_str,list_prediction_str[j*6+0],'R0',list_prediction_str[j*6+2],list_prediction_str[j*6+3],list_prediction_str[j*6+4],list_prediction_str[j*6+5]])
# #             if list_prediction_str[j*6+0]==14:
# #                 print('class_id',list_prediction_str[j*6+0])
# #             print('class_id',list_prediction_str[j*6+0])
                
            
    
#     #写入多行用writerows
#     writer.writerows(content_list)

# df_model_low=pd.read_csv('model_low.csv')
# df_model_low

In [None]:
train_df

In [None]:
high_test_data = TrainData(df_model_high, '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train')


In [None]:
# 利用id画出标记
IMAGE_ID_LIST=['18ee9ef3baea468de2087e0edd85e919', '7eda1e28e4cee7d8016276c87b76259f']
# use high score csv to create a datastruce(0.235)
high_test_data.plot_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)

In [None]:
# 利用id画出标记
IMAGE_ID_LIST=['4b56bc6d22b192f075f13231419dfcc8', '051132a778e61a86eb147c7c6f564dfe']
train_data.plot_str_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)
high_test_data.plot_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)

In [None]:
# IMAGE_ID_LIST=list(set(IMAGE_ID_LIST))
# IMAGE_ID_LIST

In [None]:
IMAGE_ID_LIST=['01ded16689539deb30d0981fafd18465',
 '0291515f5d14c34180a15712a55bf7bd',
 '037503b94eb68a16587a78bce365e681',
 '03a395d9fc03a6f8c5a50e693c20ab15']

In [None]:
# use low score csv to create a data structure
# NOTE(review): in the visible cells `df_model_low` is only assigned inside the
# commented-out conversion code, so on a fresh kernel this line would raise
# NameError unless that conversion is restored — TODO confirm.
low_test_data = TrainData(df_model_low, '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/test')

In [None]:
# IMAGE_ID_LIST=['008bdde2af2462e86fd373a445d0f4cd',
#  '00a2145de1886cb9eb88869c85d74080']

In [None]:
low_test_data.plot_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)

In [None]:
# 利用id画出标记
IMAGE_ID_LIST=['18ee9ef3baea468de2087e0edd85e919', '7eda1e28e4cee7d8016276c87b76259f']
train_data.plot_str_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)

In [None]:
#######

In [None]:
# 利用id画出标记
IMAGE_ID_LIST = train_df[train_df.class_id!=14].image_id[25:29].to_list()
IMAGE_ID_LIST=['18ee9ef3baea468de2087e0edd85e919', '7eda1e28e4cee7d8016276c87b76259f']
train_data.plot_image_ids(image_id_list=IMAGE_ID_LIST, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PLOT IMAGES CONTAINING A SINGLE CLASS</b>

Summary to be done later... 

**NOTE: Only the bounding boxes for the specified classes will be drawn... probably a TBD in the future as a possible arg**

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[7,], n=2, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PLOT IMAGES CONTAINING ONE OR MORE CLASSES FROM A LIST</b>

Summary to be done later... 

**NOTE: Images need not contain ALL the classes... potential future improvement or option.**

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[5,8,11], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PLOT IMAGES ANNOTATED BY A SINGLE OR MULTIPLE RADIOLOGIST(S)</b>

Summary to be done later... 

**NOTE: Same caveat as plotting based on class. Only bounding boxes annotated by the specified radiologist will be plotted**<br>
**NOTE: As radiologists often annotate `No finding`, images may not contain ANY bounding boxes**

---

More detail to come later

In [None]:
train_data.plot_radiologists(rad_id_list=["R8"], verbose=False)

<h3 style="font-family: Verdana; font-size: 20px; font-style: normal; font-weight: normal; text-decoration: none; text-transform: none; letter-spacing: 2px; color: navy; background-color: #ffffff;">5.2  VISUALIZE EACH ABNORMALITY </h3>

---

We will leverage our created class to visualize 4 examples of every class...


* Aortic Enlargement <i><sub>(CLASS-ID: 0)</sub></i>
* Atelectasis <i><sub>(CLASS-ID: 1)</sub></i>
* Calcification <i><sub>(CLASS-ID: 2)</sub></i>
* Cardiomegaly <i><sub>(CLASS-ID: 3)</sub></i>
* Consolidation <i><sub>(CLASS-ID: 4)</sub></i>
* ILD <i><sub>(CLASS-ID: 5)</sub></i>
* Infiltration <i><sub>(CLASS-ID: 6)</sub></i>
* Lung Opacity <i><sub>(CLASS-ID: 7)</sub></i>
* Nodule/Mass <i><sub>(CLASS-ID: 8)</sub></i>
* Other Lesion <i><sub>(CLASS-ID: 9)</sub></i>
* Pleural Effusion <i><sub>(CLASS-ID: 10)</sub></i>
* Pleural Thickening <i><sub>(CLASS-ID: 11)</sub></i>
* Pneumothorax <i><sub>(CLASS-ID: 12)</sub></i>
* Pulmonary Fibrosis <i><sub>(CLASS-ID: 13)</sub></i>
* No Finding <i><sub>(CLASS-ID: 14)</sub></i>

<b style="text-decoration: underline; font-family: Verdana;">AORTIC ENLARGEMENT - (0)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[0,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">ATELECTASIS - (1)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[1,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">CALCIFICATION - (2)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[2,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">CARDIOMEGALY - (3)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[3,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">CONSOLIDATION - (4)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[4,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">ILD - (5)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[5,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">INFILTRATION - (6)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[6,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">LUNG OPACITY - (7)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[7,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">NODULE/MASS - (8)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[8,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">OTHER LESION - (9)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[9,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PLEURAL EFFUSION - (10)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[10,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PLEURAL THICKENING - (11)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[11,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PNEUMOTHORAX - (12)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[12,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">PULMONARY FIBROSIS - (13)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[13,], n=4, verbose=False)

<b style="text-decoration: underline; font-family: Verdana;">NO FINDING - (14)</b>

Summary to be done later... 

---

More detail to come later

In [None]:
train_data.plot_classes(class_list=[14,], n=4, verbose=False)

<a style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; color: navy; background-color: #ffffff;" id="combining_annotations">6&nbsp;&nbsp;COMBINING/MERGING OVERLAPPING ANNOTATIONS (WIP)</a>

EXPLANATION COMING SOON – NOT SURE ABOUT THE HANDLING OF NODULES AND OTHER SMALL BBOXES THAT GET ENGULFED BY LARGER SIMILAR ANNOTATIONS

这里想要优化框的包含，大框包含众多小框的问题，具体没看懂

In [None]:
def calc_iou(bbox_1, bbox_2):
    """ Intersection-over-union of two axis-aligned bounding boxes

    Args:
        bbox_1 (sequence of 4 numbers): [x_min, y_min, x_max, y_max]
        bbox_2 (sequence of 4 numbers): [x_min, y_min, x_max, y_max]

    Returns:
        float: IoU in [0, 1]; 0.0 when the boxes do not overlap or when the
        union has no area (e.g. two zero-area boxes)
    """
    # determine the coordinates of the intersection rectangle
    x_left = max(bbox_1[0], bbox_2[0])
    y_top = max(bbox_1[1], bbox_2[1])
    x_right = min(bbox_1[2], bbox_2[2])
    y_bottom = min(bbox_1[3], bbox_2[3])

    # Check if bboxes overlap at all (if not return 0)
    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # compute the area of both AABBs
    bbox_1_area = (bbox_1[2] - bbox_1[0]) * (bbox_1[3] - bbox_1[1])
    bbox_2_area = (bbox_2[2] - bbox_2[0]) * (bbox_2[3] - bbox_2[1])

    # union = sum of both areas minus the part counted twice
    union_area = float(bbox_1_area + bbox_2_area - intersection_area)

    # Degenerate (zero-area) boxes would otherwise divide by zero
    if union_area <= 0:
        return 0.0

    return intersection_area / union_area

def redux_bboxes(annots):
    """ Replace each group of overlapping same-class boxes by their common inner box

    For every box, all other boxes of the same class with IoU > 0 are gathered;
    if there is at least one, the region shared by the whole group is kept
    (once). Boxes without any overlapping same-class partner are dropped.

    Args:
        annots: Mapping with parallel "class_id", "rad_id" and "bbox" lists
            (boxes as [x_min, y_min, x_max, y_max]); updated in place

    Returns:
        The same `annots` object with the three lists replaced
    """
    def _get_inner_box(bboxes):
        # Region common to every box, or None when that region is empty
        left = max(b[0] for b in bboxes)
        top = max(b[1] for b in bboxes)
        right = min(b[2] for b in bboxes)
        bottom = min(b[3] for b in bboxes)
        if right <= left or bottom <= top:
            return None
        return [left, top, right, bottom]

    all_classes = annots["class_id"]
    all_boxes = annots["bbox"]

    kept_boxes, kept_classes, kept_rads = [], [], []

    for idx, (cls, rad, box) in enumerate(zip(all_classes, annots["rad_id"], all_boxes)):
        # Start the group with the box itself, then add same-class overlaps
        group = [box]
        for other_idx, (other_cls, other_box) in enumerate(zip(all_classes, all_boxes)):
            if other_idx == idx or other_cls != cls:
                continue
            if calc_iou(box, other_box) > 0.:
                group.append(other_box)

        # A lone box has nothing to be merged with
        if len(group) < 2:
            continue

        inner = _get_inner_box(group)
        if inner and inner not in kept_boxes:
            kept_boxes.append(inner)
            kept_classes.append(cls)
            kept_rads.append(rad)

    annots["bbox"] = kept_boxes
    annots["rad_id"] = kept_rads
    annots["class_id"] = kept_classes

    return annots

# Make GT Dataframe
# Explicit copy: the column edits below must never write through to train_df
gt_df = train_df[train_df.class_id!=14].copy()

# Apply Manipulations and Merger Functions
# Pack the four coordinates into one list-valued "bbox" column, gather every
# remaining column into per-image lists, then merge the boxes image by image
BBOX_COLUMNS = ["x_min","y_min","x_max","y_max"]
gt_df["bbox"] = gt_df.loc[:, BBOX_COLUMNS].values.tolist()
gt_df = gt_df.drop(columns=BBOX_COLUMNS)
gt_df = gt_df.groupby(["image_id"]).agg({k:list for k in gt_df.columns if k !="image_id"}).reset_index()
gt_df = gt_df.apply(redux_bboxes, axis=1)

# Recreate the Original Dataframe Style
# Exploding turns the per-image lists back into one row per box; images left
# with empty lists become NaN rows and are removed by dropna
gt_df = gt_df.apply(pd.Series.explode).reset_index(drop=True).dropna()
for position, column in enumerate(BBOX_COLUMNS):
    gt_df[column] = gt_df["bbox"].apply(lambda box, position=position: box[position])
gt_df = gt_df.drop(columns=["bbox"])

# Add back in NaN Rows As A Single Annotation
gt_df = pd.concat([
    gt_df, train_df.loc[train_df['class_id'] == 14].drop_duplicates(subset=["image_id"])
]).reset_index(drop=True)

In [None]:
# Helper built on the merged ("redux") annotations
gt_data = TrainData(gt_df, TRAIN_DIR)

# Order images by their number of remaining boxes (most first) and
# take every 20th id among the first 100
IMAGE_ID_LIST = (
    gt_df[gt_df.class_id!=14]
    .groupby("image_id")
    .count()
    .sort_values(by="class_id", ascending=False)
    .index[0:100:20]
)

# Show each selected image twice: original annotations, then the merged ones
plot_kwargs = dict(annots=None, plot=True, plot_size=(18,22))
for i, IMAGE_ID in enumerate(IMAGE_ID_LIST):
    train_data.get_annotated_image(IMAGE_ID, plot_title=f"ORIGINAL – IMG #{i+1}", **plot_kwargs)
    gt_data.get_annotated_image(IMAGE_ID, plot_title=f"REDUX VERSION – IMG #{i+1}", **plot_kwargs)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim — it rebuilds `gt_data`
# and plots the same original/redux image pairs a second time.
gt_data = TrainData(gt_df, TRAIN_DIR)
# Image ids ordered by their number of non-class-14 rows (most first); every 20th of the first 100
IMAGE_ID_LIST = gt_df[gt_df.class_id!=14].groupby("image_id") \
                                         .count() \
                                         .sort_values(by="class_id", ascending=False) \
                                         .index[0:100:20]

# Plot each selected image with the original annotations, then with the merged ones
for i, IMAGE_ID in enumerate(IMAGE_ID_LIST):
    train_data.get_annotated_image(IMAGE_ID, annots=None, plot=True, plot_size=(18,22), plot_title=f"ORIGINAL – IMG #{i+1}")
    gt_data.get_annotated_image(IMAGE_ID, annots=None, plot=True, plot_size=(18,22), plot_title=f"REDUX VERSION – IMG #{i+1}")