# **Data Analysis**

* [Dependencies and imports](#section-one)
* [Read Data](#section-two)
    * [Study-level](#section-two-one)
    * [Image-level](#section-two-two)
    * [Merge study and image levels](#section-two-three)
* [Data Analysis](#section-three)
    * [Null values](#section-three-one)
    * [Duplicate values](#section-three-two)
    * [Number of images per study](#section-three-three)
    * [Number of bboxes per image](#section-three-four)
    * [Study-level class frequency](#section-three-five)
    * [Image-level class frequency](#section-three-six)
    * [Class distribution with no bbox](#section-three-seven)
    * [Number of boxes for the different classes](#section-three-eight)
    * [Relation between box size and number of boxes per image](#section-three-nine)
* [DICOM files](#section-four)
    * [View dicom files metadata](#section-four-one)
    * [Add metadata and image shape to train df](#section-four-two)
* [Explore images](#section-five)
    * [Number of images per study](#section-five-one)
    * [Studies with 3 images](#section-five-two)
    * [Studies with 4 images](#section-five-three)
    * [Studies with 5 images](#section-five-four)
    * [Studies with 6 images](#section-five-five)
    * [Studies with 7 images](#section-five-six)
    * [Studies with 9 images](#section-five-seven)
    * [Studies with 2 images](#section-five-eight)
    * [Box count per class](#section-five-nine)
* [Final train df](#section-six)
* [Create test df](#section-seven)

<a id="section-one"></a>
## **Dependencies and imports**

In [None]:
conda install gdcm -c conda-forge

In [None]:
!pip install --upgrade --force-reinstall numpy

In [None]:
import torch
import pydicom
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import cv2
import ast
import os
from termcolor import colored
from pydicom.pixel_data_handlers.util import apply_voi_lut

<a id="section-two"></a>
## **Read Data**

In [None]:
data_path = '../input/siim-covid19-detection'
output_path = './'

In [None]:
os.listdir(data_path)

In [None]:
train_path = os.path.join(data_path, 'train')
train_study_path = os.path.join(data_path, 'train_study_level.csv')
train_image_path = os.path.join(data_path, 'train_image_level.csv')

train_study_df = pd.read_csv(train_study_path)
train_image_df = pd.read_csv(train_image_path)

print("Train Study Shape: {}\nTrain Image Shape:{}".format(train_study_df.shape, train_image_df.shape))

We have more images than studies, i.e. some studies contain more than one image

<a id="section-two-one"></a>
#### **Study-level**

In [None]:
print("Train study:\n")
train_study_df

rename studies ids (get only study id)

In [None]:
# rename id col
train_study_df = train_study_df.rename(columns = {'id': 'study_id'}, inplace = False)
# get only the id (split by '_' to id and 'study' and get the first)
train_study_df["study_id"] = train_study_df["study_id"].apply(lambda x: x.split("_")[0])

rename class columns and add int label and class label column

In [None]:
NEGATIVE = 'negative'
TYPICAL = 'typical'
INDERTEMINATE = 'indeterminate'
ATYPICAL = 'atypical'

study_level_labels = {NEGATIVE:0, TYPICAL:1, INDERTEMINATE:2, ATYPICAL:3}

# rename columns for easier use
train_study_df = train_study_df.rename(columns = {'Negative for Pneumonia': NEGATIVE,
                                                  'Typical Appearance': TYPICAL,
                                                  'Indeterminate Appearance': INDERTEMINATE,
                                                  'Atypical Appearance': ATYPICAL}, inplace = False)

In [None]:
# Derive one textual label and one integer label per study from the one-hot
# class columns. Classes are tried in the key order of study_level_labels
# (negative, typical, indeterminate, atypical), so the first positive one wins.
labels = []
int_labels = []

for index, row in train_study_df.iterrows():
    label = next((name for name in study_level_labels if row[name] == 1), None)
    if label is None:
        # Without this check the lists would end up shorter than the dataframe
        # and the column assignment below would fail with an unrelated message.
        raise ValueError("study at index {} has no positive class column".format(index))
    labels.append(label)
    int_labels.append(study_level_labels[label])

train_study_df['study_level'] =  labels
train_study_df['int_label'] =  int_labels

In [None]:
train_study_df.head()

<a id="section-two-two"></a>
#### **Image-level**

In [None]:
print("Train image:\n") 
train_image_df

rename images ids (get only image id)

In [None]:
# rename id col
train_image_df = train_image_df.rename(columns = {'id': 'img_id'}, inplace = False)
# get only the id (split by '_' to id and 'image' and get the first)
train_image_df["img_id"] = train_image_df["img_id"].apply(lambda x: x.split("_")[0])

rename class columns and add split label to class, score and bboxes columns

In [None]:
NONE = 'none'
OPACITY = 'opacity'

IMAGE_LEVEL_LABEL_SIZE = 4

In [None]:
def get_num_boxes(sample):
    """Return how many boxes the sample's 'boxes' field describes.

    'boxes' is parsed as a Python literal when it is a string; any
    non-string value (e.g. NaN) counts as zero boxes.
    """
    raw_boxes = sample['boxes']
    if not isinstance(raw_boxes, str):
        return 0  # no boxes
    return len(ast.literal_eval(raw_boxes))

def get_coco_format(sample):
    """Convert the sample's 'boxes' string into a list of [x, y, width, height] floats.

    Returns NaN when 'boxes' is not a string.
    """
    raw_boxes = sample['boxes']
    if not isinstance(raw_boxes, str):
        return np.nan
    return [
        [float(entry[field]) for field in ('x', 'y', 'width', 'height')]
        for entry in ast.literal_eval(raw_boxes)
    ]

def get_label(sample, num_boxes):
    """Split the sample's 'label' string into class, confidence scores and boxes.

    The string is read as space-separated groups of 6 tokens
    (opacity/none, score, x1, y1, x2, y2). The returned class is the first
    token; one score and one 4-float box are read per group. When
    num_boxes is 0, one group is still parsed.
    """
    group_size = 6  # opacity/none, score, x1, y1, x2, y2
    group_count = num_boxes if num_boxes != 0 else 1  # box-less rows still carry one group
    tokens = sample['label'].split(' ')

    confidence_scores = []
    pascal_voc_boxes = []
    for group in range(group_count):
        score_pos = group * group_size + 1
        confidence_scores.append(float(tokens[score_pos]))
        pascal_voc_boxes.append([float(tokens[score_pos + offset]) for offset in (1, 2, 3, 4)])

    return tokens[0], confidence_scores, pascal_voc_boxes

In [None]:
# Parse every image-level row once and collect the derived values column-wise.
all_num_boxes, all_labels, all_scores = [], [], []
all_pascal_voc_boxes, all_coco_boxes = [], []

for row_idx, image_row in train_image_df.iterrows():
    box_count = get_num_boxes(image_row)
    image_label, box_scores, voc_boxes = get_label(image_row, box_count)
    all_num_boxes.append(box_count)
    all_labels.append(image_label)
    all_scores.append(box_scores)
    all_pascal_voc_boxes.append(voc_boxes)
    all_coco_boxes.append(get_coco_format(image_row))

# Attach the parsed values as new columns, in the same column order as before.
for column_name, column_values in (
    ('image_level', all_labels),
    ('confidence_scores', all_scores),
    ('pascal_voc_boxes', all_pascal_voc_boxes),
    ('coco_boxes', all_coco_boxes),
    ('num_boxes', all_num_boxes),
):
    train_image_df[column_name] = column_values

In [None]:
train_image_df.head()

In [None]:
train_image_df = train_image_df.drop(columns=['label'])

add path of dicom file to df

In [None]:
def get_img_id(path):
    """Return the image id: the last '/'-separated component of `path`, up to its first '.'."""
    file_name = path.rpartition('/')[2]
    return file_name.partition('.')[0]

def get_imgs_paths(root_dir):
    """Walk `root_dir` recursively and map each file's image id to its full path.

    If several files yield the same id, the path visited last by os.walk is kept.
    """
    id_to_path = {}
    for folder, _subdirs, file_names in os.walk(root_dir):
        for file_name in file_names:
            full_path = os.path.join(folder, file_name)
            id_to_path[get_img_id(full_path)] = full_path
    return id_to_path

In [None]:
# Index every file under data_path by image id, then look each row's id up in
# a single vectorized pass (instead of one full dataframe scan per file).
paths = get_imgs_paths(data_path)
# Rows whose img_id has no matching file get NaN, as before.
train_image_df['dicom_path'] = train_image_df['img_id'].map(paths)

In [None]:
train_image_df.head()

<a id="section-two-three"></a>
#### **Merge study and image levels**

In [None]:
# merge image and study data
train_df = train_image_df.merge(train_study_df, left_on="StudyInstanceUID", right_on="study_id")

In [None]:
train_df.head()

In [None]:
train_df = train_df.drop(columns='study_id')
train_df.head()

<a id="section-three"></a>
## **Data Analysis**

In [None]:
# helper function to plot frequencies
def plot_frequency(ax, counts_dict, title, xlabel, ylabel, xgap=0, ygap=50):
    """Draw `counts_dict` as a bar chart on `ax` and write each count next to its bar.

    Args:
        ax: axes object to draw on.
        counts_dict: mapping of bar position/category -> count.
        title, xlabel, ylabel: texts for the axes.
        xgap, ygap: offsets added to the text position of each count.
    """
    categories = list(counts_dict.keys())
    counts = list(counts_dict.values())

    ax.bar(categories, counts)
    # Text is positioned by enumeration order (0, 1, 2, ...), not by key value;
    # callers compensate with xgap.
    for position, count in enumerate(counts):
        ax.text(position + xgap, count + ygap, str(count), color='#267DBE', fontweight='bold')

    ax.grid(axis='y', alpha=0.75)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xticks(categories)

<a id="section-three-one"></a>
#### **Null values**

In [None]:
train_df.isnull().sum()

<a id="section-three-two"></a>
#### **Duplicate values**

In [None]:
print("StudyInstanceUID is unique? {}".format(train_df['StudyInstanceUID'].is_unique))

<a id="section-three-three"></a>
#### **Number of images per study**

In [None]:
# For every study size from 1 up to the largest one, count how many studies
# contain exactly that many images. value_counts() is computed once and reused.
images_per_study = train_df.StudyInstanceUID.value_counts().values

study_images_count = {}
for num_images in range(1, max(images_per_study)+1):
    study_images_count[num_images] = np.count_nonzero(images_per_study == num_images)

title = "Number images per study"
xlabel = "num images"
ylabel = "num studies"
fig, ax = plt.subplots(figsize=(8,6))
plot_frequency(ax, study_images_count, title, xlabel, ylabel, xgap=0.8)

<a id="section-three-four"></a>
#### **Number of bboxes per image**

In [None]:
# Count images for every box count from 0 up to the maximum; box counts that
# never occur simply get 0, so no separate membership test is needed.
boxes_per_image = train_image_df['num_boxes']

boxes_count = {}
for num_boxes in range(boxes_per_image.max()+1):
    boxes_count[num_boxes] = int((boxes_per_image == num_boxes).sum())

title = "Number of boxes per image"
xlabel = "num boxes"
ylabel = "num images"
fig, ax = plt.subplots(figsize=(8,6))
plot_frequency(ax, boxes_count, title, xlabel, ylabel, xgap=-0.2)

We can see the data contains 2040 null boxes, which should mean the image label is none. We also consider images with more than 3 bboxes as outliers

In [None]:
# sanity check: every image without boxes should carry the NONE label.
# All quantities are taken from train_df so that the numbers being compared
# come from the same set of rows.
null_boxes = train_df[train_df['boxes'].isna()]
none_labels = train_df[train_df['image_level'] == NONE]
print("There are {} null boxes and {} none labels".format(len(null_boxes), len(none_labels)))
all_null_are_none = bool((null_boxes['image_level'] == NONE).all())
print("Are all null boxes with none labels? {}".format(all_null_are_none))

<a id="section-three-five"></a>
#### **Study-level class frequency**

In [None]:
# check data sparsity for study
# Each count is the number of train_df rows flagged with that study-level class.
num_negatives = int((train_df[NEGATIVE] == 1).sum())
num_typicals = int((train_df[TYPICAL] == 1).sum())
num_indeterminates = int((train_df[INDERTEMINATE] == 1).sum())
num_atypicals = int((train_df[ATYPICAL] == 1).sum())

study_labels_count = dict(zip(
    (NEGATIVE, TYPICAL, INDERTEMINATE, ATYPICAL),
    (num_negatives, num_typicals, num_indeterminates, num_atypicals),
))

fig, ax = plt.subplots(figsize=(8,6))
title = "Number of studies per label"
xlabel = "study-level labels"
ylabel = "number of images"

plot_frequency(ax, study_labels_count, title, xlabel, ylabel, xgap=-0.1)

<a id="section-three-six"></a>
#### **Image-level class frequency**

In [None]:
# check data sparsity
# The mask and the frame it is applied to must be the same dataframe; a mask
# built from train_df only lines up with train_image_df if both happen to share
# the same index.
num_none = int((train_image_df['image_level'] == NONE).sum())
num_opacity = int((train_image_df['image_level'] == OPACITY).sum())

study_labels_count = {NONE:num_none, OPACITY: num_opacity}

title = "Number of images per label"
# x axis carries the label names, y axis the counts
xlabel = "image-level labels"
ylabel = "num samples"
fig, ax = plt.subplots(figsize=(6,5))
plot_frequency(ax, study_labels_count, title, xlabel, ylabel, xgap=-0.1)

<a id="section-three-seven"></a>
#### **Class distribution with no bbox**

In [None]:
# For each study-level class, count the train_df rows that have no boxes.
has_no_boxes = train_df['boxes'].isna()
labels_null_boxes_count = {
    label: len(train_df[has_no_boxes & (train_df[label] == 1)])
    for label in study_level_labels
}

fig, ax = plt.subplots(figsize=(6,6))
title = "Number of images with null boxes per study-level labels"
xlabel = "study-level labels"
ylabel = "num images with null boxes"
plot_frequency(ax, labels_null_boxes_count, title, xlabel, ylabel, xgap=-0.1, ygap=20)

<a id="section-three-eight"></a>
#### **Number of boxes for the different classes**

In [None]:
# For every study-level class, count the train_df rows per number of boxes.
# One comprehension handles all classes the same way, so no class can fall
# through to another class's branch.
box_count_range = range(train_image_df['num_boxes'].max()+1)
boxes_count_by_label = {
    label: {
        num_boxes: len(train_df[(train_df['num_boxes'] == num_boxes) & (train_df[label] == 1)])
        for num_boxes in box_count_range
    }
    for label in study_level_labels
}

negatives_boxes_count = boxes_count_by_label[NEGATIVE]
typicals_boxes_count = boxes_count_by_label[TYPICAL]
atypicals_boxes_count = boxes_count_by_label[ATYPICAL]
inderteminates_boxes_count = boxes_count_by_label[INDERTEMINATE]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,10))
plot_frequency(axes[0,0], negatives_boxes_count, "Box conuting for negatives", "num boxes", "num samples", -0.1, 20)
plot_frequency(axes[0,1], typicals_boxes_count, "Box conuting for typicals", "num boxes", "num samples", -0.2, 20)
plot_frequency(axes[1,0], atypicals_boxes_count, "Box conuting for atypicals", "num boxes", "num samples", -0.1, 5)
plot_frequency(axes[1,1], inderteminates_boxes_count, "Box conuting for inderteminates", "num boxes", "num samples", -0.1, 10)

<a id="section-three-nine"></a>
#### **Relation between box size and number of boxes per image**

In [None]:
# Collect the area (width * height) of every box, grouped by how many boxes
# its image has. Distinct names are used for the dict, its keys and its values
# so that no loop rebinds the object it is iterating over.
max_num_boxes = int(train_image_df['num_boxes'].max())
boxes_size = {box_count: [] for box_count in range(1, max_num_boxes+1)}

for index, row in train_image_df.iterrows():
    box_count = int(row['num_boxes'])
    if box_count == 0:
        continue  # coco_boxes is NaN for images without boxes
    for x, y, w, h in row['coco_boxes']:
        boxes_size[box_count].append(w*h)

fig, ax = plt.subplots()
for box_count, sizes in boxes_size.items():
    ax.scatter([box_count]*len(sizes), sizes, label=box_count)

ax.ticklabel_format(style='plain', useOffset=False)
plt.xlabel('num of boxes')
plt.ylabel('sizes')    
plt.legend()
plt.show()

We see that for num_boxes > 3 the boxes are very small, so we treat those images as outliers and remove them

In [None]:
# Keep only images with fewer than 4 boxes. .copy() makes the result an
# independent dataframe, so columns added to it later are not written to a
# slice of the unfiltered frame.
train_df = train_df[train_df['num_boxes'] < 4].copy()

<a id="section-four"></a>
## **DICOM files**

<a id="section-four-one"></a>
#### **View dicom files metadata**

In [None]:
# Positional access: train_df has had rows filtered out without an index reset,
# so the row labelled 0 is not guaranteed to exist any more.
pydicom.dcmread(train_df['dicom_path'].iloc[0])

<a id="section-four-two"></a>
#### **Add metadata and image shape to train df**

In [None]:
def get_img(path):
    """Read the DICOM file at `path` and return its pixels as a uint8 array.

    The VOI LUT stored in the file is applied, MONOCHROME1 images are
    inverted (max - value), and the result is min-max scaled to 0-255.
    A constant image is returned as all zeros.
    """
    data_file = pydicom.dcmread(path)
    img = apply_voi_lut(data_file.pixel_array, data_file)

    if data_file.PhotometricInterpretation == "MONOCHROME1":
        img = np.amax(img) - img

    # Rescale grey levels to 0-255 and convert to uint8
    img = img - np.min(img)
    peak = np.max(img)
    if peak > 0:
        # peak == 0 means every pixel is equal; skip the division to avoid 0/0
        img = img / peak
    img = (img * 255).astype(np.uint8)

    return img

def get_img_id(path):
    """Return the image id: the last '/'-separated component of `path`, up to its first '.'."""
    file_name = path.rpartition('/')[2]
    return file_name.partition('.')[0]

In [None]:
def get_observation_data(path):
    """Read selected header fields of the DICOM file at `path` into a dict.

    Returns a dict with the image id (derived from the file name) and the
    listed DICOM attributes. Attributes read with `.get` are None when the
    file does not contain them; the string columns are stored as `str(value)`
    or None when absent.
    """
    # dcmread is the reader used by the rest of this file (read_file is its
    # deprecated alias). Only header fields are needed here, so stop before
    # the pixel data.
    image_data = pydicom.dcmread(path, stop_before_pixels=True)
    img_id = get_img_id(path)

    # Dictionary to store the information from the image
    observation_data = {
        "img_id": img_id,
        "Rows" : image_data.get("Rows"),
        "Columns" : image_data.get("Columns"),
        "SOPClassUID" : image_data.get("SOPClassUID"),
        "SOPInstanceUID" : image_data.get("SOPInstanceUID"),
        "PatientID" : image_data.get("PatientID"),
        "PatientName" : image_data.get("PatientName"),
        "PatientSex" : image_data.get("PatientSex"),
        "PhotometricInterpretation" : image_data.get("PhotometricInterpretation"),
        "StudyInstanceUID" : image_data.get("StudyInstanceUID"),
        "SamplesPerPixel" : image_data.get("SamplesPerPixel"),
        "BitsAllocated" : image_data.get("BitsAllocated"),
        "BitsStored" : image_data.get("BitsStored"),
        "HighBit" : image_data.get("HighBit"),
        "PixelRepresentation" : image_data.get("PixelRepresentation"),
    }

    # String columns ("PatientSex" is listed again here, so its value above is
    # overwritten with the str()/None form)
    str_columns = ["ImageType", "Modality", "PatientSex", "BodyPartExamined"]
    for i in str_columns:
        observation_data[i] = str(image_data.get(i)) if i in image_data else None

    return observation_data

In [None]:
# Per image: header fields keyed by the row's index label, plus the decoded
# image's shape (in row order).
metadata = {}
shapes = []

for row_label, dicom_path in train_df['dicom_path'].items():
    metadata[row_label] = get_observation_data(dicom_path)
    shapes.append(get_img(dicom_path).shape)

train_df['image_shape'] = shapes

In [None]:
metadata_df = pd.DataFrame(metadata)
# swap the columns with indexes
metadata_df = metadata_df.transpose()

In [None]:
metadata_df.head()

In [None]:
# verify read meta data of all images
train_ids = np.array(train_df['img_id'].values) 
metadata_ids = np.array(metadata_df['img_id'].values)
len(np.setdiff1d(train_ids,metadata_ids))

In [None]:
metadata_df.to_csv(os.path.join(output_path, 'images_metadata.csv'), index=False)
#metadata_df = pd.read_csv(metadata_path)

In [None]:
# merge metadata and train df
train_df = pd.merge(train_df, metadata_df, how='inner', on=['img_id'])
train_df.head()

In [None]:
train_df = train_df.drop(columns=['StudyInstanceUID_y']).rename(columns = {'StudyInstanceUID_x': 'StudyInstanceUID'})
train_df.head()

<a id="section-five"></a>
## **Explore images**

In [None]:
def get_study_label(sample):
    """Return the study-level class name of `sample`.

    Only the first row of `sample` is inspected (`.values[0]`). Classes are
    checked in the order negative, typical, indeterminate; atypical is the
    fallback when none of them equals 1.
    """
    for class_name in (NEGATIVE, TYPICAL, INDERTEMINATE):
        if sample[class_name].values[0] == 1:
            return class_name
    return ATYPICAL

In [None]:
studies_per_imgs_count = {}
imgs_paths = {}
imgs_count = train_df.StudyInstanceUID.value_counts()

# Group study ids by the number of images they contain (2 to 9 images).
for num_images in range(2,10):
    studies_per_imgs_count[num_images] = imgs_count[imgs_count == num_images].index

# For each of those studies, list every file found under its folder in train_path.
for num_images, studies in studies_per_imgs_count.items():
    study_paths = {}
    for study in studies:
        study_paths[study] = [
            os.path.join(root, f)
            for root, d_names, f_names in os.walk(os.path.join(train_path, study))
            for f in f_names
        ]

    imgs_paths[num_images] = study_paths

In [None]:
def show_studies_imgs_by_num_img(df, studies, num_images, figsize):
    """Plot all images of the given studies in a grid, one study per row.

    Bounding boxes of images whose image-level label is not NONE are drawn in
    a colour that depends on the study-level label.

    Args:
        df: dataframe with one row per image; the columns 'img_id',
            'image_level', 'pascal_voc_boxes' and the study-level class
            columns are read.
        studies: mapping of study id -> list of image file paths.
        num_images: number of grid columns.
        figsize: figure size passed to plt.subplots.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single study, so
    # the same [row, col] indexing works for every grid size.
    fig, axes = plt.subplots(nrows=len(studies), ncols=num_images, figsize=figsize, squeeze=False)
    colors = {TYPICAL: (0,0,255), INDERTEMINATE: (0,255,0), ATYPICAL: (255,0,0)} # negatives have no boxes
    print("Typical: "+colored("Blue","blue")+"\nInderteminate: "+colored("Green","green")+"\nAtypical: "+colored("Red", "red"))

    for row, (study, paths) in enumerate(studies.items()):
        for col, path in enumerate(paths):
            img_id = get_img_id(path)
            img = get_img(path)
            # create new RGB image from original
            new_img = np.zeros((img.shape[0], img.shape[1], 3), dtype=img.dtype)
            new_img[:,:,:] = img[:,:,np.newaxis]
            row_df = df[df['img_id'] == img_id]
            study_label = get_study_label(row_df)

            if row_df['image_level'].values[0] != NONE:
                # Draw the boxes of this image (row_df), not those of the
                # first row of the whole dataframe.
                for box in row_df['pascal_voc_boxes'].values[0]:
                    xmin, ymin, xmax, ymax = int(box[0]), int(box[1]), int(box[2]), int(box[3])
                    new_img = cv2.rectangle(new_img,(xmin,ymin),(xmax,ymax),colors[study_label],20)

            ax = axes[row, col]
            ax.set_title("Study: {}\nImage ID: {}\nLabel: {}".format(study, img_id, study_label))
            ax.imshow(new_img)

<a id="section-five-one"></a>
#### **Number of images per study**

In [None]:
# Number of studies for each image count; imgs_paths covers the multi-image studies.
study_count = {num_imgs: len(studies) for num_imgs, studies in imgs_paths.items()}

# Single-image studies are counted per study. len(train_df) counts images, so
# deriving this bucket from it would overstate the number of studies.
study_count[1] = int((train_df['StudyInstanceUID'].value_counts() == 1).sum())

# sort dict by number of images
study_count = dict(sorted(study_count.items()))

title = "Number images per study"
xlabel = "num images"
ylabel = "num studies"
fig, ax = plt.subplots(figsize=(8,6))
plot_frequency(ax, study_count, title, xlabel, ylabel, xgap=0.8)

Let's explore studies with more than 1 image

<a id="section-five-two"></a>
#### **Studies with 3 images**

In [None]:
show_studies_imgs_by_num_img(train_df, imgs_paths[3], 3, (15,100))

We can see that within each study the images are the same, and in the studies that have an image with a bounding box, just one of the images contains it, even though it's the same image. <br>
Remove images without bbox

In [None]:
imgs_paths[3].keys()

In [None]:
studies_without_bbox = ['e764f1cb364c', '0d9709b3af74', '7416b5cbc531']
studies_with_bbox = [study for study in list(imgs_paths[3].keys()) if study not in studies_without_bbox]

print(len(list(imgs_paths[3].keys())) == (len(studies_with_bbox)+len(studies_without_bbox)))

In [None]:
def compare_columns(studies, num_imgs):
    """Print, column by column, the values of the images of each study.

    Rows are read from the global `train_df`. For every study at most the
    first `num_imgs` rows are shown; a study with fewer rows (or none) is
    printed with whatever rows exist instead of failing.

    Args:
        studies: iterable of StudyInstanceUID values.
        num_imgs: maximum number of images to print per study.
    """
    study_imgs = {}

    for study in studies:
        rows = train_df[train_df['StudyInstanceUID']==study]
        study_imgs[study] = rows.to_dict(orient='records')

    for study, samples in study_imgs.items():
        print("\033[1mStudy: {}\n\033[0m".format(study))
        shown = samples[:num_imgs]
        if shown:
            for key in shown[0]:
                print("\033[1m{}:\033[0m".format(key))
                for sample in shown:
                    print(sample[key])
                print()
        print("------------------------------------------------------------------------------------------------------------------------------------------")
    
compare_columns(studies_with_bbox, 3)

We can see that all images of the same study have the same PatientID and PatientName and are the same scan, so we choose to keep only the one image with the bbox.

In [None]:
def drop_imgs_without_bbox(studies, df):
    """Return `df` without the rows of the given studies that have zero boxes.

    Args:
        studies: iterable of StudyInstanceUID values to clean.
        df: dataframe with 'StudyInstanceUID' and 'num_boxes' columns.
    """
    box_free = df['num_boxes'] == 0
    doomed = []
    for study in studies:
        in_study = df['StudyInstanceUID'] == study
        doomed.extend(df.index[in_study & box_free])

    return df.drop(index=doomed)

train_df = drop_imgs_without_bbox(studies_with_bbox, train_df)

Check the differences between images of same study without bounding box

In [None]:
compare_columns(studies_without_bbox, 3)

they are all duplicates, we choose to keep only one of them

In [None]:
def drop_duplicate_imgs(studies, df, num_imgs):
    """Return `df` keeping, per given study, only every `num_imgs`-th row.

    Within each study the rows at positions 0, num_imgs, 2*num_imgs, ... are
    kept and all others are dropped; for a study with exactly `num_imgs` rows
    this keeps its first row only.
    """
    doomed = []
    for study in studies:
        study_index = df.index[df['StudyInstanceUID'] == study]
        doomed.extend(
            label for position, label in enumerate(study_index) if position % num_imgs != 0
        )

    return df.drop(index=doomed)

train_df = drop_duplicate_imgs(studies_without_bbox, train_df, 3)

We can see all columns have same values, except num_boxes

<a id="section-five-three"></a>
#### **Studies with 4 images**

In [None]:
show_studies_imgs_by_num_img(train_df, imgs_paths[4], 4, (20,20))

In [None]:
imgs_paths[4].keys()

In [None]:
studies_without_bbox = ['74ba8f2badcb']
studies_with_bbox = [study for study in list(imgs_paths[4].keys()) if study not in studies_without_bbox]

print(len(list(imgs_paths[4].keys())) == (len(studies_with_bbox)+len(studies_without_bbox)))

In [None]:
compare_columns(studies_with_bbox, 4)

like before, for each study we keep only one image- the one with the bbox

In [None]:
train_df = drop_imgs_without_bbox(studies_with_bbox, train_df)

Check the differences between images of same study without bounding box

In [None]:
compare_columns(studies_without_bbox, 4)

Again, the images are duplicate- keep only one image per study

In [None]:
train_df = drop_duplicate_imgs(studies_without_bbox, train_df, 4)

<a id="section-five-four"></a>
#### **Studies with 5 images**

In [None]:
show_studies_imgs_by_num_img(train_df, imgs_paths[5], 5, (20,20))

In [None]:
imgs_paths[5].keys()

In [None]:
studies_without_bbox = ['a0254bf8a96e']
studies_with_bbox = [study for study in list(imgs_paths[5].keys()) if study not in studies_without_bbox]

print(len(list(imgs_paths[5].keys())) == (len(studies_with_bbox)+len(studies_without_bbox)))

In [None]:
compare_columns(studies_with_bbox, 5)

In [None]:
train_df = drop_imgs_without_bbox(studies_with_bbox, train_df)

In [None]:
compare_columns(studies_without_bbox, 5)

In [None]:
train_df = drop_duplicate_imgs(studies_without_bbox, train_df, 5)

<a id="section-five-five"></a>
#### **Studies with 6 images**

In [None]:
show_studies_imgs_by_num_img(train_df, imgs_paths[6], 6, (20,20))

In [None]:
imgs_paths[6].keys()

In [None]:
studies_with_bbox = ['8943d1d85097']

print(len(list(imgs_paths[6].keys())) == (len(studies_with_bbox)))

In [None]:
compare_columns(studies_with_bbox, 6)

In [None]:
train_df = drop_imgs_without_bbox(studies_with_bbox, train_df)

<a id="section-five-six"></a>
#### **Studies with 7 images**

In [None]:
show_studies_imgs_by_num_img(train_df, imgs_paths[7], 7, (20,20))

In [None]:
imgs_paths[7].keys()

In [None]:
studies_with_bbox = ['a7335b2f9815']

print(len(list(imgs_paths[7].keys())) == (len(studies_with_bbox)))

In [None]:
train_df = drop_imgs_without_bbox(studies_with_bbox, train_df)

<a id="section-five-seven"></a>
#### **Studies with 9 images**

In [None]:
show_studies_imgs_by_num_img(train_df, imgs_paths[9], 9, (40,20))

In [None]:
imgs_paths[9].keys()

In [None]:
studies_with_bbox = ['0fd2db233deb']

print(len(list(imgs_paths[9].keys())) == (len(studies_with_bbox)))

In [None]:
train_df = drop_imgs_without_bbox(studies_with_bbox, train_df)

<a id="section-five-eight"></a>
#### **Studies with 2 images**

There are 206 studies with 2 images. Instead of checking each study, we assume the same pattern for them: the two images are duplicates, and either neither of them has a bbox or only one of them has

In [None]:
# For every study with two images of the same patient, decide which image to drop.
rows_to_drop = []

for study in imgs_paths[2]:
    rows = train_df[train_df['StudyInstanceUID']==study]
    if len(rows) < 2:
        continue  # nothing to compare
    sample1 = rows.iloc[0]
    sample2 = rows.iloc[1]
    if sample1['PatientID'] != sample2['PatientID']:
        continue  # different patients: not treated as duplicates

    if sample1['num_boxes'] == sample2['num_boxes']:
        # same number of boxes (probably no boxes): keep only one of them (sample2)
        rows_to_drop.append(rows.index[0])
    else:
        # keep the image with bounding boxes; if both images have boxes (in
        # different numbers) there is no box-free image and nothing is dropped
        rows_to_drop.extend(rows.index[rows['num_boxes'] == 0])

In [None]:
train_df = train_df.drop(labels=rows_to_drop, axis=0)

Check null boxes after "cleaning" the data

In [None]:
train_df.head()

<a id="section-five-nine"></a>
#### **Box count per class**

In [None]:
# For every study-level class, count the remaining train_df rows per number of
# boxes. One comprehension handles all classes the same way, so no class can
# fall through to another class's branch.
box_count_range = range(train_image_df['num_boxes'].max()+1)
boxes_count_by_label = {
    label: {
        num_boxes: len(train_df[(train_df['num_boxes'] == num_boxes) & (train_df[label] == 1)])
        for num_boxes in box_count_range
    }
    for label in study_level_labels
}

negatives_boxes_count = boxes_count_by_label[NEGATIVE]
typicals_boxes_count = boxes_count_by_label[TYPICAL]
atypicals_boxes_count = boxes_count_by_label[ATYPICAL]
inderteminates_boxes_count = boxes_count_by_label[INDERTEMINATE]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,10))
plot_frequency(axes[0,0], negatives_boxes_count, "Box conuting for negatives", "num boxes", "num samples", -0.1, 20)
plot_frequency(axes[0,1], typicals_boxes_count, "Box conuting for typicals", "num boxes", "num samples", -0.2, 20)
plot_frequency(axes[1,0], atypicals_boxes_count, "Box conuting for atypicals", "num boxes", "num samples", -0.1, 5)
plot_frequency(axes[1,1], inderteminates_boxes_count, "Box conuting for inderteminates", "num boxes", "num samples", -0.1, 10)

In [None]:
# remove outliers for typicals
# Drops every row whose int_label is 1 (the value study_level_labels assigns to
# TYPICAL) and that has fewer than 2 boxes, i.e. 0 or 1 box.
# NOTE(review): this also removes typical images with exactly one box — confirm
# that the threshold is intended.
train_df = train_df.drop(index=train_df[(train_df['int_label']==1)&(train_df['num_boxes']<2)].index)

<a id="section-six"></a>
## **Final train df**

**Drop columns from the df and keep only image id, study id, number of boxes, boxes, labels and image path**

In [None]:
train_df.columns

In [None]:
train_df = train_df.drop(columns=['boxes', 'negative', 'typical', 'indeterminate', 'atypical',
                                  'Rows', 'Columns', 'SOPClassUID','SOPInstanceUID', 'PatientID', 'PatientName', 'PatientSex',
                                  'PhotometricInterpretation', 'SamplesPerPixel', 'BitsAllocated', 'BitsStored', 'HighBit', 
                                  'PixelRepresentation', 'ImageType', 'Modality', 'BodyPartExamined'])

In [None]:
train_df = train_df.rename(columns={'StudyInstanceUID':'study_id'})

In [None]:
train_df.head()

In [None]:
train_df.to_csv(os.path.join(output_path, 'train_df.csv'), index=False)

In [None]:
len(train_df)