-------------------------------------------------------------
**Basic data exploration, with particular attention to checking relationships in the data against our expectations and to finding out how things actually work. The goal is to gain insight and to spot inconsistencies or glitches.**

-------------------------------------------------------------

In [None]:
%%capture
!conda install gdcm -c conda-forge -y

In [None]:
# Packages

import os, time, glob, re, pprint, random, math
from collections import defaultdict, Counter

import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as patches

import seaborn as sns

import cv2
import gdcm

import pydicom
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import apply_voi_lut

import imagehash
from PIL import Image as im

import tqdm

In [None]:
# Magic

%matplotlib inline 

In [None]:
# Pandas options
# Display-only settings for DataFrame output in this notebook: allow the repr
# of wide frames to span several lines, and set the maximum column width.

pd.set_option("display.expand_frame_repr", True)
pd.set_option('display.max_colwidth', 0)

In [None]:
# Global values
# Relative paths used throughout the notebook. The metadata CSV files are
# read from DATA_PATH and the training DICOM files are globbed from
# TRAIN_PATH.

CODE_PATH = "./"
DATA_PATH = "../input/siim-covid19-detection"
TRAIN_PATH = os.path.join(DATA_PATH, "train")
TEST_PATH = os.path.join(DATA_PATH, "test")

In [None]:
# Utility functions

def print_title(s):
    """Print ``s`` on one line and a row of dashes of equal length below it."""

    underline = '-' * len(s)
    print(f"{s}\n{underline}")
    
    
def na_fill(dtype):
    """Return the placeholder used to fill missing values of a column.

    Missing values are replaced so that Seaborn's countplot() includes them,
    and so that the placeholder sorts together with the column's real values.

    Args:
        dtype: The dtype of the column (e.g. ``df[col].dtype``).

    Returns:
        ``"NA"`` for object (string) columns, ``-999`` for every other dtype.
    """

    # Compare against the builtin ``object``: the ``np.object`` alias used
    # previously was deprecated in NumPy 1.20 and removed in 1.24, where
    # merely evaluating it raises AttributeError.
    if dtype == object:
        return "NA"

    return -999


def explode_list_col(df, col):
    """Give every element of the list-valued column ``col`` its own row.

    All other columns are used as the index while the lists are spread out,
    so their values are repeated on each row produced from the same list.

    Args:
        df: DataFrame containing ``col`` and at least one other column.
        col: Name of the column whose cells are lists.

    Returns:
        A new DataFrame with the other columns followed by ``col``, where
        ``col`` now holds a single list element per row.
    """

    key_cols = list(df.columns)
    key_cols.remove(col)

    # After stack() the position of an element inside its list sits in an
    # unnamed index level; reset_index() names it "level_<k>" where k is the
    # number of named key levels before it.
    position_col = f"level_{len(key_cols)}"

    return (
        df.set_index(key_cols)[col]
        .apply(pd.Series)
        .stack()
        .reset_index()
        .drop(position_col, axis=1)
        .rename(columns={0: col})
    )
    

def cat_order(train_df, test_df, cols):
    """Build one sorted category list per column, shared by train and test.

    For comparison plots the categories must be the same, and in the same
    order, for both data sets. Missing values are replaced through na_fill()
    so that they are consistent with the column's data type (otherwise the
    sort would complain) and so that Seaborn countplot() includes them (it
    does not include np.nan). Data that is plotted against these categories
    needs to be run through the same fill.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        cols: Column names to process. A column may be present in only one
            of the two frames, or in neither.

    Returns:
        Dict mapping column name to the sorted list of distinct (filled)
        values found in either frame. If a column has different dtypes in the
        two frames, a message is printed and processing stops; the dict then
        holds only the columns handled before that point.
    """

    category_order = {}

    for col in cols:

        # Membership is tested against each frame's own columns.
        in_train = col in train_df.columns
        in_test = col in test_df.columns

        train_dtype = train_df[col].dtype if in_train else None
        test_dtype = test_df[col].dtype if in_test else None

        if (train_dtype is not None) and (test_dtype is not None) and (train_dtype != test_dtype):
            print(f"***Incompatible train/test data types for column {col}")
            break

        col_vals = set()
        if in_train:
            col_vals.update(train_df[col].fillna(na_fill(train_dtype)).unique())
        if in_test:
            col_vals.update(test_df[col].fillna(na_fill(test_dtype)).unique())

        category_order[col] = sorted(col_vals)

    return category_order


def plot_categories(df, col, order, title, ax, xlabel=None, ylabel=None, pallete="Set3", scale="log"):
    """Draw a count plot of ``df[col]`` with each bar annotated by its count.

    Missing values are filled via na_fill() before plotting so they get a bar
    of their own.

    Args:
        df: DataFrame holding the data.
        col: Column to count.
        order: Category values, in the order the bars should appear.
        title: Plot title.
        ax: Matplotlib axes to draw on.
        xlabel: Optional x-axis label.
        ylabel: Optional y-axis label.
        pallete: Seaborn palette name passed to countplot().
        scale: y-axis scale; "log" starts the axis at 0.5, anything else at 0.
    """

    column = df[col]
    filled = column.fillna(na_fill(column.dtype))

    ax = sns.countplot(x=filled, order=order, palette=pallete, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
    ax.set_yscale(scale)
    # A log axis cannot start at zero.
    ax.set_ylim(bottom=0.5 if scale == "log" else 0)

    # Bars are matched to categories by their position in ``order``.
    value_counts = filled.value_counts()
    for position, bar in enumerate(ax.patches):
        label_xy = (bar.get_x() + 0.05, max(bar.get_height() * 0.9, 1.0))
        ax.annotate(value_counts.get(order[position], ""), label_xy, fontsize=10, rotation=90)

    ax.set_title(title, fontsize=15)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    return


def plot_cat_sub(df, col, sub_col, title, ax, scale="log"):
    """Draw a count plot of ``col`` with the bars split by ``sub_col``.

    Args:
        df: DataFrame holding the data.
        col: Column shown on the x axis.
        sub_col: Column used as the hue (one bar per value within each group).
        title: Plot title.
        ax: Matplotlib axes to draw on.
        scale: y-axis scale; "log" starts the axis at 0.5, anything else at 0.
    """

    ax = sns.countplot(data=df, x=col, hue=sub_col, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
    ax.set_yscale(scale)
    # A log axis cannot start at zero.
    ax.set_ylim(bottom=0.5 if scale == "log" else 0)
    ax.set_title(title, fontsize=15)

    return


def plot_number_uniq_per(df, df_name, attr1, attr2, ax, title=None, palette=None, scale="log"):
    """Bar-plot, for each ``attr1`` value, how many distinct ``attr2`` values it has.

    Args:
        df: DataFrame holding the data.
        df_name: Name shown in the default title.
        attr1: Grouping column (x axis).
        attr2: Column whose distinct values are counted (y axis).
        ax: Matplotlib axes to draw on.
        title: Optional title; a descriptive default is used when omitted.
        palette: Seaborn palette passed to barplot().
        scale: y-axis scale; "log" starts the axis at 0.5, anything else at 0.

    Returns:
        DataFrame with one row per ``attr1`` value and the distinct count in
        column ``attr2``.
    """

    uniq_df = df.groupby(attr1)[[attr2]].nunique().reset_index()

    ax = sns.barplot(x=attr1, y=attr2, data=uniq_df, ax=ax, palette=palette)
    ax.set_yscale(scale)
    # A log axis cannot start at zero.
    ax.set_ylim(bottom=0.5 if scale == "log" else 0)

    # Bars are matched to the rows of uniq_df by position.
    counts = uniq_df[attr2]
    for position, bar in enumerate(ax.patches):
        label_xy = (bar.get_x() + 0.05, max(bar.get_height() * 0.9, 1.0))
        ax.annotate(counts.iloc[position], label_xy, fontsize=10, rotation=90)

    if not title:
        title = f"{df_name}\nNumber of unique '{attr2}' values \nper '{attr1}' value"
    ax.set_title(title, fontsize=15)
    ax.set_xlabel(attr1)
    ax.set_ylabel(f"# unique '{attr2}'")

    return uniq_df


def plot_number_having_n_uniq(df, df_name, attr1, attr2, ax, order=None, palette=None, scale="log"):
    """Count-plot how many ``attr1`` values have N distinct ``attr2`` values.

    Args:
        df: DataFrame holding the data.
        df_name: Name shown in the title.
        attr1: Grouping column.
        attr2: Column whose distinct values are counted per group.
        ax: Matplotlib axes to draw on.
        order: Optional order of the N values on the x axis.
        palette: Seaborn palette passed to countplot().
        scale: y-axis scale; "log" starts the axis at 0.5, anything else at 0.

    Returns:
        Series indexed by ``attr1`` giving the number of distinct ``attr2``
        values for each group.
    """

    nuniq = df.groupby(attr1)[attr2].nunique()

    ax = sns.countplot(x=nuniq, order=order, ax=ax, palette=palette)
    ax.set_yscale(scale)
    # A log axis cannot start at zero.
    ax.set_ylim(bottom=0.5 if scale == "log" else 0)

    heading = f"{df_name}\nNumber of '{attr1}' values \nhaving N unique values of '{attr2}'"
    ax.set_title(heading, fontsize=15)
    ax.set_xlabel(f"N = # of unique '{attr2}' values")
    ax.set_ylabel(f"# of '{attr1}' having N")

    return nuniq


def get_img(path, img_id):
    """Load ``<path>/<img_id>.jpg`` as an RGB image rescaled to 0..255.

    Args:
        path: Directory containing the JPEG file.
        img_id: File name without the ``.jpg`` extension.

    Returns:
        The image array after BGR->RGB conversion and min/max normalisation
        to the range 0..255.

    Raises:
        OSError: If the file is missing or cannot be decoded.
    """

    img_path = os.path.join(path, f"{img_id}.jpg")

    # cv2.imread() signals failure by returning None instead of raising;
    # fail here with the offending path rather than inside cvtColor().
    img = cv2.imread(img_path)
    if img is None:
        raise OSError(f"Could not read image file: {img_path}")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)

    return img


def image_subplots(images, r, c, axes, titles=None):
    """Show ``images`` on a grid of axes, filling it row by row.

    Args:
        images: Iterable of images accepted by imshow().
        r: Number of grid rows (not used by the placement logic).
        c: Number of grid columns.
        axes: 2-D array of axes, indexed as ``axes[row, column]``.
        titles: Optional sequence of titles, one per image.
    """

    for idx, img in enumerate(images):
        row, column = divmod(idx, c)
        if titles:
            imshow(img, axes[row, column], title=titles[idx])
        else:
            imshow(img, axes[row, column])

    return
        

def imshow(image, ax=None, title=None, cmap=None):
    """Show ``image`` on ``ax`` without frame, ticks or tick labels.

    Args:
        image: Image data accepted by ``Axes.imshow``.
        ax: Axes to draw on; a new figure and axes are created when omitted.
        title: Optional title for the axes.
        cmap: Optional colormap passed to ``Axes.imshow``.

    Returns:
        The axes the image was drawn on.
    """

    if ax is None:
        _, ax = plt.subplots()

    ax.imshow(image, cmap=cmap)

    # Hide the surrounding frame.
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # Hide tick marks and tick labels.
    ax.tick_params(axis='both', length=0)
    ax.set_xticklabels('')
    ax.set_yticklabels('')

    if title:
        ax.set_title(title)

    return ax


----
### Training Data

The train dataset comprises 6,334 chest scans in DICOM format, which were de-identified to protect patient privacy. All images were labeled by a panel of experienced radiologists for the presence of opacities as well as overall appearance.

Note that all images are stored in paths with the form `study/series/image`. The study ID here relates directly to the study-level predictions, and the image ID is the ID used for image-level predictions.

----

In [None]:
# Read the image metadata
# Two CSV files under DATA_PATH: one table with a row per image and one
# with a row per study.

train_image_df = pd.read_csv(os.path.join(DATA_PATH,'train_image_level.csv'))
train_study_df =  pd.read_csv(os.path.join(DATA_PATH,'train_study_level.csv'))

----
`train_study_level.csv`

- **id** --- unique study identifier  
- **Negative for Pneumonia** --- 1 if the study is negative for pneumonia, 0 otherwise  
- **Typical Appearance** --- 1 if the study has this appearance, 0 otherwise  
- **Indeterminate Appearance**  --- 1 if the study has this appearance, 0 otherwise  
- **Atypical Appearance**  --- 1 if the study has this appearance, 0 otherwise  

----

In [None]:
# Preview the first rows of the study-level table.
train_study_df.head()

In [None]:
# Keep only the part of each study id that precedes the first underscore,
# and carry the four determination columns over unchanged.
determination_cols = ["Negative for Pneumonia", "Typical Appearance",
                      "Indeterminate Appearance", "Atypical Appearance"]

id_col = train_study_df["id"].apply(lambda raw_id: raw_id.split('_')[0])
other_cols = train_study_df[determination_cols]
train_study_df = pd.concat([id_col, other_cols], axis=1)
train_study_df.head()

----
`train_image_level.csv`

- **id** --- unique image identifier  
- **boxes** --- bounding boxes in easily-readable dictionary format  
- **label** --- the correct prediction label for the provided bounding boxes  
----

In [None]:
# Preview the first rows of the image-level table.
train_image_df.head()

In [None]:
# Get the image and study ID's from the file paths.
# Files are laid out as <TRAIN_PATH>/<study>/<series>/<image>.dcm.

image_paths = glob.glob(os.path.join(TRAIN_PATH, "*/*/*"))

# TRAIN_PATH is escaped so that characters such as '.' and '-' in it are
# matched literally, and the extension dot is escaped for the same reason.
p = re.compile(os.path.join(re.escape(TRAIN_PATH), r"([^/]*)/([^/]*)/([^.]*)(\.dcm)"))

study_ids = []
image_ids = []
id_dict = defaultdict(list)     # study id -> [(image id, file path), ...]
for ip in image_paths:
    m = p.match(ip)
    if m is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f"unexpected file in training tree: {ip}")
    study_ids.append(m.group(1))
    image_ids.append(m.group(3))
    id_dict[m.group(1)].append((m.group(3), ip))

unique_image_ids = set(image_ids)
if len(image_ids) != len(unique_image_ids):
    print("there are duplicate image IDs")
else:
    print("there are no duplicate image IDs")

print(f"number of image IDs: {len(unique_image_ids)}")
print(f"number of studies: {len(set(study_ids))}")

In [None]:
# Check that the metadata is consistent with the file path derived study and image ID's.

def _report_id_mismatch(headline, first_ids, first_name, second_ids, second_name):
    # Print how many IDs are exclusive to each side when the two ID sets differ.
    first, second = set(first_ids), set(second_ids)
    if first == second:
        return
    print(headline)
    print(f"There are {len(first - second)} extra {first_name}")
    print(f"There are {len(second - first)} extra {second_name}")
    print()


# Metadata ids are compared on the part before the first underscore.
meta_image_ids = [raw_id.split('_')[0] for raw_id in train_image_df["id"]]
meta_study_ids = [raw_id.split('_')[0] for raw_id in train_study_df["id"]]
meta_study_instance_uids = list(set(train_image_df["StudyInstanceUID"]))

print(f"there are {len(meta_image_ids)} metadata image IDs")
print(f"there are {len(meta_study_ids)} metadata study IDs")
print(f"there are {len(meta_study_instance_uids)} metadata StudyInstanceUIDs")
print()

if (len(meta_image_ids) != len(set(meta_image_ids))):
    print("there are duplicate metadata image IDs")

_report_id_mismatch("metadata image IDs mismatch file path image IDs",
                    meta_image_ids, "metadata image IDs",
                    image_ids, "file path image IDs")

_report_id_mismatch("metadata study IDs mismatch file path study IDs",
                    meta_study_ids, "metadata study IDs",
                    study_ids, "file path study IDs")

_report_id_mismatch("metadata StudyInstanceUIDs mismatch file path study IDs",
                    meta_study_instance_uids, "metadata study instance UIDs",
                    study_ids, "file path study IDs")

_report_id_mismatch("metadata StudyInstanceUIDs mismatch metadata study IDs",
                    meta_study_instance_uids, "metadata StudyInstanceUIDs",
                    meta_study_ids, "metadata study IDs")

# Check consistency of metadata mapping between studies and images with file mapping.
# The image -> study lookup is built once (first row wins for a repeated id)
# instead of scanning the whole table for every image.
study_by_meta_image_id = (
    train_image_df.drop_duplicates("id").set_index("id")["StudyInstanceUID"].to_dict()
)

incon = False
for study_id, image_info in id_dict.items():
    for (image_id, _) in image_info:
        meta_image_id = image_id + "_image"
        # An image that is absent from the metadata yields None here and is
        # reported as an inconsistency below.
        meta_study_id = study_by_meta_image_id.get(meta_image_id)
        if study_id != meta_study_id:
            incon = True
            print("metadata ID mapping does not match file mapping")
            print(image_id, study_id, meta_study_id)
if (not incon):
    print("study/image ID relationships are consistent between metadata and file paths")

In [None]:
# Get all the field names in the DICOM data.
# Every file is read, since a field may be present in only some of them.

dicom_field_names = set()
for ip in image_paths:
    dicom_dataset = pydicom.dcmread(ip)
    dicom_field_names.update(element.name for element in dicom_dataset)

In [None]:
# Turn the collected names into a list and pretty-print them.
dicom_field_names = list(dicom_field_names)
pprint.pprint(dicom_field_names)

In [None]:
# Display typical DICOM dataset content.
# The first file returned by the glob is used as the example; each data
# element of the dataset is printed on its own line.

image_file_path = image_paths[0]
dicom_dataset = pydicom.dcmread(image_file_path)
print_title(image_file_path)
for dfld in dicom_dataset:
    print(dfld)

In [None]:
# Skip fields not wanted in the DICOM dataframe.
# Entries are DICOM element names, compared against each element's .name.

skip_fields = ["Specific Character Set", 
               "De-identification Method", 
               "De-identification Method Code Sequence",
               "Pixel Data"]

In [None]:
# Import DICOM data into dataframe.
# One row per file: the three IDs taken from the path, followed by one
# column per DICOM field name (the field value as a string, or NaN when the
# file does not have that field).

# TRAIN_PATH is escaped so that characters such as '.' and '-' in it are
# matched literally, and the extension dot is escaped for the same reason.
p = re.compile(os.path.join(re.escape(TRAIN_PATH), r"([^/]*)/([^/]*)/([^.]*)(\.dcm)"))

# The set of wanted fields is the same for every file, so filter it once.
wanted_field_names = [name for name in dicom_field_names if name not in skip_fields]

data_dict = defaultdict(list)

for ip in image_paths:
    m = p.match(ip)
    study_id = m.group(1)
    series_id = m.group(2)
    image_id = m.group(3)
    dicom_dataset = pydicom.dcmread(ip)
    file_fields = {dfld.name: dfld for dfld in dicom_dataset}
    data_dict["study_id"].append(study_id)
    data_dict["series_id"].append(series_id)
    data_dict["image_id"].append(image_id)
    for dfld_name in wanted_field_names:
        dfld = file_fields.get(dfld_name, None)
        if dfld:
            data_dict[dfld_name].append(str(dfld.value))
        else:
            # Keep every column the same length as the ID columns.
            data_dict[dfld_name].append(np.nan)

dicom_df = pd.DataFrame(data_dict)

In [None]:
# Report the number of rows in the DICOM dataframe and preview it.
print(f"there are {len(dicom_df)} DICOM rows")
dicom_df.head()

In [None]:
# DICOM attribute value summary
# Transposed so that each DICOM attribute is a row of the summary.

dicom_df.describe().transpose()

In [None]:
# Test consistency of ID fields
# Each printed set holds the distinct outcomes of comparing a path-derived
# ID column with the corresponding DICOM UID column, row by row.

id_column_pairs = [
    ("study_id", "Study Instance UID"),
    ("series_id", "Series Instance UID"),
    ("image_id", "SOP Instance UID"),
]
for path_col, dicom_col in id_column_pairs:
    print(set(dicom_df[path_col] == dicom_df[dicom_col]))

In [None]:
# Display info on NA values
# describe() is applied to the boolean is-missing mask, one summary row per
# column after transposing.

dicom_df.isna().describe().transpose()

In [None]:
# Display the unique values and their counts for fields with <100 unique values
# The counters of the displayed fields are also kept in unique_col_vals.

unique_col_vals = {}
for col in dicom_df.columns:
    colcount = Counter(list(dicom_df[col]))
    if len(colcount) >= 100:
        continue
    unique_col_vals[col] = colcount
    print_title(f"{col}:")
    # most_common() lists the values from most to least frequent.
    for val, count in colcount.most_common():
        print(f"\t{count}:\t{val}")
    print()

In [None]:
# Show some key relationships among DICOM features

# (attr1, attr2) pairs, listed in the order their panels fill the grid
# (left to right, then top to bottom).
relationship_pairs = [
    ("Patient ID", "Patient's Sex"),
    ("Patient ID", "Patient's Name"),
    ("Patient ID", "study_id"),
    ("study_id", "Study Date"),
    ("study_id", "series_id"),
    ("Patient ID", "series_id"),
    ("series_id", "image_id"),
    ("study_id", "image_id"),
    ("Patient ID", "image_id"),
    ("study_id", "Patient ID"),
    ("study_id", "Imager Pixel Spacing"),
    ("study_id", "Private Creator"),
    ("study_id", "Image Type"),
    ("study_id", "Photometric Interpretation"),
    ("study_id", "Modality"),
    ("study_id", "Body Part Examined"),
]

fig, ax = plt.subplots(8, 2, figsize=(15, 25))

for panel, (attr1, attr2) in enumerate(relationship_pairs):
    _ = plot_number_having_n_uniq(dicom_df, "", attr1, attr2, ax[panel // 2, panel % 2], palette="Set3")

fig.tight_layout()

In [None]:
# Join the image and study metadata on study ID
# merge() defaults to an inner join, so only images whose StudyInstanceUID
# has a matching study row are kept. Both tables have an "id" column; in
# the result the image id is "id_x" and the study id is "id_y".

image_study_df = train_image_df.merge(train_study_df, left_on="StudyInstanceUID", right_on="id")
image_study_df.head()

In [None]:
# Check that each study has one and only one determination.
# image_study_df has one row per image, so offending rows are reduced to
# their distinct StudyInstanceUID values before being counted as studies.

determination_total = (image_study_df["Negative for Pneumonia"] + image_study_df["Typical Appearance"] +
                       image_study_df["Indeterminate Appearance"] + image_study_df["Atypical Appearance"])

idx1 = image_study_df[determination_total > 1].index
multi_determ_studies = image_study_df.loc[idx1, "StudyInstanceUID"].nunique()
# Any offending study counts, including a single one.
if multi_determ_studies > 0:
    print(f"{multi_determ_studies} studies have multiple determinations")
else:
    print("No studies have multiple determinations")

idx2 = image_study_df[determination_total < 1].index
zero_determ_studies = image_study_df.loc[idx2, "StudyInstanceUID"].nunique()
if zero_determ_studies > 0:
    print(f"{zero_determ_studies} studies have zero determinations")
else:
    print("No studies have zero determinations")

In [None]:
# Combine image and study metadata and massage the data a bit

# One "opacity 1" entry followed by four whitespace-separated numeric fields.
# Raw string: "\s", "\d", "\-" and "\." are not valid escapes in an ordinary
# string literal and trigger a warning there.
label_re = re.compile(r"(opacity 1(\s(\-){0,1}\d*(\.){0,1}\d*){4})")

def reformat_label(old_label):
    """Split a label string into one token list per "opacity 1 ..." entry.

    Args:
        old_label: Label text as stored in the image-level metadata.

    Returns:
        List with one item per entry matched by ``label_re``; each item is
        that entry split on whitespace. Empty when nothing matches.
    """
    # findall() returns one tuple per match; element 0 is the outermost
    # group, i.e. the whole entry.
    new_label = [op[0].split() for op in label_re.findall(old_label)]
    return new_label

def reformat_boxes(old_boxes):
    """Convert the text form of a box list into ``(x, y, width, height)`` tuples.

    Args:
        old_boxes: String holding a Python-literal list of dicts with the
            keys "x", "y", "width" and "height".

    Returns:
        List of ``(x, y, width, height)`` tuples, one per box.

    Raises:
        ValueError or SyntaxError: If the text is not a plain Python literal.
    """
    import ast

    # The text comes from a CSV file. literal_eval() accepts literals only,
    # so unlike eval() it cannot execute code embedded in the data.
    new_boxes = [(box["x"], box["y"], box["width"], box["height"]) for box in ast.literal_eval(old_boxes)]
    return new_boxes

def determ_string(s):
    """Return the short name of the first determination flag that equals 1.

    Args:
        s: The four determination flags in the order negative, typical,
            indeterminate, atypical. May be a list, tuple or a pandas Series
            (for example a row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        "NegPnu", "Typ", "Indeterm" or "ATyp" for the first flag equal to 1,
        or "none" when no flag is set.
    """
    # Iterate over the values instead of indexing with s[0]..s[3]: integer
    # keys on a Series with string labels rely on a positional fallback that
    # pandas has deprecated.
    names = ("NegPnu", "Typ", "Indeterm", "ATyp")
    for flag, name in zip(s, names):
        if flag == 1:
            return name
    return "none"

# Derived columns. Missing boxes/labels are treated as the text '[]', which
# parses to an empty list in both reformat functions.
id_col = image_study_df["id_x"].apply(lambda s: s.split('_')[0]).rename("image_id")
box_col = image_study_df["boxes"].fillna('[]').rename("new_boxes").apply(reformat_boxes)
# Count the boxes that were already parsed rather than evaluating the raw
# text a second time.
box_count_col = box_col.apply(len).rename("box_count")
new_label_col = image_study_df["label"].fillna('[]').apply(reformat_label).rename("new_label")
label_count_col = new_label_col.apply(len).rename("label_count")
determ_col = image_study_df[["Negative for Pneumonia", 
                             "Typical Appearance",
                             "Indeterminate Appearance",
                             "Atypical Appearance"]].apply(determ_string, axis=1).rename("determ")

other_cols = image_study_df[["boxes", "label", "StudyInstanceUID", "Negative for Pneumonia", 
                             "Typical Appearance", "Indeterminate Appearance", "Atypical Appearance"]]
image_study_df = pd.concat([id_col, box_col, box_count_col, new_label_col, label_count_col, determ_col, 
                            other_cols], axis=1)

# Put each derived column next to the raw column it came from.
image_study_df = image_study_df[["image_id", "boxes", "new_boxes", "box_count", "label", "new_label", 
                                 "label_count", "determ", "StudyInstanceUID", 
                                "Negative for Pneumonia", "Typical Appearance", 
                                "Indeterminate Appearance", "Atypical Appearance"]]

image_study_df = image_study_df.rename(columns={"StudyInstanceUID": "study_id"})
image_study_df.head()

In [None]:
# Check that each image has the same number of boxes and labels.

count_differs = image_study_df["box_count"].ne(image_study_df["label_count"])
idx = image_study_df.index[count_differs]

if idx.empty:
    print("No studies have box/label mismatch")
else:
    print(f"{len(idx)} box/label mismatch found")

In [None]:
# Check that the label regex we used hasn't thrown out any labels (and so our regex accurately captures the label syntax)
# The parsed tokens are joined back into a single string, which must equal
# the original label. The only accepted difference is the label
# "none 1 0 0 1 1", for which the parsed form is empty.

join_label = image_study_df["new_label"].rename("join_label").apply(lambda ll: ' '.join([' '.join(l) for l in ll]))
temp_df = pd.concat([image_study_df["label"], join_label], axis=1)
idx = temp_df[~((temp_df["label"] == temp_df["join_label"]) | 
              ((temp_df["label"] == "none 1 0 0 1 1") & (temp_df["join_label"] == "")))].index

if (len(idx) > 0): 
    print(f"{len(idx)} missing labels found")
else:
    print("No missing labels")

In [None]:
# Remove unneeded columns from this dataframe
# The parsed "new_boxes" column takes over the name "boxes"; the raw text
# columns, the label columns and the four determination flags are dropped.

image_study_df = image_study_df[["image_id", "new_boxes", "box_count",
                                 "determ", "study_id"]].rename(columns={"new_boxes" : "boxes"})
image_study_df.head()

In [None]:
# Join in some patient and image features
# Selected DICOM columns are attached to each image row by matching on
# "image_id" (merge() defaults to an inner join).

join_in_df = dicom_df[["image_id", "Patient ID", "Patient's Sex", "Patient's Name", 
                        "Body Part Examined", "Rows", "Columns"]]
    
image_study_patient_df = image_study_df.merge(join_in_df, on="image_id")

image_study_patient_df.head()

In [None]:
# Check the number of images, number of studies per determination
# Left panel: distinct images per determination; right panel: distinct
# studies per determination.

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

_ = plot_number_uniq_per(image_study_patient_df, "", "determ", "image_id", ax[0], palette="Set3", scale="linear")
_ = plot_number_uniq_per(image_study_patient_df, "", "determ", "study_id", ax[1], palette="Set3", scale="linear")

fig.tight_layout()

In [None]:
# Within each determination, how many images have a given number of boxes.
# Panels are filled column by column: the first two determinations go into
# the left column, the last two into the right column.

fig, ax = plt.subplots(2, 2, figsize=(15, 8))
for i, determ in enumerate(["Typ", "NegPnu", "ATyp", "Indeterm"]):
    grid_col, grid_row = divmod(i, 2)
    has_determ = image_study_patient_df["determ"] == determ
    df = image_study_patient_df[has_determ]
    plot_categories(
        df,
        "box_count",
        range(0, 5),
        f"{determ} # images with box_count",
        ax[grid_row, grid_col],
        ylabel="image count",
        scale="linear",
    )
fig.tight_layout()

In [None]:
# Number of patients having a given number of determinations.
# Number of studies having a given number of determinations.
# "Number of determinations" means the number of distinct "determ" values
# found among the rows of that patient (left) or study (right).

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

_ = plot_number_having_n_uniq(image_study_patient_df, "", "Patient ID", "determ", ax[0], 
                              palette="Set3", scale="linear")
_ = plot_number_having_n_uniq(image_study_patient_df, "", "study_id", "determ", ax[1], 
                              palette="Set3", scale="linear")
fig.tight_layout()

In [None]:
# Distributions of determinations by gender. Number of studies per gender.
# The first two panels show, for "M" and "F" separately, the number of
# distinct studies per determination. The third panel shows the number of
# distinct studies for every value of "Patient's Sex" in the data.

fig, ax = plt.subplots(3, 1, figsize=(8, 10))
for i, sex in enumerate(["M", "F"]):
    df = image_study_patient_df[image_study_patient_df["Patient's Sex"] == sex]
    
    _ = plot_number_uniq_per(df, "", "determ", "study_id", ax[i], 
                         title=f"Patient's Sex={sex}:\n # studies with given determination", palette="Set3", 
                         scale="linear")

_ = plot_number_uniq_per(image_study_patient_df, "", "Patient's Sex", "study_id", ax[2], 
                         title="# studies per gender", palette="Set3", scale="linear")    

fig.tight_layout()

In [None]:
# Explode the box lists into separate rows
# After this step the "boxes" column holds a single (x, y, width, height)
# tuple per row.

image_study_patient_boxes_df = explode_list_col(image_study_patient_df, "boxes")
image_study_patient_boxes_df.head()

In [None]:
# Check that all boxes in the metadata are valid

def valid_boxes(s):
    """Return True when the box described by ``s`` is NOT valid.

    Despite its name the function returns the *invalid* flag, so that it can
    be used directly as a mask selecting the offending rows.

    Args:
        s: Three values in the order box, rows, columns, where box is an
            ``(x, y, width, height)`` tuple and rows/columns are the image
            height/width (numbers or numeric strings). May be a list, tuple
            or a pandas Series (for example a row passed by
            ``DataFrame.apply(axis=1)``).

    Returns:
        False when the box has positive width and height and lies entirely
        inside the image, True otherwise.
    """
    # Take the values by position via list(): integer keys on a Series with
    # string labels rely on a positional fallback that pandas has deprecated.
    box, rows, cols = list(s)[:3]
    (x1, y1, w, h) = box
    x2 = x1 + w
    y2 = y1 + h
    r = float(rows)
    c = float(cols)

    inside_image = (0 <= x1 <= c) and (0 <= y1 <= r) and (0 <= x2 <= c) and (0 <= y2 <= r)
    invalid = not ((w > 0) and (h > 0) and inside_image)
    return invalid
    
# Show the rows flagged by valid_boxes(), i.e. the rows for which it
# returns True.
image_study_patient_boxes_df[
    image_study_patient_boxes_df[["boxes", "Rows", "Columns"]].apply(valid_boxes, axis=1)]

In [None]:
# Plot the image sizes
# "Rows" and "Columns" are stored as strings in dicom_df, so they are cast
# to int before plotting one point per image.

fig, ax = plt.subplots(1, 1, figsize=(15, 15))
ax.grid(True)

size_df = dicom_df.loc[:, ["Rows", "Columns"]].astype({"Rows": "int", "Columns": "int"})

sns.scatterplot(x="Rows", y="Columns", palette="pastel", data=size_df, ax=ax)

In [None]:
# Function to display images for a patient

def display_patient_images(patient_id, study_ids, image_boxes, study_determ):
    """Show every image of the given studies in a two-column grid.

    Each image is read from its DICOM file, contrast-enhanced, and drawn
    with its bounding boxes in red and a blue frame of the size recorded in
    the DICOM header.

    Args:
        patient_id: Patient identifier; only used in the panel titles.
        study_ids: Iterable of study IDs whose files under TRAIN_PATH are shown.
        image_boxes: Mapping from image ID to a list of (x, y, w, h) boxes.
            It is indexed with every image found on disk for the studies.
        study_determ: Mapping from study ID to the determination shown in
            the panel title.
    """
    
    # Collect every file stored two directory levels below each study folder.
    image_paths = []
    for study_id in study_ids:
        image_paths += glob.glob(os.path.join(TRAIN_PATH, f"{study_id}/*/*"))
        
    p = re.compile(os.path.join(TRAIN_PATH, "([^/]*)/([^/]*)/([^/]*)"))

    # One tuple per image: IDs parsed from the path plus the boxes and
    # determination looked up for it.
    image_data = []
    for ip in image_paths:
        m = p.match(ip)
        study_id = m.group(1)
        series_id = m.group(2)
        # The image ID is the file name up to the first '.'.
        image_id = m.group(3).split('.')[0]
        image_data.append((study_id, series_id, image_id, ip, image_boxes[image_id], study_determ[study_id]))

    num_images = len(image_data)
    nrows = int(math.ceil(num_images/2))
    # squeeze=False keeps axs two-dimensional even when there is one row.
    fig, axs = plt.subplots(nrows, 2, figsize=(15, nrows*8), squeeze=False)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    i = 0
    for (study_id, series_id, image_id, image_path, boxes, determ) in image_data:
        dicom_dataset = pydicom.dcmread(image_path)
        image = apply_voi_lut(dicom_dataset.pixel_array, dicom_dataset)
        pr = dicom_dataset.data_element('PhotometricInterpretation').value
        if pr == "MONOCHROME1":
            # Invert the intensities. The constant used does not matter
            # because the image is rescaled by its min and max just below.
            image = 255.0 - image
        # Rescale to the range 0..255 and convert to 8-bit.
        image = image - np.min(image)
        image = (image/np.max(image)) * 255.0
        image = image.astype('uint8')
        
        rows = dicom_dataset.data_element('Rows').value
        cols = dicom_dataset.data_element('Columns').value
        
        cl1 = clahe.apply(image)

        title = f"Patient {patient_id}\nStudy {study_id}\nDetermination {determ}\n" + \
                f"Study Date:{dicom_dataset.data_element('StudyDate').value}"
        ax = axs[int((i/2)), i%2]
        ax.set_title(title)
        imshow(cl1, cmap='gray', ax=ax)
        for (x, y, w, h) in boxes:
            ax.add_patch(patches.Rectangle(xy=(x, y), width=w, height=h, linewidth=1, color='red', fill=False))
        # Blue frame with the Columns x Rows size from the DICOM header.
        ax.add_patch(patches.Rectangle(xy=(0, 0), width=cols, height=rows, linewidth=4, color='blue', fill=False))
        i += 1
    # With an odd number of images the last grid cell stays unused; hide it.
    if (num_images % 2 != 0):
        axs[int((i - 1)/2), 1].set_visible(False)
            
    fig.tight_layout()


In [None]:
# Explore patient image sets. Rerun cell to sample patients.
# A row is drawn at random, so a patient with more images is more likely to
# be picked.

sampled_row = random.randrange(len(image_study_patient_df))
patient_id = image_study_patient_df.iloc[sampled_row]["Patient ID"]
print(f"Patient ID = {patient_id}")

is_patient = image_study_patient_df["Patient ID"] == patient_id
study_image_boxes = image_study_patient_df.loc[is_patient, ["study_id", "image_id", "determ", "boxes"]]

study_ids = study_image_boxes["study_id"].unique()
# Lookups handed to the display function: image -> boxes, study -> determination.
image_boxes = dict(zip(study_image_boxes["image_id"], study_image_boxes["boxes"]))
study_determ = dict(zip(study_image_boxes["study_id"], study_image_boxes["determ"]))


print(f"Number of studies: {len(study_ids)}")
print(f"Number of images: {len(image_boxes)}")

display_patient_images(patient_id, study_ids, image_boxes, study_determ)

In [None]:
# Get the studies with multiple images
# Count the distinct images per (patient, study) pair and keep the pairs
# that have more than one.

uniq_df = image_study_patient_df.groupby(["Patient ID", "study_id"])[["image_id"]].nunique().reset_index()
multi_image_studies_df = uniq_df[uniq_df["image_id"] > 1]
multi_image_studies_df.head()

In [None]:
# Explore studies that have multiple images. Rerun the cell to sample studies with multiple images.

sampled = multi_image_studies_df.iloc[random.randrange(len(multi_image_studies_df))]
patient_id = sampled["Patient ID"]
study_id = sampled["study_id"]
print(f"Patient ID = {patient_id}")
print(f"study_id = {study_id}")

in_sampled_study = ((image_study_patient_df["Patient ID"] == patient_id) &
                    (image_study_patient_df["study_id"] == study_id))
study_image_boxes = image_study_patient_df.loc[in_sampled_study, ["study_id", "image_id", "determ", "boxes"]]

study_ids = study_image_boxes["study_id"].unique()
# Lookups handed to the display function: image -> boxes, study -> determination.
image_boxes = dict(zip(study_image_boxes["image_id"], study_image_boxes["boxes"]))
study_determ = dict(zip(study_image_boxes["study_id"], study_image_boxes["determ"]))


print(f"Number of studies: {len(study_ids)}")
print(f"Number of images: {len(image_boxes)}")

display_patient_images(patient_id, study_ids, image_boxes, study_determ)

In [None]:
# Determine whether studies have the expected number of images with boxes.

# Gather, per study, the (image, box count) pairs and the determination.
study_image_count_df = image_study_df[["study_id", "determ", "image_id", "box_count"]]
study_boxes = defaultdict(list)
study_determ = {}
for r in study_image_count_df.iterrows():
    (study_id, determ, image_id, box_count) = (r[1]["study_id"], r[1]["determ"], r[1]["image_id"], r[1]["box_count"])
    study_boxes[study_id].append((image_id, box_count))
    study_determ[study_id] = determ
# study id -> (number of images, number of images with at least one box)
study_box_count = {}
for study_id in study_boxes.keys():
    nonzero_count = sum(int(bc > 0) for (_, bc) in study_boxes[study_id])
    image_count = len(study_boxes[study_id])
    study_box_count[study_id] = (image_count, nonzero_count)

# Buckets of (study id, (image count, images-with-boxes count)) entries.
# "tia" covers every study that is not "NegPnu".
negpnu = []
negpnu_nonzero_boxes = []

tia = []
tia_one_img = []
tia_one_img_no_boxes = []

tia_mult_img = []
tia_mult_img_no_boxes = []
tia_mult_img_one_with_boxes = []
tia_mult_img_mult_with_boxes = []
tia_mult_img_all_with_boxes = []

for study_id in study_box_count.keys():
    if (study_determ[study_id] == "NegPnu"):
        negpnu.append((study_id, study_box_count[study_id]))
        if (study_box_count[study_id][1] > 0):
            negpnu_nonzero_boxes.append((study_id, study_box_count[study_id]))
    else:
        tia.append((study_id, study_box_count[study_id]))
        if (study_box_count[study_id][0] > 1):
            tia_mult_img.append((study_id, study_box_count[study_id]))
            if (study_box_count[study_id][1] == 0):
                tia_mult_img_no_boxes.append((study_id, study_box_count[study_id]))
            elif (study_box_count[study_id][1] == 1):
                tia_mult_img_one_with_boxes.append((study_id, study_box_count[study_id]))
            else:
                tia_mult_img_mult_with_boxes.append((study_id, study_box_count[study_id]))
            # Tested separately from the chain above, so a study in this
            # bucket is also in one of the previous buckets.
            if (study_box_count[study_id][0] == study_box_count[study_id][1]):
                tia_mult_img_all_with_boxes.append((study_id, study_box_count[study_id]))
        else:
            tia_one_img.append((study_id, study_box_count[study_id]))
            if (study_box_count[study_id][1] == 0):
                tia_one_img_no_boxes.append((study_id, study_box_count[study_id]))

print(f"There are {len(study_boxes.keys())} studies total (in the metadata)")
print()
print(f"There are {len(negpnu)} NegPnu studies. Of these:")
print(f"    {len(negpnu_nonzero_boxes)} have images with boxes")
print()
print(f"There are {len(tia)} Typ, Indeterm, or ATyp studies. Of these:")
print(f"    {len(tia_one_img)} Type, Indeterm, or AType studies have exactly one image. Of these:")
print(f"        {len(tia_one_img_no_boxes)} have no boxes in the image")
print()
print(f"    {len(tia_mult_img)} Typ, Indeterm, or ATyp studies have multiple images. Of these:")
print(f"        {len(tia_mult_img_no_boxes)} have no boxes in any image")
print(f"        {len(tia_mult_img_one_with_boxes)} have exactly one image with boxes")
print(f"        {len(tia_mult_img_mult_with_boxes)} have more than one image with boxes")
print(f"        {len(tia_mult_img_all_with_boxes)} have all images with boxes")


In [None]:
# We're going to explore possible image duplication using image hashes.
# Create an image hash dataframe.
# Every image gets the same preprocessing before hashing: VOI LUT, inversion
# for MONOCHROME1, rescale to 0..255 as 8-bit, then CLAHE.

clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
image_hashes = []

for row in dicom_df[["study_id", "series_id", "image_id", "Patient ID"]].iterrows():
    # row is an (index, Series) pair; unpack the four selected values.
    (study_id, series_id, image_id, patient_id) = (row[1][:])
    image_path = os.path.join(TRAIN_PATH, f"{study_id}/{series_id}/{image_id}.dcm")
    dicom_dataset = pydicom.dcmread(image_path)
    image = apply_voi_lut(dicom_dataset.pixel_array, dicom_dataset)
    pi = dicom_dataset.data_element('PhotometricInterpretation').value
    if pi == "MONOCHROME1":
        # Invert the intensities. The constant used does not matter because
        # the image is rescaled by its min and max just below.
        image = 255.0 - image
    image = image - np.min(image)
    image = (image/np.max(image)) * 255.0
    image = image.astype('uint8')
    image = clahe.apply(image)
    image_hash = imagehash.phash(im.fromarray(image))
    # Note: "row" is reused here for the output record.
    row = {"patient_id":patient_id, "study_id":study_id, "series_id":series_id, "image_id":image_id, "image_hash":image_hash}
    image_hashes.append(row)

image_hashes_df = pd.DataFrame(image_hashes, columns=["patient_id", "study_id", "series_id", "image_id", "image_hash"])
image_hashes_df.head()

In [None]:
# How many patients have duplicate images.
# A patient is reported when they have more than one image and fewer
# distinct image hashes than images, i.e. at least two of their images
# share a hash.

image_count_df = image_hashes_df.groupby(["patient_id"])[["image_id"]].nunique().reset_index().rename(columns={"image_id": "image_count"})
multi_image_df = image_count_df[image_count_df["image_count"] > 1]
print(f"{len(multi_image_df)} patients have multiple images")

hash_count_df = image_hashes_df.groupby(["patient_id"])[["image_hash"]].nunique().reset_index().rename(columns={"image_hash": "image_hash_count"})

multi_image_hash_df = multi_image_df.merge(hash_count_df, on="patient_id")
dupl_image_df = multi_image_hash_df[multi_image_hash_df["image_hash_count"] < multi_image_hash_df["image_count"]]

print(f"{len(dupl_image_df)} patients have duplicate images")
dupl_image_df.head()

In [None]:
# See whether any images are duplicated across patients or studies.
# The hashes are converted to strings first (this overwrites the column in
# image_hashes_df) and then used as the grouping key.

image_hashes_df['image_hash'] = image_hashes_df['image_hash'].astype(str)
# Number of distinct patients per hash value.
patients_per_hash_df = \
    image_hashes_df.groupby(["image_hash"])[["patient_id"]].nunique()\
        .reset_index().rename(columns={"patient_id": "patient_count"})

print(f"{len(patients_per_hash_df[patients_per_hash_df['patient_count'] > 1])} image hashes appear in multiple patients")
print()

# Number of distinct studies per hash value.
studies_per_hash_df = \
    image_hashes_df.groupby(["image_hash"])[["study_id"]].nunique()\
        .reset_index().rename(columns={"study_id": "study_count"})

print(f"{len(studies_per_hash_df[studies_per_hash_df['study_count'] > 1])} image hashes appear in multiple studies")
studies_per_hash_df[studies_per_hash_df['study_count'] > 1]