# II. PREPROCESSING
- identify and remove duplicate images from train & test sets
- convert remaining images to smaller dimensions (256 & 512 versions) for quicker training.
- save the images in jpg format for feeding into experiments.

# Summary of Findings
- 181 pairs of duplicate images exist.
- 74 groups containing the same image exist.
- The bounding boxes annotated for the same or near-identical images are not consistent with each other.

# Imports

In [None]:
!conda install gdcm -c conda-forge -y

In [None]:
import cv2
import datetime
import gc
import glob
import imagehash
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import PIL
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import sys
import tqdm

## Parameters

In [None]:
# Root directory of the input data; the training DICOM files and the
# image-level / study-level CSVs are all located relative to this path.
base_path = '../input/siim-covid19-detection'

# Utility Functions

In [None]:
def read_dicom_image(image_file, voi_lut=True, fix_monochrome=True):
    """
    Reads a DICOM image from a file and returns it as an 8-bit numpy array.
    References: https://www.kaggle.com/trungthanhnguyen0502/eda-vinbigdata-chest-x-ray-abnormalities
    Args:
        image_file: path of the DICOM file; handed directly to pydicom.
        voi_lut: if True, pass the raw pixel data through apply_voi_lut
            before scaling; if False, use the raw pixel array.
        fix_monochrome: if True, invert images whose PhotometricInterpretation
            is "MONOCHROME1" so they are not displayed inverted.

    Returns:
        numpy uint8 array, min-max scaled to the range 0..255. An image whose
        pixels all share one value is returned as all zeros.

    """
    # dcmread is the supported reader; read_file was only a deprecated alias.
    dicom = pydicom.dcmread(image_file)
    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    # Min-max scale to 0..255. A constant image has a zero range after the
    # shift; skip the division there to avoid 0/0 -> NaN before the uint8 cast.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data


def string2boxes(string):
    """
    Parses a label string into a list of bounding-box dictionaries.

    The string is read as whitespace-separated tokens in records of six:
    class, confidence, x1, y1, x2, y2.
    Args:
        string: label string, e.g. 'opacity 1 10 20 30 40' or 'none 1 0 0 1 1'.

    Returns:
        List of dicts with the keys 'class', 'conf', 'x1', 'y1', 'x2', 'y2'
        (class as str, all other values as float). An empty list is returned
        when the string contains no tokens or when its first token is 'none'
        (case-insensitive).

    """
    strings = string.split()
    # An empty / whitespace-only label has no first token to inspect, so it is
    # treated the same way as an explicit 'none' label.
    if not strings or strings[0].lower() == 'none':
        return []
    return [{'class': strings[idx],
             'conf': float(strings[idx+1]),
             'x1': float(strings[idx+2]),
             'y1': float(strings[idx+3]),
             'x2': float(strings[idx+4]),
             'y2': float(strings[idx+5]),
             } for idx in range(0, len(strings), 6)]


def plot_image(image, boxes=None, size=(5,5), title=None, columns=4):
    """
    Displays a single image, or a grid of images, with optional bounding boxes.
    Args:
        image: an image array, a path (str) to a DICOM file, or a list of
            either. A path is loaded with read_dicom_image and its boxes are
            looked up in the global df_image by '<file stem>_image', which
            replaces whatever was passed in `boxes`.
            NOTE(review): arrays are stacked into 3 channels before drawing,
            so they are assumed to be single-channel - confirm with callers.
        boxes: list of dicts with 'x1', 'y1', 'x2', 'y2' keys drawn as green
            rectangles, or None to draw nothing.
        size: figure size passed to plt.figure.
        title: for a single image, the title or None. For a list of images,
            either a sequence with one title per image, or None / a single
            string which is then applied to every subplot.
        columns: maximum number of columns of the grid when `image` is a list.

    Returns:
        None. The figure is shown with plt.show().

    """
    def plot_img(image, boxes=None, title=None):
        if isinstance(image, str):
            image_id = os.path.splitext(os.path.split(image)[1])[0]
            df = df_image.loc[df_image['id'] == image_id + '_image']
            boxes = string2boxes(df['label'].iloc[0]) if len(df) > 0 else None
            image = read_dicom_image(image)
        # Replicate the channel so the coloured rectangles can be drawn.
        image = np.stack([image] * 3, axis=-1)
        if boxes is not None:
            for box in boxes:
                image = cv2.rectangle(image, (int(box['x1']), int(box['y1'])), (int(box['x2']), int(box['y2'])), [0, 255, 0], 10)
        plt.axis('on')
        plt.imshow(image, cmap='gray')
        if title is not None:
            plt.title(title)

    is_grid = isinstance(image, list)
    if is_grid and not image:
        # Nothing to draw; also avoids a division by zero in the row count.
        return

    plt.figure(figsize=size)
    if is_grid:
        num = len(image)
        columns = min(columns, num)
        rows = math.ceil(num / columns)
        # Accept a missing title or one shared string as well as per-image titles.
        if title is None or isinstance(title, str):
            titles = [title] * num
        else:
            titles = title

        for index, single_image in enumerate(image):
            plt.subplot(rows, columns, index + 1)
            plot_img(single_image, boxes=boxes, title=titles[index])
    else:
        plot_img(image, boxes=boxes, title=title)
    plt.show()


def images_find_duplicates(image_files, threshold=0.9):
    """
    Finds near-duplicate images by comparing perceptual hashes.
    References: https://www.kaggle.com/appian/let-s-find-out-duplicate-images-with-imagehash
    Args:
        image_files: list of DICOM file paths; each is loaded with
            read_dicom_image.
        threshold: two images count as duplicates when the fraction of equal
            bits in their 256-bit fingerprints is strictly greater than this.

    Returns:
        Tuple (df_pairs, group_list). df_pairs is a DataFrame with the columns
        'image1', 'image2' and 'similarity', one row per duplicate pair.
        group_list is a list of lists of image files; images linked directly
        or through a chain of duplicate pairs share a group.

    """
    hash_functions = (imagehash.average_hash, imagehash.phash, imagehash.dhash, imagehash.whash)

    # One flat 256-bit fingerprint per image: the bit matrices of the four
    # hash functions concatenated.
    fingerprints = []
    for path in tqdm.tqdm(image_files):
        pil_image = PIL.Image.fromarray(read_dicom_image(path))
        bit_matrices = np.array([hash_fn(pil_image).hash for hash_fn in hash_functions])
        fingerprints.append(bit_matrices.reshape(256))
    fingerprints = np.array(fingerprints)

    # Fraction of matching bits between every image and all images (plain numpy).
    similarity_rows = [
        np.sum(fingerprint == fingerprints, axis=1) / 256
        for fingerprint in tqdm.tqdm(fingerprints)
    ]

    # Keep only the strict upper triangle so that every unordered pair occurs
    # once and an image is never compared with itself.
    similarities = np.triu(np.array(similarity_rows), 1)

    matches = np.where(similarities > threshold)
    first_indices, second_indices = matches[0], matches[1]
    df_pairs = pd.DataFrame({
        'image1': [image_files[a] for a in first_indices],
        'image2': [image_files[b] for b in second_indices],
        'similarity': [similarities[a, b] for a, b in zip(first_indices, second_indices)],
    })

    # Label propagation: 0 means "not in any group yet".
    labels = np.zeros(len(image_files))
    next_label = 1
    for a, b in zip(first_indices, second_indices):
        label_a, label_b = labels[a], labels[b]
        if label_a == 0 and label_b == 0:
            # Neither image is grouped yet: open a new group for the pair.
            labels[a] = next_label
            labels[b] = next_label
            next_label += 1
        elif label_b == 0:
            labels[b] = label_a
        elif label_a == 0:
            labels[a] = label_b
        elif label_a != label_b:
            # The pair bridges two existing groups: fold both into the smaller id.
            merged_label = min(label_a, label_b)
            labels[(labels == label_a) | (labels == label_b)] = merged_label

    # Labels emptied by a merge are skipped.
    group_list = []
    for label in range(1, next_label + 1):
        members = np.flatnonzero(labels == label)
        if members.size > 0:
            group_list.append([image_files[m] for m in members])

    return df_pairs, group_list


def print_group_info(i, df_group):
    """
    Prints a summary of one duplicate group and plots its images.
    Args:
        i: zero-based group index; printed as i+1.
        df_group: DataFrame of the group with the columns 'image_id',
            'num_boxes', 'study_id', 'study_label' and 'image_file'.

    Returns:
        None.

    """
    print(f'\nGroup {i+1}')
    study_count = len(df_group["study_id"].unique())
    print(f'Number of unique studies:       {study_count}')
    label_count = len(df_group["study_label"].unique())
    print(f'Number of unique study labels:  {label_count}\n')
    summary_columns = ['image_id', 'num_boxes', 'study_id', 'study_label']
    print(df_group[summary_columns])
    plot_image(
        df_group['image_file'].tolist(),
        size=(20, 10),
        title=df_group['image_id'].tolist(),
        columns=8,
    )


# Find Duplicates
Read files and search for duplicates.

In [None]:
# Collect all training DICOM files (three directory levels below train/).
train_files = sorted(glob.glob(os.path.join(base_path, 'train/*/*/*.dcm')))
print(f'Number of training files: {len(train_files)}')
df_image = pd.read_csv(os.path.join(base_path, 'train_image_level.csv'))
df_study = pd.read_csv(os.path.join(base_path, 'train_study_level.csv'))
# Build a readable study label from the names of columns 1..4 whose value is
# positive. Columns are addressed by name: an integer key on a string-labelled
# row (r[i]) relies on deprecated positional fallback indexing in pandas.
study_label_columns = list(df_study.columns[1:5])
df_study['study_label'] = df_study.apply(lambda r: ', '.join([column for column in study_label_columns if r[column] > 0]), axis=1)

df_pairs, group_list = images_find_duplicates(train_files, threshold=0.95)
print(f'\nNumber of duplicate pairs: {len(df_pairs)}')
print(f'Number of duplicate groups: {len(group_list)}')

In [None]:
# Save the raw pair table, then build one annotated DataFrame per duplicate
# group and write the image ids of each group as one comma-separated line.
df_pairs.to_csv('pairs.csv')
df_group_list = []

with open('duplicates.csv', 'w') as text_file:
    for group in group_list:
        # Image ids in df_image are '<file stem>_image'.
        group_ids = [f"{os.path.splitext(os.path.basename(file))[0]}_image" for file in group]
        df_group = (
            pd.DataFrame({'id': group_ids, 'image_file': group})
            .merge(df_image, on='id')
            .sort_values('id')
        )
        df_group['study_id'] = df_group['StudyInstanceUID'] + '_study'
        df_group['num_boxes'] = df_group.apply(lambda r: len(string2boxes(r['label'])), axis=1)
        # Both frames carry an 'id' column, so the image id comes out of the
        # merge as 'id_x'; give it a descriptive name.
        df_group = (
            df_group
            .merge(df_study, left_on='study_id', right_on='id')
            .rename(columns={'id_x': 'image_id'})
        )
        df_group_list.append(df_group)
        print(','.join(group_ids), file=text_file)

# Duplicate Images with Bounding Boxes in More than 1 Image

In [None]:
# Show every group in which more than one image has at least one bounding box.
for index, df_group in enumerate(df_group_list):
    boxed_image_count = (df_group['num_boxes'] > 0).sum()
    if boxed_image_count > 1:
        print_group_info(index, df_group)

# Duplicate Images with More Than 1 Study Label

In [None]:
# Show every group whose images do not all share the same study label.
for index, df_group in enumerate(df_group_list):
    distinct_labels = df_group['study_label'].unique()
    if len(distinct_labels) <= 1:
        continue
    print_group_info(index, df_group)

# Duplicate Images from More than 1 Study

In [None]:
# Show every group that spans more than one study and write the study ids of
# each such group as one comma-separated line.
with open('duplicates_study.csv', 'w') as text_file:
    for index, df_group in enumerate(df_group_list):
        study_ids = df_group['study_id'].unique()
        if len(study_ids) <= 1:
            continue
        print_group_info(index, df_group)
        print(','.join(study_ids), file=text_file)

# Full List of Duplicate Images

In [None]:
# Print the summary and plot the images of every duplicate group, unfiltered.
for index, df_group in enumerate(df_group_list):
    print_group_info(index, df_group)

In [None]:
# get list of duplicate images for dropping
# Within each duplicate group keep the first image and mark the rest for dropping.
images_to_drop = set()
images_to_keep = set()
for df_group in df_group_list:
    group_image_ids = df_group['image_id'].tolist()
    # Split by row position rather than by index label, so the first row is
    # kept even if a group frame does not carry a default 0..n-1 index.
    # Slicing also leaves both sets untouched for an empty group.
    images_to_keep.update(group_image_ids[:1])
    images_to_drop.update(group_image_ids[1:])

#print("drop: ", len(images_to_drop))
#print("keep: ", len(images_to_keep))

In [None]:
# Load a duplicates table from '../input/duplicates-1' (a location separate
# from base_path) and preview its first row.
# NOTE(review): dups_df is not referenced again in the visible part of this
# notebook - confirm whether this cell is still needed.
import pandas as pd
dups_df = pd.read_csv('../input/duplicates-1/duplicates_1.csv')
dups_df.head(1)

In [None]:
# Display the set of image ids that were marked for dropping.
images_to_drop