# <p style='text-align: center'>Data from multiple datasets needs to be normalized and aggregated into a single train/validate/test split. Metadata about each split is collected in a new .csv spreadsheet</p><hr>

In [1]:
import os
import pandas as pd
import openpyxl as op
import numpy as np
import math
import shutil
import cv2
# Silence TensorFlow from complaining about my NUMA nodes not being readable
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import tensorflow as tf
from tensorflow import keras
import imblearn
import math
import filecmp

In [2]:
# Every raw source dataset and the aggregated output live under this folder
_DATA_ROOT = './dataset/'

# Actualmed: one image folder and one metadata sheet
_ACTMED_ROOT = _DATA_ROOT + 'Actualmed-COVID-chestxray-dataset/'
actmed_image_path = _ACTMED_ROOT + 'images/'
actmed_meta_path = _ACTMED_ROOT + 'metadata.csv'

# SIRM: one image folder and one metadata workbook per class
_SIRM_ROOT = _DATA_ROOT + 'COVID-19-Radiography-Database/'
sirm_image_covid_path = _SIRM_ROOT + 'COVID/'
sirm_image_normal_path = _SIRM_ROOT + 'NORMAL/'
sirm_image_viral_path = _SIRM_ROOT + 'Viral-Pneumonia/'
sirm_meta_covid_path = _SIRM_ROOT + 'COVID.metadata.xlsx'
sirm_meta_normal_path = _SIRM_ROOT + 'NORMAL.metadata.xlsx'
sirm_meta_viral_path = _SIRM_ROOT + 'Viral-Pneumonia.metadata.xlsx'

# covid-chestxray-dataset (Cohen)
_COHEN_ROOT = _DATA_ROOT + 'covid-chestxray-dataset/'
cohen_image_path = _COHEN_ROOT + 'images/'
cohen_meta_path = _COHEN_ROOT + 'metadata.csv'

# Figure1
_FIG1_ROOT = _DATA_ROOT + 'Figure1-COVID-chestxray-dataset/'
fig1_image_path = _FIG1_ROOT + 'images/'
fig1_meta_path = _FIG1_ROOT + 'metadata.csv'

# ricord
_RICORD_ROOT = _DATA_ROOT + 'ricord/'
ricord_image_path = _RICORD_ROOT + 'images/'
ricord_meta_path = _RICORD_ROOT + 'ricord_meta.csv'

# rsna
_RSNA_ROOT = _DATA_ROOT + 'rsna/'
rsna_image_path = _RSNA_ROOT + 'images/'
rsna_meta_path = _RSNA_ROOT + 'rsna_meta.csv'

# Destination folders of the aggregated dataset, one per split
dataset_out_train_path = _DATA_ROOT + 'train/'
dataset_out_validate_path = _DATA_ROOT + 'validate/'
dataset_out_test_path = _DATA_ROOT + 'test/'

In [3]:
# Bookkeeping for the final dataset: one list of datapoints per split
train, validate, test = [], [], []
# Running number of accepted images per output class
data_count = dict.fromkeys(('NORMAL', 'PNEUMONIA', 'COVID-19'), 0)
# Accepted [patientid, finding, image path] rows, grouped by source dataset
dataset = {source: [] for source in ('actmed', 'sirm', 'cohen', 'fig1', 'ricord', 'rsna')}
patient_list = []

# Source findings that are folded into the PNEUMONIA class
_PNEUMONIA_FINDINGS = (
    'Pneumonia',
    'SARS',
    'Pneumocystis',
    'Streptococcus',
    'Chlamydophila',
    'E.Coli',
    'Klebsiella',
    'Legionella',
    'Lipoid',
    'Varicella',
    'Bacterial',
    'Mycoplasma',
    'Influenza',
    'Tuberculosis',
    'H1N1',
    'Aspergillosis',
    'Herpes',
    'Aspiration',
    'Nocardia',
    'MERS-CoV',
    'MRSA',
)
# Source findings that are folded into the NORMAL class
_NORMAL_FINDINGS = ('No Finding', 'No finding', 'Normal')

# Maps a finding as spelled in the source metadata to one of the three output classes
results = {'COVID-19': 'COVID-19'}
results.update(dict.fromkeys(_PNEUMONIA_FINDINGS, 'PNEUMONIA'))
results.update(dict.fromkeys(_NORMAL_FINDINGS, 'NORMAL'))

Next, go through each dataset explicitly and log the patientid, finding, and filename of every accepted image.

In [4]:
# ActualMed
# Note: only COVID-19 positive rows are usable, because this set labels every other
# non-normal case as "no finding"
actmed_meta = pd.read_csv(actmed_meta_path)
new_entries = 0
for _, row in actmed_meta.iterrows():
    # Requiring the COVID-19 label also rules out rows whose finding is missing
    if row['finding'] != 'COVID-19':
        continue
    patient = row['patientid']
    # A patient that was already logged is not added a second time
    if patient in patient_list:
        continue
    image_file = os.path.join(actmed_image_path, row['imagename'])
    dataset['actmed'].append([patient, row['finding'], image_file])
    patient_list.append(patient)
    data_count['COVID-19'] += 1
    new_entries += 1
print('ActualMed Added', new_entries, 'entries.')
print(data_count)

ActualMed Added 51 entries.
{'NORMAL': 0, 'PNEUMONIA': 0, 'COVID-19': 51}


In [5]:
# sirm
def _collect_sirm(meta, label, image_dir, stem_sep):
    """Log every row of one SIRM metadata sheet under `label`; return how many were added.

    The image file name is rebuilt as '<stem> (<index + 1>).<format>': the stem is
    the part of 'FILE NAME' before `stem_sep` (None splits on whitespace), the
    number is the DataFrame index plus one, and the extension is the lower-cased
    'FORMAT' column.
    """
    added = 0
    for row_index, row in meta.iterrows():
        name = row['FILE NAME']
        if name in patient_list:
            continue
        stem = name.split(stem_sep)[0]
        image_file = stem + ' (' + str(row_index + 1) + ').' + row['FORMAT'].lower()
        dataset['sirm'].append([name, label, os.path.join(image_dir, image_file)])
        data_count[label] += 1
        added += 1
    return added


new_entries = 0
# normal: names are split on '-'
sirm_meta_normal = pd.read_excel(sirm_meta_normal_path)
new_entries += _collect_sirm(sirm_meta_normal, 'NORMAL', sirm_image_normal_path, '-')
# pneumonia: names are split on '-'
sirm_meta_viral = pd.read_excel(sirm_meta_viral_path)
new_entries += _collect_sirm(sirm_meta_viral, 'PNEUMONIA', sirm_image_viral_path, '-')
# covid: names are split on whitespace instead
sirm_meta_covid = pd.read_excel(sirm_meta_covid_path)
new_entries += _collect_sirm(sirm_meta_covid, 'COVID-19', sirm_image_covid_path, None)
print('SIRM Added', new_entries, 'entries.')
print(data_count)

SIRM Added 3885 entries.
{'NORMAL': 1341, 'PNEUMONIA': 1345, 'COVID-19': 1250}


In [6]:
# cohen
# Note: only the AP and PA views are accepted
new_entries = 0
# Patients already taken from this dataset; keeping a single image per patient
# maintains fair weighting in the final dataset
included_patients = []
cohen_meta = pd.read_csv(cohen_meta_path)
for _, row in cohen_meta.iterrows():
    # Only the text after the last '/' of the finding is matched against results
    finding = row['finding'].split('/')[-1]
    if finding not in results:
        continue
    if row['view'] not in ('AP', 'PA'):
        continue
    raw_id = row['patientid']
    # An id that is not made of digits only loses its last character
    patient_id = raw_id if raw_id.isdigit() else str(raw_id)[:-1]
    if patient_id in included_patients or 'Cohen' + patient_id in patient_list:
        continue
    label = results[finding]
    image_file = os.path.join(cohen_image_path, row['filename'])
    dataset['cohen'].append(['Cohen' + patient_id, label, image_file])
    data_count[label] += 1
    new_entries += 1
    included_patients.append(patient_id)

print('Cohen Added', new_entries, 'entries.')
print(data_count)

Cohen Added 339 entries.
{'NORMAL': 1355, 'PNEUMONIA': 1453, 'COVID-19': 1467}


In [7]:
# Figure1
new_entries = 0
# Several images of one patient may be kept: they were taken on different days and
# are not identical. Only ids already present in included_patients / patient_list
# (filled by earlier cells) are skipped; this cell adds nothing to either list.
fig1_meta = pd.read_csv(fig1_meta_path, encoding='ISO-8859-1')
for index, series in fig1_meta.iterrows():
    if series['finding'] not in results:
        continue
    if series['patientid'] in included_patients or series['patientid'] in patient_list:
        continue
    # Imagename not provided, so probe for <patientid>.jpg first and .png second
    image_file = None
    for extension in ('.jpg', '.png'):
        candidate = os.path.join(fig1_image_path, series['patientid'] + extension)
        if os.path.exists(candidate):
            image_file = candidate
            break
    if image_file is None:
        # No image on disk: nothing is logged, so nothing may be counted either
        continue
    finding = results[series['finding']]
    dataset['fig1'].append([series['patientid'], finding, image_file])
    data_count[finding] += 1
    new_entries += 1
print('Figure1 Added', new_entries, 'entries.')
print(data_count)

Figure1 Added 40 entries.
{'NORMAL': 1358, 'PNEUMONIA': 1455, 'COVID-19': 1502}


# <hr><p style='text-align: center'>Run 'Format Dicom Datasets.ipynb' before continuing!</p><hr>

In [8]:
# RICORD
new_entries = 0
# The meta file was created with exactly the fields used here, so no field normalization is required
ricord_meta = pd.read_csv(ricord_meta_path)
for _, row in ricord_meta.iterrows():
    # Rows whose finding has no entry in results are ignored
    if str(row['finding']) not in results:
        continue
    patient = row['patientid']
    if patient in included_patients or patient in patient_list:
        continue
    label = results[row['finding']]
    image_file = os.path.join(ricord_image_path, row['imagename'])
    dataset['ricord'].append([patient, label, image_file])
    data_count[label] += 1
    new_entries += 1
print('RICORD Added', new_entries, 'entries.')
print(data_count)

RICORD Added 1130 entries.
{'NORMAL': 1358, 'PNEUMONIA': 1455, 'COVID-19': 2632}


In [9]:
# RSNA
new_entries = 0
# Meta file is created exactly how we need it so no field normalization required.
# It is kept in its own variable so the RICORD metadata is not overwritten.
rsna_meta = pd.read_csv(rsna_meta_path)
for index, series in rsna_meta.iterrows():
    if str(series['finding']) in results and series['patientid'] not in included_patients and series['patientid'] not in patient_list:
        # Log under the 'rsna' key so the per-source bookkeeping matches the source
        dataset['rsna'].append([series['patientid'], results[series['finding']], os.path.join(rsna_image_path, series['imagename'])])
        data_count[results[series['finding']]] += 1
        new_entries += 1
print('RSNA Added', new_entries, 'entries.')
print(data_count)

RSNA Added 14863 entries.
{'NORMAL': 10209, 'PNEUMONIA': 7467, 'COVID-19': 2632}


# <p style='text-align: center'>Now create train-validate-test sets</p>

In [10]:
# Share of each class that goes to train and validate; test receives whatever is left
train_ratio = 0.7
validate_ratio = 0.2
#test_ratio = 0.1

# For reliable reproduction of results
#np.random.seed(0)

# Flatten every per-source list into one aggregate list, then shuffle it in place
aggregate_list = [item for items in dataset.values() for item in items]
np.random.shuffle(aggregate_list)

# Split the shuffled entries by class; anything that is neither NORMAL nor
# PNEUMONIA is treated as COVID-19
normal_list = [entry for entry in aggregate_list if entry[1] == 'NORMAL']
pneumonia_list = [entry for entry in aggregate_list if entry[1] == 'PNEUMONIA']
covid_list = [entry for entry in aggregate_list if entry[1] not in ('NORMAL', 'PNEUMONIA')]

# Slice each class separately so every split keeps the class proportions:
# the first 70% goes to train, the next 20% to validate, the remainder to test
for class_entries in (normal_list, pneumonia_list, covid_list):
    train_end = math.floor(train_ratio * len(class_entries))
    validate_end = train_end + math.floor(validate_ratio * len(class_entries))
    train.extend(class_entries[:train_end])
    validate.extend(class_entries[train_end:validate_end])
    test.extend(class_entries[validate_end:])


print('   train: {}'.format(len(train)))
print('validate: {}'.format(len(validate)))
print('    test: {}'.format(len(test)))

   train: 14214
validate: 4060
    test: 2034


# <p style='text-align: center'>Finally copy the aggregate datasets into their respective directories</p>

In [11]:
# Ensure our output directories exist: <split>/images/<class> for every split and class
for split_path in (dataset_out_train_path, dataset_out_validate_path, dataset_out_test_path):
    for class_name in ('NORMAL', 'PNEUMONIA', 'COVID-19'):
        os.makedirs(os.path.join(split_path, 'images', class_name), exist_ok=True)

# CLAHE Instance Definition
clahe = cv2.createCLAHE(clipLimit = 1.25)


def _export_split(datapoints, out_path):
    """Preprocess and write every image of one split; return the split's metadata.

    Each datapoint is a [patientid, finding, source image path] triple. The image
    is converted to grayscale, passed through CLAHE, center-cropped to 90% of each
    dimension, resized to 300x300 and written to
    <out_path>/images/<finding>/<patientid>.png.

    Returns a DataFrame with the columns patientid, finding and imagename, where
    imagename is the path the image was written to.

    Raises FileNotFoundError when a source image cannot be read and OSError when
    the processed image cannot be written.
    """
    rows = []
    for patient_id, finding, source_path in datapoints:
        raw = cv2.imread(source_path)
        if raw is None:
            # cv2.imread signals an unreadable or missing file by returning None
            raise FileNotFoundError('Could not read image: {}'.format(source_path))
        # apply CLAHE to each image
        image = cv2.cvtColor(raw.astype(np.uint8), cv2.COLOR_BGR2GRAY)
        clahe_image = clahe.apply(image)
        # Center crop to 90% original size to cut off arms/non-chest areas
        height, width = clahe_image.shape
        center_y, center_x = height / 2, width / 2
        crop_h, crop_w = height * 0.9, width * 0.9
        clahe_image = clahe_image[
            math.floor(center_y - crop_h / 2):math.floor(center_y + crop_h / 2),
            math.floor(center_x - crop_w / 2):math.floor(center_x + crop_w / 2)]
        # Resize to (300, 300)
        clahe_image = cv2.resize(clahe_image, (300, 300), interpolation=cv2.INTER_CUBIC)
        # NOTE(review): the file is named after patientid only, so two datapoints with
        # the same id and finding in one split would overwrite each other - confirm
        # that ids are unique
        target_path = os.path.join(out_path, 'images', finding, patient_id + '.png')
        if not cv2.imwrite(target_path, clahe_image):
            raise OSError('Could not write image: {}'.format(target_path))
        # Record the path inside this split's own folder
        rows.append({'patientid': patient_id, 'finding': finding, 'imagename': target_path})
    # Build the frame once; the explicit columns keep the header for an empty split
    return pd.DataFrame(rows, columns=['patientid', 'finding', 'imagename'])


train_df = _export_split(train, dataset_out_train_path)
validate_df = _export_split(validate, dataset_out_validate_path)
test_df = _export_split(test, dataset_out_test_path)

train_df.to_csv(os.path.join(dataset_out_train_path,'metadata.csv'), index=False)
validate_df.to_csv(os.path.join(dataset_out_validate_path,'metadata.csv'), index=False)
test_df.to_csv(os.path.join(dataset_out_test_path,'metadata.csv'), index=False)