### Origin
This notebook is adapted from the [Covid-Net](https://github.com/lindawangg/COVID-Net/blob/master/create_COVIDx.ipynb) code base. Some noteworthy changes are:
1. Unlike the original version, this notebook expects conversion from DICOM to PNG to be performed externally.
2. Instead of copying images, we create hardlinks to the original images to save disk space.

### History
<table>
<tr><td>2020/07/17</td><td>Added local images</td></tr>
<tr><td>2020/08/23</td><td>Added RICORD images + extra images from ieee8023</td></tr>
</table>

### Data sources

Compiled by Covid-Net team:
- [ieee8023](https://github.com/ieee8023/covid-chestxray-dataset)
- [Figure1](https://github.com/agchung/Figure1-COVID-chestxray-dataset)
- [Actualmed](https://github.com/agchung/Actualmed-COVID-chestxray-dataset)
- [SIRM](https://www.kaggle.com/tawsifurrahman/covid19-radiography-database)
- [RSNA Pneumonia detection challenge](https://www.kaggle.com/c/rsna-pneumonia-detection-challenge)

Additional
- [RICORD](https://www.rsna.org/en/covid-19/COVID-19-RICORD), [TCIA](https://public.cancerimagingarchive.net/ncia/home.jsf)
- Local non-public images (excluded from this notebook)

In [1]:
%env SAVE_PATH=/data/datasets/extended_v2

env: SAVE_PATH=/data/datasets/extended_v2


In [2]:
import numpy as np
import pandas as pd
import os, sys
import random 
# from shutil import copyfile
import pydicom as dicom
import cv2
import re
from sklearn.model_selection import train_test_split

In [3]:
# set parameters here
savepath = os.environ['SAVE_PATH']
if os.path.exists(savepath):
    print(f"The dataset {savepath} exists! Please delete it if you no longer need it. Not proceeding.")
    sys.exit(0)
    
seed = 0
np.random.seed(seed) # Reset the seed so all runs are the same.
random.seed(seed)
MAXVAL = 255  # Range [0 255]
data_path = "/data/data"

print(f"Saving dataset in {savepath}")

# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
cohen_imgpath = f'{data_path}/covid-chestxray-dataset/images' 
cohen_csvpath = f'{data_path}/covid-chestxray-dataset/metadata.csv'

# path to covid-19 dataset from https://github.com/agchung/Figure1-COVID-chestxray-dataset
fig1_imgpath = f'{data_path}/Figure1-COVID-chestxray-dataset/images'
fig1_csvpath = f'{data_path}/Figure1-COVID-chestxray-dataset/metadata.csv'

# path to covid-19 dataset from https://github.com/agchung/Actualmed-COVID-chestxray-dataset
actmed_imgpath = f'{data_path}/Actualmed-COVID-chestxray-dataset/images'
actmed_csvpath = f'{data_path}/Actualmed-COVID-chestxray-dataset/metadata.csv'

# path to covid-19 dataset from https://www.kaggle.com/tawsifurrahman/covid19-radiography-database
sirm_imgpath = f'{data_path}/converted/sirm'
sirm_csvpath = f'{data_path}/COVID-19 Radiography Database/COVID-19.metadata.xlsx'

# path to https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
rsna_datapath = f'{data_path}/rsna-pneumonia-detection-challenge'
# get all the normal from here
rsna_csvname = f'stage_2_detailed_class_info.csv' 
# get all the 1s from here since 1 indicate pneumonia
# found that images that aren't pneunmonia and also not normal are classified as 0s
rsna_csvname2 = f'stage_2_train_labels.csv' 
rsna_imgpath = f'{data_path}/converted/rsna'

# parameters for COVIDx dataset
train = []
test = []
evaluate = []
test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
evaluate_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

mapping = dict()
mapping['COVID-19'] = 'COVID-19'
mapping['SARS'] = 'pneumonia'
mapping['MERS'] = 'pneumonia'
mapping['Streptococcus'] = 'pneumonia'
mapping['Klebsiella'] = 'pneumonia'
mapping['Chlamydophila'] = 'pneumonia'
mapping['Legionella'] = 'pneumonia'
mapping['Normal'] = 'normal'
mapping['Lung Opacity'] = 'pneumonia'
mapping['1'] = 'pneumonia'

# train/test split
split = 0.1

# to avoid duplicates
patient_imgpath = {}

os.makedirs(os.path.join(savepath, 'test'), exist_ok=False)
os.makedirs(os.path.join(savepath, 'train'), exist_ok=False)
os.makedirs(os.path.join(savepath, 'evaluate'), exist_ok=False)

Saving dataset in /data/datasets/extended_v2


In [4]:
# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
cohen_csv = pd.read_csv(cohen_csvpath, nrows=None)
#idx_pa = csv["view"] == "PA"  # Keep only the PA view
views = ["PA", "AP", "AP Supine", "AP semi erect", "AP erect"]
cohen_idx_keep = cohen_csv.view.isin(views)
cohen_csv = cohen_csv[cohen_idx_keep]

fig1_csv = pd.read_csv(fig1_csvpath, encoding='ISO-8859-1', nrows=None)
actmed_csv = pd.read_csv(actmed_csvpath, nrows=None)

sirm_csv = pd.read_excel(sirm_csvpath)

In [5]:
# get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset, figure1 and actualmed
# stored as patient id, image filename and label
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
covid_ds = {'cohen': [], 'fig1': [], 'actmed': [], 'sirm': []}

for index, row in cohen_csv.iterrows():
    f = row['finding'].split(',')[0] # take the first finding, for the case of COVID-19, ARDS
    if f in mapping: # 
        count[mapping[f]] += 1
        entry = [str(row['patientid']), row['filename'], mapping[f], 'cohen']
        filename_label[mapping[f]].append(entry)
        if mapping[f] == 'COVID-19':
            covid_ds['cohen'].append(str(row['patientid']))
        
for index, row in fig1_csv.iterrows():
    if not str(row['finding']) == 'nan':
        f = row['finding'].split(',')[0] # take the first finding
        if f in mapping: # 
            count[mapping[f]] += 1
            if os.path.exists(os.path.join(fig1_imgpath, row['patientid'] + '.jpg')):
                entry = [row['patientid'], row['patientid'] + '.jpg', mapping[f], 'fig1']
            elif os.path.exists(os.path.join(fig1_imgpath, row['patientid'] + '.png')):
                entry = [row['patientid'], row['patientid'] + '.png', mapping[f], 'fig1']
            filename_label[mapping[f]].append(entry)
            if mapping[f] == 'COVID-19':
                covid_ds['fig1'].append(row['patientid'])

for index, row in actmed_csv.iterrows():
    if not str(row['finding']) == 'nan':
        f = row['finding'].split(',')[0]
        if f in mapping:
            count[mapping[f]] += 1
            entry = [row['patientid'], row['imagename'], mapping[f], 'actmed']
            filename_label[mapping[f]].append(entry)
            if mapping[f] == 'COVID-19':
                covid_ds['actmed'].append(row['patientid'])
    
sirm = set(sirm_csv['URL'])
cohen = set(cohen_csv['url'])
discard = ['100', '101', '102', '103', '104', '105', 
           '110', '111', '112', '113', '122', '123', 
           '124', '125', '126', '217']

for idx, row in sirm_csv.iterrows():
    patientid = row['FILE NAME']
    if row['URL'] not in cohen and patientid[patientid.find('(')+1:patientid.find(')')] not in discard:
        count[mapping['COVID-19']] += 1
        imagename = patientid + '.' + row['FORMAT'].lower()
        if not os.path.exists(os.path.join(sirm_imgpath, imagename)):
            imagename = patientid.split('(')[0] + ' ('+ patientid.split('(')[1] + '.' + row['FORMAT'].lower()
        entry = [patientid, imagename, mapping['COVID-19'], 'sirm']
        filename_label[mapping['COVID-19']].append(entry)
        covid_ds['sirm'].append(patientid)
    
print('Data distribution from covid datasets:')
print(count)

Data distribution from covid datasets:
{'normal': 0, 'pneumonia': 51, 'COVID-19': 607}


In [6]:
# add covid-chestxray-dataset, figure1 and actualmed into COVIDx dataset
# since these datasets don't have test dataset, split into train/test by patientid
# for covid-chestxray-dataset:
# patient 8 is used as non-COVID19 viral test
# patient 31 is used as bacterial test
# patients 19, 20, 36, 42, 86 are used as COVID-19 viral test
# for figure 1:
# patients 24, 25, 27, 29, 30, 32, 33, 36, 37, 38

ds_imgpath = {'cohen': cohen_imgpath, 'fig1': fig1_imgpath, 'actmed': actmed_imgpath, 'sirm': sirm_imgpath}

for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue
    # split by patients
    # num_diff_patients = len(np.unique(arr[:,0]))
    # num_test = max(1, round(split*num_diff_patients))
    # select num_test number of random patients
    # random.sample(list(arr[:,0]), num_test)
    if key == 'pneumonia':
        test_patients = ['8', '31']
    elif key == 'COVID-19':
        test_patients = ['19', '20', '36', '42', '86', 
                         '94', '97', '117', '132', 
                         '138', '144', '150', '163', '169', '174', '175', '179', '190', '191'
                         'COVID-00024', 'COVID-00025', 'COVID-00026', 'COVID-00027', 'COVID-00029',
                         'COVID-00030', 'COVID-00032', 'COVID-00033', 'COVID-00035', 'COVID-00036',
                         'COVID-00037', 'COVID-00038',
                         'ANON24', 'ANON45', 'ANON126', 'ANON106', 'ANON67',
                         'ANON153', 'ANON135', 'ANON44', 'ANON29', 'ANON201', 
                         'ANON191', 'ANON234', 'ANON110', 'ANON112', 'ANON73', 
                         'ANON220', 'ANON189', 'ANON30', 'ANON53', 'ANON46',
                         'ANON218', 'ANON240', 'ANON100', 'ANON237', 'ANON158',
                         'ANON174', 'ANON19', 'ANON195',
                         'COVID-19(119)', 'COVID-19(87)', 'COVID-19(70)', 'COVID-19(94)', 
                         'COVID-19(215)', 'COVID-19(77)', 'COVID-19(213)', 'COVID-19(81)', 
                         'COVID-19(216)', 'COVID-19(72)', 'COVID-19(106)', 'COVID-19(131)', 
                         'COVID-19(107)', 'COVID-19(116)', 'COVID-19(95)', 'COVID-19(214)', 
                         'COVID-19(129)']
    else: 
        test_patients = []
    print('Key: ', key)
    print('Test patients: ', test_patients)
    # go through all the patients
    for patient in arr:
        try:
            if patient[0] not in patient_imgpath:
                patient_imgpath[patient[0]] = [patient[1]]
            else:
                if patient[1] not in patient_imgpath[patient[0]]:
                    patient_imgpath[patient[0]].append(patient[1])
                else:
                    continue  # skip since image has already been written
            outfile = patient[1].replace(' ', '')
            
            if patient[0] in test_patients:
                os.link(os.path.join(ds_imgpath[patient[3]], patient[1]), os.path.join(savepath, 'test', outfile))
                test.append(patient)
                test_count[patient[2]] += 1
            else:
                os.link(os.path.join(ds_imgpath[patient[3]], patient[1]), os.path.join(savepath, 'train', outfile))
                train.append(patient)
                train_count[patient[2]] += 1
            
            patient[1] = outfile
        except Exception as e:
            print(e, file=sys.stderr)

print('test count: ', test_count)
print('train count: ', train_count)

Key:  pneumonia
Test patients:  ['8', '31']
Key:  COVID-19
Test patients:  ['19', '20', '36', '42', '86', '94', '97', '117', '132', '138', '144', '150', '163', '169', '174', '175', '179', '190', '191COVID-00024', 'COVID-00025', 'COVID-00026', 'COVID-00027', 'COVID-00029', 'COVID-00030', 'COVID-00032', 'COVID-00033', 'COVID-00035', 'COVID-00036', 'COVID-00037', 'COVID-00038', 'ANON24', 'ANON45', 'ANON126', 'ANON106', 'ANON67', 'ANON153', 'ANON135', 'ANON44', 'ANON29', 'ANON201', 'ANON191', 'ANON234', 'ANON110', 'ANON112', 'ANON73', 'ANON220', 'ANON189', 'ANON30', 'ANON53', 'ANON46', 'ANON218', 'ANON240', 'ANON100', 'ANON237', 'ANON158', 'ANON174', 'ANON19', 'ANON195', 'COVID-19(119)', 'COVID-19(87)', 'COVID-19(70)', 'COVID-19(94)', 'COVID-19(215)', 'COVID-19(77)', 'COVID-19(213)', 'COVID-19(81)', 'COVID-19(216)', 'COVID-19(72)', 'COVID-19(106)', 'COVID-19(131)', 'COVID-19(107)', 'COVID-19(116)', 'COVID-19(95)', 'COVID-19(214)', 'COVID-19(129)']
test count:  {'normal': 0, 'pneumonia': 5,

In [7]:
# add normal and rest of pneumonia cases from https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
csv_normal = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname), nrows=None)
csv_pneu = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname2), nrows=None)
patients = {'normal': [], 'pneumonia': []}

for index, row in csv_normal.iterrows():
    if row['class'] == 'Normal':
        patients['normal'].append(row['patientId'])

for index, row in csv_pneu.iterrows():
    if int(row['Target']) == 1:
        patients['pneumonia'].append(row['patientId'])

for key in patients.keys():
    arr = np.array(patients[key])
    if arr.size == 0:
        continue
    # split by patients 
    # num_diff_patients = len(np.unique(arr))
    # num_test = max(1, round(split*num_diff_patients))
    test_patients = np.load('rsna_test_patients_{}.npy'.format(key)) # random.sample(list(arr), num_test), download the .npy files from the repo.
    # np.save('rsna_test_patients_{}.npy'.format(key), np.array(test_patients))
    for irec, patient in enumerate(arr):
        if patient not in patient_imgpath:
            patient_imgpath[patient] = [patient]
        else:
            continue  # skip since image has already been written
                
        imgname = patient + '.png'
        infile = os.path.join(rsna_imgpath, imgname)
        if patient in test_patients:
            os.link(infile, os.path.join(savepath, 'test', imgname))
            test.append([patient, imgname, key, 'rsna'])
            test_count[key] += 1
        else:
            os.link(infile, os.path.join(savepath, 'train', imgname))
            train.append([patient, imgname, key, 'rsna'])
            train_count[key] += 1
        
        if irec % 100 == 0:
            sys.stdout.write(f"\r{irec} / {len(arr)}")
    print()

print('test count: ', test_count)
print('train count: ', train_count)

8800 / 8851
9200 / 9555
test count:  {'normal': 885, 'pneumonia': 594, 'COVID-19': 100}
train count:  {'normal': 7966, 'pneumonia': 5469, 'COVID-19': 507}


## [RICORD](https://www.rsna.org/en/covid-19/COVID-19-RICORD)

All images in this set are COVID-19 positive.

Source: https://public.cancerimagingarchive.net/ncia/home.jsf<br>
Search filters: Collection=COVID-19-AR, Image Modality=CR|DX

**Note**: The DICOM images were converted to PNG (using mogrify or convert) and the original folder structure was retained

In [8]:
ricord_imgpath = f'{data_path}/20200823/COVID-19-AR_PNG'
ricord_csvpath = f'{data_path}/20200823/COVID-19 AR Clinical Correlates July202020.xlsx'

ricord_df = pd.read_excel(ricord_csvpath, sheet_name="Imaging Studies", header=1)
allowed_studies = ['XR CHEST AP PORTABLE', 'XR CHEST AP ONLY',
       'XR CHEST PA AND LATERAL', 'XR CHEST PA ONLY', 'XR ACUTE ABDOMINAL SERIES W PA CHEST PORTABLE' ]

ricord_df = ricord_df[ ricord_df['Image Study Description'].isin(allowed_studies) ]

study_dir_regex = re.compile(r'\d\d\-\d\d\-\d{4}\-([A-Za-z ]+)\-\d{5}')
single_studies = ['XR CHEST AP PORTABLE', 'XR CHEST AP ONLY',  'XR CHEST PA ONLY' ]
multi_studies = {
    'XR CHEST PA AND LATERAL': [re.compile(r".*\-PA\-.*")],
    'XR ACUTE ABDOMINAL SERIES W PA CHEST PORTABLE': [re.compile(r".*\-Chest AP\-.*")]
}

pids = ricord_df["patient_id"].sort_values().unique()
# print(f"pids: {pids}")
p_train, p_eval = train_test_split(pids, test_size=0.2, random_state=2)
# print(f"p_train: {p_train}")
# print(f"p_eval: {p_eval}")

def ricord_add_patient(in_dir, pid, pcount, set_dir, set_count, set_rec, mapping_file):
    for (p, d, f) in os.walk(in_dir):
        if len(f) > 1:
            print(f"Too many images in {in_dir}", file=sys.stderr)
        
        for img in f:
            in_file = os.path.join(in_dir, img)
            out_id = f"{pid}_{pcount:02d}"
            out_file = f"{out_id}.png"
            out_path = os.path.join(savepath, set_dir, out_file)
            os.link(in_file, out_path)
            patient = [out_id, out_file, "COVID-19", "RICORD"]
            set_rec.append(patient)
            set_count[patient[2]] += 1
            print(f"{in_file}, {out_path}", file=mapping_file)
        
        break

with open(os.path.join(savepath, "ricord_mapping.csv"), "w") as mapping_file:
    for pid in pids:
        pcount = 1
        patient_path = os.path.join(ricord_imgpath, pid)
        
        if pid in p_train:
            set_dir, set_count, set_rec = ('train', train_count, train)
        else:
            set_dir, set_count, set_rec = ('evaluate', evaluate_count, evaluate)

        for (p1, d1, f1) in os.walk(patient_path):
            for study_name in sorted(d1):
                study_path = os.path.join(patient_path, study_name)
                m = study_dir_regex.match(study_name)
                if m:
                    img_study_name = m.group(1)
                    for (p2, d2, f2) in os.walk(study_path):
                        if img_study_name in single_studies:
                            for imgdir in sorted(d2):
                                in_dir = os.path.join(study_path, imgdir)
                                ricord_add_patient(in_dir, pid, pcount,
                                                   set_dir, set_count, set_rec,
                                                   mapping_file)
                                pcount += 1
                        else:
                            study_accept_patterns = multi_studies[img_study_name]
                            for imgdir in sorted(d2):
                                for ap in study_accept_patterns:
                                    if ap.match(imgdir):
                                        in_dir = os.path.join(study_path, imgdir)
                                        ricord_add_patient(in_dir, pid, pcount,
                                                           set_dir, set_count, set_rec,
                                                           mapping_file)
                                        pcount += 1
                                        break
                        break
                else:
                    print(f"ERROR: {study_path}")
            break

In [9]:
# final stats
print('Final stats')
print('Train count: ', train_count)
print('Test count: ', test_count)
print('Evaluate count: ', evaluate_count)
print('Total length of train: ', len(train))
print('Total length of test: ', len(test))
print('Total length of evaluate: ', len(evaluate))

Final stats
Train count:  {'normal': 7966, 'pneumonia': 5469, 'COVID-19': 711}
Test count:  {'normal': 885, 'pneumonia': 594, 'COVID-19': 100}
Evaluate count:  {'normal': 0, 'pneumonia': 0, 'COVID-19': 45}
Total length of train:  14146
Total length of test:  1579
Total length of evaluate:  45


In [10]:
# export to train and test csv
# format as patientid, filename, label, separated by a space
def save_samples(samples, filename):
    with open(filename, 'w') as sample_file:
        for sample in samples:
            if len(sample) == 4:
                info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + ' ' + sample[3] + '\n'
            else:
                info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
            sample_file.write(info)

save_samples(train, os.path.join(savepath, "train.txt"))
save_samples(test, os.path.join(savepath, "test.txt"))
save_samples(evaluate, os.path.join(savepath, "evaluate.txt"))

In [11]:
%%bash

ls -1 $SAVE_PATH/train | wc -l
wc -l $SAVE_PATH/train.txt
ls -1 $SAVE_PATH/test | wc -l
wc -l $SAVE_PATH/test.txt
ls -1 $SAVE_PATH/evaluate | wc -l
wc -l $SAVE_PATH/evaluate.txt

14146
14146 /data/datasets/extended_v2/train.txt
1579
1579 /data/datasets/extended_v2/test.txt
45
45 /data/datasets/extended_v2/evaluate.txt
