# Data collector for COVID-19 lung X-ray dataset

Adapted from https://github.com/lindawangg/COVID-Net/blob/master/create_COVIDx_v2.ipynb

In [0]:
pip install pydicom



In [0]:
import os
import random
from shutil import copyfile

import cv2
import numpy as np
import pandas as pd
import pydicom as dicom
import requests

In [0]:
# set parameters here
# root output directory; images are saved under <savepath>/{train,test}/<label>/
savepath = '../covid19-xrays/'

# path to covid-19 dataset
# Use raw.githubusercontent.com so requests return the actual file bytes; the
# github.com/.../tree/master/... URLs serve HTML pages instead of the files.
# imgpath ends with '/' because image filenames are appended directly to it.
imgpath = 'https://raw.githubusercontent.com/ieee8023/covid-chestxray-dataset/master/images/'
csvpath = 'https://raw.githubusercontent.com/ieee8023/covid-chestxray-dataset/master/metadata.csv'

# parameters for COVIDx dataset
# maps a value of the metadata 'finding' column to its COVIDx class label
mapping = {
    'COVID-19': 'COVID-19',
    'SARS': 'pneumonia',
    'MERS': 'pneumonia',
    'Streptococcus': 'pneumonia',
    'Normal': 'normal',
    'Lung Opacity': 'pneumonia',
    '1': 'pneumonia',
}


{'COVID-19': 'COVID-19', 'SARS': 'pneumonia', 'MERS': 'pneumonia', 'Streptococcus': 'pneumonia', 'Normal': 'normal', 'Lung Opacity': 'pneumonia', '1': 'pneumonia'}


In [0]:
# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
# Load the dataset metadata and retain only the rows whose view is PA.
csv = pd.read_csv(csvpath, engine="python")
idx_pa = csv["view"].eq("PA")
csv = csv.loc[idx_pa]

# Alphabetically sorted list of pathology names (printed for reference).
pneumonias = ["COVID-19", "SARS", "MERS", "ARDS", "Streptococcus"]
pathologies = sorted(
    ["Pneumonia", "Viral Pneumonia", "Bacterial Pneumonia", "No Finding"] + pneumonias
)

print(pathologies)

['ARDS', 'Bacterial Pneumonia', 'COVID-19', 'MERS', 'No Finding', 'Pneumonia', 'SARS', 'Streptococcus', 'Viral Pneumonia']


In [0]:
# Collect the non-COVID19 viral, bacterial, and COVID-19 cases from covid-chestxray-dataset.
# Each entry is stored as [patient id, image filename, label], grouped by label.
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = dict.fromkeys(filename_label, 0)
for _, record in csv.iterrows():
    finding = record['finding']
    if finding not in mapping:
        continue  # rows whose finding has no COVIDx label are ignored
    label = mapping[finding]
    count[label] += 1
    filename_label[label].append([int(record['patientid']), record['filename'], label])

print('Data distribution from covid-chestxray-dataset:')
print(count)
print(filename_label)
print(filename_label['pneumonia'])

Data distribution from covid-chestxray-dataset:
{'normal': 0, 'pneumonia': 24, 'COVID-19': 103}
{'normal': [], 'pneumonia': [[3, 'SARS-10.1148rg.242035193-g04mr34g0-Fig8a-day0.jpeg', 'pneumonia'], [3, 'SARS-10.1148rg.242035193-g04mr34g0-Fig8b-day5.jpeg', 'pneumonia'], [3, 'SARS-10.1148rg.242035193-g04mr34g0-Fig8c-day10.jpeg', 'pneumonia'], [7, 'SARS-10.1148rg.242035193-g04mr34g04a-Fig4a-day7.jpeg', 'pneumonia'], [7, 'SARS-10.1148rg.242035193-g04mr34g04b-Fig4b-day12.jpeg', 'pneumonia'], [8, 'SARS-10.1148rg.242035193-g04mr34g05x-Fig5-day9.jpeg', 'pneumonia'], [9, 'SARS-10.1148rg.242035193-g04mr34g07a-Fig7a-day5.jpeg', 'pneumonia'], [9, 'SARS-10.1148rg.242035193-g04mr34g07b-Fig7b-day12.jpeg', 'pneumonia'], [10, 'SARS-10.1148rg.242035193-g04mr34g09a-Fig9a-day17.jpeg', 'pneumonia'], [10, 'SARS-10.1148rg.242035193-g04mr34g09b-Fig9b-day19.jpeg', 'pneumonia'], [10, 'SARS-10.1148rg.242035193-g04mr34g09c-Fig9c-day27.jpeg', 'pneumonia'], [29, 'streptococcus-pneumoniae-pneumonia-1.jpg', 'pneumonia

In [0]:
# add covid-chestxray-dataset into COVIDx dataset
# since covid-chestxray-dataset doesn't have test dataset
# split into train/test by patientid
# for COVIDx:
# patient 8 is used as non-COVID19 viral test
# patient 31 is used as bacterial test
# patients 19, 20, 36, 42, 86 are used as COVID-19 viral test

# Base URL for the image downloads, normalised to end in exactly one '/' so
# that a filename can be appended to it directly.
git_url = imgpath.rstrip('/') + '/'

# Patient ids held out for the test split, per label. The ids are strings
# because np.array() over the mixed [int, str, str] entries yields a string array.
test_patients_by_label = {
    'pneumonia': ['8', '31'],
    'COVID-19': ['19', '20', '36', '42', '86'],
}

failed_downloads = []  # (url, reason) for every image that could not be saved

for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue

    # labels without a hard-coded test set go entirely to train
    test_patients = test_patients_by_label.get(key, [])
    print('Key: ', key)
    print('Test patients: ', test_patients)

    # go through all the patients
    for patient in arr:
        split = 'test' if patient[0] in test_patients else 'train'

        # open() does not create missing folders, so make sure the target exists
        out_dir = os.path.join(savepath, split, key)
        os.makedirs(out_dir, exist_ok=True)
        pth = os.path.join(out_dir, patient[1])

        url = git_url + patient[1]
        response = requests.get(url, timeout=30)
        content_type = response.headers.get('Content-Type', '')

        # Only save real file payloads: an error status or an HTML page
        # (e.g. a GitHub web view) must not be written out as an image.
        if response.status_code != 200 or content_type.startswith('text/html'):
            failed_downloads.append((url, '%s %s' % (response.status_code, content_type)))
            continue

        with open(pth, 'wb') as f:
            f.write(response.content)

# report skipped images instead of dropping them silently
if failed_downloads:
    print('Failed to download %d image(s):' % len(failed_downloads))
    for url, reason in failed_downloads:
        print('  ', url, '->', reason)

Key:  pneumonia
Test patients:  ['8', '31']
Key:  COVID-19
Test patients:  ['19', '20', '36', '42', '86']
