In [1]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2
import csv
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img , save_img

Using TensorFlow backend.


In [2]:
# set parameters here
savepath = 'data'
seed = 0
np.random.seed(seed) # Reset the seed so all runs are the same.
random.seed(seed)
MAXVAL = 255  # Range [0 255]

# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
imgpath = 'covid-chestxray-dataset-master/images' 
csvpath = 'covid-chestxray-dataset-master/metadata.csv'

# path to https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
kaggle_datapath = 'rsna-pneumonia-detection-challenge'
kaggle_csvname = 'stage_2_detailed_class_info.csv' # get all the normal from here
kaggle_csvname2 = 'stage_2_train_labels.csv' # get all the 1s from here since 1 indicate pneumonia
kaggle_imgpath = 'stage_2_train_images'

# parameters for COVIDx dataset
train = []
test = []
test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

mapping = dict()
mapping['COVID-19'] = 'COVID-19'
mapping['SARS'] = 'pneumonia'
mapping['MERS'] = 'pneumonia'
mapping['Streptococcus'] = 'pneumonia'
mapping['Normal'] = 'normal'
mapping['Lung Opacity'] = 'pneumonia'
mapping['1'] = 'pneumonia'

# train/test split
split = 0.1

In [3]:
# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
csv = pd.read_csv(csvpath, nrows=None)
idx_pa = csv["view"] == "PA"  # Keep only the PA view
csv = csv[idx_pa]

pneumonias = ["COVID-19", "SARS", "MERS", "ARDS", "Streptococcus"]
pathologies = ["Pneumonia","Viral Pneumonia", "Bacterial Pneumonia", "No Finding"] + pneumonias
pathologies = sorted(pathologies)

In [4]:
# get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset
# stored as patient id, image filename and label
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
for index, row in csv.iterrows():
    f = row['finding']
    if f in mapping:
        count[mapping[f]] += 1
        entry = [int(row['patientid']), row['filename'], mapping[f]]
        filename_label[mapping[f]].append(entry)

print('Data distribution from covid-chestxray-dataset:')
print(count)

Data distribution from covid-chestxray-dataset:
{'normal': 0, 'pneumonia': 29, 'COVID-19': 115}


In [6]:
# add covid-chestxray-dataset into COVIDx dataset
# since covid-chestxray-dataset doesn't have test dataset
# split into train/test randomly

for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue
    if key == 'COVID-19':
        continue 
    # split by patients
    # num_diff_patients = len(np.unique(arr[:,0]))
    # num_test = max(1, round(split*num_diff_patients))
    # select num_test number of random patients

    print('Key: ', key)
    print('Test patients: ', test_patients)
    # go through all the patients
    for patient in arr:
        if np.random.choice(True,False,p=[split,1-split]):
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'test',key , patient[1]))
            test.append(patient)
            test_count[patient[2]] += 1
        else:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'train',key , patient[1]))
            train.append(patient)
            train_count[patient[2]] += 1

print('test count: ', test_count)
print('train count: ', train_count)

Key:  pneumonia
Test patients:  ['8', '31']
Key:  COVID-19
Test patients:  ['19', '20', '36', '42', '86']
test count:  {'normal': 0, 'pneumonia': 5, 'COVID-19': 10}
train count:  {'normal': 0, 'pneumonia': 24, 'COVID-19': 105}


In [7]:
# data augmentation for COVID-19 (115x30= 3450)

augmentation_count = 30
print('convid-19 sample: 115')
print('after augmentation', 115*augmentation_count)
datagen = ImageDataGenerator( 
        rotation_range = 15, 
        shear_range = 0.1, 
        zoom_range = 0.1, 
        horizontal_flip = False, 
        brightness_range = (0.8, 1.2)) 

for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0 or key != 'COVID-19':
        continue

    print('Key: ', key)
    #print('Test patients: ', test_patients)
    # go through all the patients
    for patient in arr:
        save_prefix = patient[1]
        save_prefix = save_prefix.replace('.png', '')
        save_prefix = save_prefix.replace('.jpeg', '')
        save_prefix = save_prefix.replace('.jpg', '')
        
        
        test_path = '/test/COVID-19'
        #test.append(patient)
        #test_count[patient[2]] += 1

        train_path = '/train/COVID-19'
        #train.append(patient)
        #train_count[patient[2]] += 1

        img = load_img(imgpath +'/'+ patient[1])
        x = img_to_array(img) 
        x = x.reshape((1, ) + x.shape)
        for i, batch in enumerate(datagen.flow(x, batch_size = 1, save_to_dir =savepath+test_path,
                                            save_prefix =save_prefix, save_format ='jpeg')):
            test_count[patient[2]] += 1
            if i+1 == int(augmentation_count*split):
                break
        
        for i, batch in enumerate(datagen.flow(x, batch_size = 1, save_to_dir =savepath+train_path,
                                            save_prefix =save_prefix, save_format ='jpeg')): 
            train_count[patient[2]] += 1
            if i+1 == int(augmentation_count*(1-split)):
                break
        
print('test count: ', test_count)
print('train count: ', train_count)

Key:  COVID-19
test count:  {'normal': 0, 'pneumonia': 0, 'COVID-19': 401}
train count:  {'normal': 0, 'pneumonia': 0, 'COVID-19': 3475}


In [5]:
# add normal and rest of pneumonia cases from https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
csv_normal = pd.read_csv(os.path.join(kaggle_datapath, kaggle_csvname), nrows=None)
csv_pneu = pd.read_csv(os.path.join(kaggle_datapath, kaggle_csvname2), nrows=None)
patients = {'normal': [], 'pneumonia': []}

augmentation_count = 2
datagen = ImageDataGenerator( 
        rotation_range = 15, 
        shear_range = 0.1, 
        zoom_range = 0.1, 
        horizontal_flip = False, 
        brightness_range = (0.8, 1.2)) 

for index, row in csv_normal.iterrows():
    if row['class'] == 'Normal':
        patients['normal'].append(row['patientId'])

for index, row in csv_pneu.iterrows():
    if int(row['Target']) == 1:
        patients['pneumonia'].append(row['patientId'])

for key in patients.keys():
    arr = np.array(patients[key])
    if arr.size == 0: #or key=='normal':
        continue
    # split by patients 
    #num_diff_patients = len(np.unique(arr))
    #num_test = max(1, round(split*num_diff_patients))
    #print(key, num_test) 
    #test_patients = random.sample(list(np.unique(arr)), num_test) #, download the .npy files from the repo.
    #np.save('rsna_test_patients1_{}.npy'.format(key), np.array(test_patients))
    test_patients = np.load('rsna_test_patients1_{}.npy'.format(key))
    for patient in arr:
        ds = dicom.dcmread(os.path.join(kaggle_datapath, kaggle_imgpath, patient + '.dcm'))
        pixel_array_numpy = ds.pixel_array
        imgname = patient #+ '.png'
        x = pixel_array_numpy.reshape((1, ) + pixel_array_numpy.shape + (1,))
        if patient in test_patients:
            #cv2.imwrite(os.path.join(savepath, 'test',key , imgname), pixel_array_numpy)
            for i, batch in enumerate(datagen.flow(x, batch_size = 1, save_to_dir =savepath+'/test/'+key,
                                            save_prefix =imgname, save_format ='png')):
                if i+1 == augmentation_count:
                    break
            test.append([patient, imgname, key])
            test_count[key] += augmentation_count
        else:
            #cv2.imwrite(os.path.join(savepath, 'train',key , imgname), pixel_array_numpy)
            for i, batch in enumerate(datagen.flow(x, batch_size = 1, save_to_dir =savepath+'/train/'+key,
                                            save_prefix =imgname, save_format ='png')):
                if i+1 == augmentation_count:
                    break
            train.append([patient, imgname, key])
            train_count[key] += augmentation_count

print('test count: ', test_count)
print('train count: ', train_count)

test count:  {'normal': 0, 'pneumonia': 1912, 'COVID-19': 0}
train count:  {'normal': 0, 'pneumonia': 17198, 'COVID-19': 0}


In [8]:
# final stats
print('Final stats')
print('Train count: ', train_count)
print('Test count: ', test_count)
print('Total length of train: ', len(train))
print('Total length of test: ', len(test))

Final stats
Train count:  {'normal': 7966, 'pneumonia': 8623, 'COVID-19': 105}
Test count:  {'normal': 885, 'pneumonia': 961, 'COVID-19': 10}
Total length of train:  16694
Total length of test:  1856


In [None]:
# run this cell when adding in new covid data from covid-chextxray-dataset

# load in current train/test information
'''train_filepath = 'train_split_v2.txt'
test_filepath = 'test_split_v2.txt'
file = open(train_filepath, 'r') 
trainfiles = file.readlines() 
trainfiles = np.array([line.split() for line in trainfiles])
file = open(test_filepath, 'r')
testfiles = file.readlines()
testfiles = np.array([line.split() for line in testfiles])

# find the new entries in csv 
new_entries = []
for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue
    for patient in arr:
        if patient[1] not in trainfiles and patient[1] not in testfiles:
            # if key is normal, bacteria or viral add to train folder
            if key in ['normal', 'pneumonia']:
                copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'train', patient[1]))
                train.append(patient)
                train_count[patient[2]] += 1
            else: 
                new_entries.append(patient)
new_entries = np.array(new_entries)

# 10% of new entries should go into in test
if new_entries.size > 0:
    num_diff_patients = len(np.unique(new_entries[:,0]))
    num_test = max(1, round(split*num_diff_patients))

    i = 0
    used_i = []
    # insert patients who are already in dataset into the respective train/test
    for patient in new_entries:
        if patient[0] in trainfiles:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'train', patient[1]))
            train.append(patient)
            train_count[patient[2]] += 1
            used_i.append(i)
        elif patient[0] in testfiles:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'test', patient[1]))
            test.append(patient)
            test_count[patient[2]] += 1
            used_i.append(i)
        i += 1
    # delete patients who have already been inserted
    new_entries = np.delete(new_entries, used_i, axis=0)

    # select num_test number of random patients
    test_patients = random.sample(list(new_entries[:,0]), num_test)
    print('test patients: ', test_patients)
    # add to respective train/test folders
    for patient in new_entries:
        if patient[0] in test_patients:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'test', patient[1]))
            test.append(patient)
            test_count[patient[2]] += 1
        else:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'train', patient[1]))
            train.append(patient)
            train_count[patient[2]] += 1

print('added test count: ', test_count)
print('added train count: ', train_count)'''

In [35]:
with open('train_split.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['patientid', 'filename', 'label'])
    for sample in train:
        writer.writerow(sample)
        
with open('test_split.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['patientid', 'filename', 'label'])
    for sample in test:
        writer.writerow(sample)