### Chest-Xray Dataset Builder

 - To generate the X-ray datasets, clone the repositories listed below:

    git clone https://github.com/ieee8023/covid-chestxray-dataset.git  
    git clone https://github.com/agchung/Figure1-COVID-chestxray-dataset.git  
    git clone https://github.com/agchung/Actualmed-COVID-chestxray-dataset.git  
    

 - Go to this <a href="https://www.kaggle.com/tawsifurrahman/covid19-radiography-database"> link </a> to download the COVID-19 Radiography database. Only the COVID-19 image folder and the metadata file are required. Overlaps with covid-chestxray-dataset are handled.  
 - Go to this <a href="https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data"> link </a> to download the RSNA pneumonia dataset.
 - Create a `datasets` directory and, inside it, create `covid` and `normal` directories.

In [None]:
! git clone https://github.com/ieee8023/covid-chestxray-dataset.git

In [None]:
! git clone https://github.com/agchung/Figure1-COVID-chestxray-dataset.git

In [None]:
! git clone https://github.com/agchung/Actualmed-COVID-chestxray-dataset.git

### Note :
1. Download the dataset from https://www.kaggle.com/tawsifurrahman/covid19-radiography-database and save the zip archive in the same directory as this notebook.
2. Download the dataset from https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data and save the zip archive in the same directory as this notebook.

### Install Libraries

In [None]:
! pip install kubeflow-fairing==0.7.2 kfserving xlrd pandas opencv-python pydicom pillow scikit-learn imutils minio kubernetes --user

### Restart Notebook Kernel

In [None]:
from IPython.display import display_html
# Emit a script tag asking the classic Jupyter front end to restart the kernel
# (presumably so the packages installed in the previous cell are picked up).
display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2
import shutil
import logging
import yaml
from minio import Minio
from zipfile import ZipFile
from kubernetes import client as k8s_client
from kubernetes.client import rest as k8s_rest
from kubernetes import config as k8s_config
from kubernetes.client.rest import ApiException
from kubeflow.fairing.cloud.k8s import MinioUploader
from kubeflow.fairing.builders.cluster.minio_context import MinioContextSource

In [None]:
! unzip "576013_1042828_compressed_COVID-19 Radiography Database.zip"

In [None]:
# ! rm -rf "576013_1042828_compressed_COVID-19 Radiography Database.zip"

In [None]:
! unzip "rsna-pneumonia-detection-challenge.zip"

In [None]:
# ! rm -rf "rsna-pneumonia-detection-challenge.zip"

In [None]:
# Seed both NumPy's and the stdlib's random generators.
seed = 0
np.random.seed(seed) # Reset the seed so all runs are the same.
random.seed(seed)
MAXVAL = 255  # Range [0 255]; not referenced by the cells visible in this notebook

# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
cohen_imgpath = 'covid-chestxray-dataset/images' 
cohen_csvpath = 'covid-chestxray-dataset/metadata.csv'

# path to covid-19 dataset from https://github.com/agchung/Figure1-COVID-chestxray-dataset
fig1_imgpath = 'Figure1-COVID-chestxray-dataset/images'
fig1_csvpath = 'Figure1-COVID-chestxray-dataset/metadata.csv'

# path to covid-19 dataset from https://github.com/agchung/Actualmed-COVID-chestxray-dataset
actmed_imgpath = 'Actualmed-COVID-chestxray-dataset/images'
actmed_csvpath = 'Actualmed-COVID-chestxray-dataset/metadata.csv'

# path to covid-19 dataset from https://www.kaggle.com/tawsifurrahman/covid19-radiography-database
# (image folder and metadata spreadsheet as extracted from the Kaggle zip)
sirm_imgpath = 'COVID-19'
sirm_csvpath = 'COVID-19.metadata.xlsx'



In [None]:
# -p keeps this cell re-runnable when the directory already exists.
! mkdir -p rsna-pneumonia-detection-challenge

In [None]:
! mv stage_2_detailed_class_info.csv rsna-pneumonia-detection-challenge

In [None]:
! mv stage_2_train_labels.csv rsna-pneumonia-detection-challenge
! mv stage_2_train_images rsna-pneumonia-detection-challenge

In [None]:
# -p keeps this cell re-runnable when the directory already exists.
! mkdir -p datasets

In [None]:
# One output folder per exported class; -p creates missing parents and
# tolerates directories left over from a previous run.
! mkdir -p datasets/covid
! mkdir -p datasets/normal

In [None]:
# Root of the extracted https://www.kaggle.com/c/rsna-pneumonia-detection-challenge data.
rsna_datapath = 'rsna-pneumonia-detection-challenge'
# Class-info table: the "Normal" patients are taken from here.
rsna_csvname = 'stage_2_detailed_class_info.csv'
# Label table: Target == 1 marks pneumonia. Images that are neither pneumonia
# nor normal are labelled 0 as well, so only the 1s are used from this file.
rsna_csvname2 = 'stage_2_train_labels.csv'
rsna_imgpath = 'stage_2_train_images'

# COVIDx bookkeeping: records assigned to each split and per-label tallies.
train, test = [], []
test_count = {label: 0 for label in ('normal', 'pneumonia', 'COVID-19')}
train_count = {label: 0 for label in ('normal', 'pneumonia', 'COVID-19')}

# Raw finding / class value -> one of the three output labels.
mapping = {
    'COVID-19': 'COVID-19',
    'SARS': 'pneumonia',
    'MERS': 'pneumonia',
    'Streptococcus': 'pneumonia',
    'Klebsiella': 'pneumonia',
    'Chlamydophila': 'pneumonia',
    'Legionella': 'pneumonia',
    'Normal': 'normal',
    'Lung Opacity': 'pneumonia',
    '1': 'pneumonia',
}

# Train/test split fraction.
split = 0.1

# patient id -> image filenames already handled; used to skip duplicates.
patient_imgpath = {}

In [None]:
# Cohen (ieee8023) metadata, restricted to the PA/AP views listed below
# (originally only "PA" was kept).
cohen_csv = pd.read_csv(cohen_csvpath)
views = ["PA", "AP", "AP Supine", "AP semi erect", "AP erect"]
cohen_idx_keep = cohen_csv['view'].isin(views)
cohen_csv = cohen_csv.loc[cohen_idx_keep]

# Figure1 metadata is read as ISO-8859-1 rather than the default encoding.
fig1_csv = pd.read_csv(fig1_csvpath, encoding='ISO-8859-1')
actmed_csv = pd.read_csv(actmed_csvpath)

# SIRM / Kaggle radiography metadata ships as an Excel sheet.
sirm_csv = pd.read_excel(sirm_csvpath)


In [None]:
# Collect one [patient id, image filename, label, source dataset] entry per
# usable image, bucketed by output label.
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
# Entries recorded per label across the four COVID sources.
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
# COVID-19 patient ids grouped by the dataset they came from.
covid_ds = {'cohen': [], 'fig1': [], 'actmed': [], 'sirm': []}

for index, row in cohen_csv.iterrows():
    # Same missing-value guard as the fig1/actmed loops: a NaN finding is a
    # float and has no .split().
    if str(row['finding']) == 'nan':
        continue
    f = row['finding'].split(',')[0] # take the first finding, for the case of COVID-19, ARDS
    if f in mapping:
        count[mapping[f]] += 1
        entry = [str(row['patientid']), row['filename'], mapping[f], 'cohen']
        filename_label[mapping[f]].append(entry)
        if mapping[f] == 'COVID-19':
            covid_ds['cohen'].append(str(row['patientid']))

for index, row in fig1_csv.iterrows():
    if str(row['finding']) == 'nan':
        continue
    f = row['finding'].split(',')[0] # take the first finding
    if f not in mapping:
        continue
    # Image files are named after the patient id; try .jpg first, then .png.
    imagename = None
    for ext in ('.jpg', '.png'):
        candidate = row['patientid'] + ext
        if os.path.exists(os.path.join(fig1_imgpath, candidate)):
            imagename = candidate
            break
    if imagename is None:
        # Without this guard the previous row's entry would be appended again
        # (or a NameError raised on the very first row).
        logging.warning("No .jpg/.png image found for fig1 patient %s; skipping", row['patientid'])
        continue
    count[mapping[f]] += 1
    entry = [row['patientid'], imagename, mapping[f], 'fig1']
    filename_label[mapping[f]].append(entry)
    if mapping[f] == 'COVID-19':
        covid_ds['fig1'].append(row['patientid'])

for index, row in actmed_csv.iterrows():
    if not str(row['finding']) == 'nan':
        f = row['finding'].split(',')[0]
        if f in mapping:
            count[mapping[f]] += 1
            entry = [row['patientid'], row['imagename'], mapping[f], 'actmed']
            filename_label[mapping[f]].append(entry)
            if mapping[f] == 'COVID-19':
                covid_ds['actmed'].append(row['patientid'])

sirm = set(sirm_csv['URL'])
cohen = set(cohen_csv['url'])
# SIRM image numbers (the value inside the parentheses of FILE NAME) that are
# excluded outright -- presumably known duplicates; confirm against upstream.
discard = ['100', '101', '102', '103', '104', '105', 
           '110', '111', '112', '113', '122', '123', 
           '124', '125', '126', '217']

for idx, row in sirm_csv.iterrows():
    patientid = row['FILE NAME']
    # Skip images whose URL already appears in the Cohen metadata, and the
    # explicitly discarded image numbers.
    if row['URL'] not in cohen and patientid[patientid.find('(')+1:patientid.find(')')] not in discard:
        count[mapping['COVID-19']] += 1
        imagename = patientid + '.' + row['FORMAT'].lower()
        if not os.path.exists(os.path.join(sirm_imgpath, imagename)):
            # Fall back to the on-disk variant with a space before the "(".
            imagename = patientid.split('(')[0] + ' ('+ patientid.split('(')[1] + '.' + row['FORMAT'].lower()
        entry = [patientid, imagename, mapping['COVID-19'], 'sirm']
        filename_label[mapping['COVID-19']].append(entry)
        covid_ds['sirm'].append(patientid)

In [None]:
# Show how many images each label received from the COVID sources.
print('Data distribution from covid datasets:', count, sep='\n')

### Grouping Covid Xrays

In [None]:

# Source dataset name -> directory holding its images.
ds_imgpath = {'cohen': cohen_imgpath, 'fig1': fig1_imgpath, 'actmed': actmed_imgpath, 'sirm': sirm_imgpath}

# Export every COVID-19 image into datasets/covid and record each entry in the
# train or test list according to a fixed, hand-picked set of test patients.
for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue
    # The split is by patient id (not by image) and is hard-coded instead of
    # being drawn at random, e.g. via:
    #   num_diff_patients = len(np.unique(arr[:,0]))
    #   num_test = max(1, round(split*num_diff_patients))
    #   random.sample(list(arr[:,0]), num_test)
    if key == 'pneumonia':
        test_patients = ['8', '31']
    elif key == 'COVID-19':
        # NOTE: every element needs its trailing comma; adjacent string
        # literals are silently concatenated ('191' 'COVID-00024' would become
        # the single id '191COVID-00024').
        test_patients = ['19', '20', '36', '42', '86', 
                         '94', '97', '117', '132', 
                         '138', '144', '150', '163', '169', '174', '175', '179', '190', '191',
                         'COVID-00024', 'COVID-00025', 'COVID-00026', 'COVID-00027', 'COVID-00029',
                         'COVID-00030', 'COVID-00032', 'COVID-00033', 'COVID-00035', 'COVID-00036',
                         'COVID-00037', 'COVID-00038',
                         'ANON24', 'ANON45', 'ANON126', 'ANON106', 'ANON67',
                         'ANON153', 'ANON135', 'ANON44', 'ANON29', 'ANON201', 
                         'ANON191', 'ANON234', 'ANON110', 'ANON112', 'ANON73', 
                         'ANON220', 'ANON189', 'ANON30', 'ANON53', 'ANON46',
                         'ANON218', 'ANON240', 'ANON100', 'ANON237', 'ANON158',
                         'ANON174', 'ANON19', 'ANON195',
                         'COVID-19(119)', 'COVID-19(87)', 'COVID-19(70)', 'COVID-19(94)', 
                         'COVID-19(215)', 'COVID-19(77)', 'COVID-19(213)', 'COVID-19(81)', 
                         'COVID-19(216)', 'COVID-19(72)', 'COVID-19(106)', 'COVID-19(131)', 
                         'COVID-19(107)', 'COVID-19(116)', 'COVID-19(95)', 'COVID-19(214)', 
                         'COVID-19(129)']
    else: 
        test_patients = []
    print('Key: ', key)
    print('Test patients: ', test_patients)
    # Each row is [patient id, image filename, label, source dataset].
    for patient in arr:
        if patient[0] not in patient_imgpath:
            patient_imgpath[patient[0]] = [patient[1]]
        else:
            if patient[1] not in patient_imgpath[patient[0]]:
                patient_imgpath[patient[0]].append(patient[1])
            else:
                continue  # skip since image has already been written

        # The export step is identical for train and test entries, so it is
        # done once here; only the bookkeeping below differs.
        src_path = os.path.join(ds_imgpath[patient[3]], patient[1])
        if patient[3] == 'sirm':
            # SIRM images are converted to grayscale and saved under a
            # filename with the spaces removed.
            image = cv2.imread(src_path)
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            patient[1] = patient[1].replace(' ', '')
            cv2.imwrite(os.path.join('datasets', 'covid', patient[1]), gray)
        elif patient[2] == 'COVID-19':
            # Only COVID-19 images are exported; pneumonia entries from these
            # sources are counted below but not copied anywhere.
            copyfile(src_path, os.path.join('datasets', 'covid', patient[1]))

        if patient[0] in test_patients:
            test.append(patient)
            test_count[patient[2]] += 1
        else:
            train.append(patient)
            train_count[patient[2]] += 1

print('test count: ', test_count)
print('train count: ', train_count)

### Grouping Normal Xrays

In [None]:
# RSNA tables: class info (source of "Normal") and labels (Target == 1 is pneumonia).
csv_normal = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname))
csv_pneu = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname2))

# Patient ids per label, in file order (ids may repeat; duplicates are skipped below).
patients = {
    'normal': csv_normal.loc[csv_normal['class'] == 'Normal', 'patientId'].tolist(),
    'pneumonia': csv_pneu.loc[csv_pneu['Target'].astype(int) == 1, 'patientId'].tolist(),
}

for key, patient_ids in patients.items():
    arr = np.array(patient_ids)
    if arr.size == 0:
        continue
    # Fixed test split, loaded from the .npy files shipped with the repo
    # (originally produced with random.sample(list(arr), num_test) and np.save).
    test_patients = np.load('rsna_test_patients_{}.npy'.format(key))
    for patient in arr:
        if patient in patient_imgpath:
            continue  # skip since image has already been written
        patient_imgpath[patient] = [patient]

        # Decode the DICOM file; only 'normal' images are written out as PNG,
        # pneumonia images are merely tallied.
        ds = dicom.dcmread(os.path.join(rsna_datapath, rsna_imgpath, patient + '.dcm'))
        pixel_array_numpy = ds.pixel_array
        imgname = patient + '.png'
        if key == 'normal':
            cv2.imwrite(os.path.join('datasets', 'normal', imgname), pixel_array_numpy)

        record = [patient, imgname, key, 'rsna']
        if patient in test_patients:
            test.append(record)
            test_count[key] += 1
        else:
            train.append(record)
            train_count[key] += 1

print('test count: ', test_count)
print('train count: ', train_count)

In [None]:
import shutil

# Pack the contents of ./datasets into dataset.zip in the working directory.
shutil.make_archive(base_name="dataset", format='zip', root_dir="datasets")

## Connect to minio service using credentials

In [None]:
# Look up the cluster IP of the kubeflow MinIO service from inside the cluster
# and derive the endpoint strings used by the cells below.
k8s_config.load_incluster_config()
api_client = k8s_client.CoreV1Api()
minio_service_endpoint = None

try:
    minio_service_endpoint = api_client.read_namespaced_service(name='minio-service', namespace='kubeflow').spec.cluster_ip
except ApiException as e:
    if e.status == 403:
        logging.warning(f"The service account doesn't have sufficient privileges "
                      f"to get the kubeflow minio-service. "
                      f"You will have to manually enter the minio cluster-ip. "
                      f"To make this function work ask someone with cluster "
                      f"priveleges to create an appropriate "
                      f"clusterrolebinding by running a command.\n"
                      f"kubectl create --namespace=kubeflow rolebinding "
                       "--clusterrole=kubeflow-view "
                       "--serviceaccount=${NAMESPACE}:default-editor "
                       "${NAMESPACE}-minio-view")
        # f-prefix is required, otherwise the literal text "{e.reason}" is logged.
        logging.error(f"API access denied with reason: {e.reason}")
    else:
        # Other API failures used to be swallowed silently.
        logging.error(f"Could not read the kubeflow minio-service: {e.reason}")

if minio_service_endpoint is None:
    # Fail with an actionable message instead of a TypeError on None + ":9000".
    raise RuntimeError("MinIO cluster IP could not be discovered; "
                       "set minio_service_endpoint manually before building the endpoint URLs.")

s3_endpoint = minio_service_endpoint
s3_endPoint = s3_endpoint+":9000"       # host:port form, used by the Minio client
minio_endpoint = "http://"+s3_endPoint  # URL form, used by MinioUploader
# NOTE(review): credentials are hard-coded in the notebook; these look like
# the stock MinIO defaults -- confirm before using outside a demo cluster.
minio_username = "minio"
minio_key = "minio123"
minio_region = "us-east-1"
print(minio_endpoint)

## Define MinIO uploader

In [None]:
# Uploader bound to the discovered MinIO endpoint and the credentials above.
minio_uploader = MinioUploader(
    endpoint_url=minio_endpoint,
    minio_secret=minio_username,
    minio_secret_key=minio_key,
    region_name=minio_region,
)

## Create a MinIO bucket

In [None]:
# Bucket that will hold the zipped image dataset.
minio_bucket = "imgzip"
minio_uploader.create_bucket(minio_bucket)
logging.info("Bucket %s created or already exists", minio_bucket)

## Create a Minioclient object

In [None]:
# Plain-HTTP Minio client for the same host:port endpoint. The credentials are
# taken from the variables defined alongside the endpoint so this client and
# the uploader cannot drift apart.
minioClient = Minio(s3_endPoint,
                    access_key=minio_username,
                    secret_key=minio_key,
                    secure=False)
minioClient

## Upload zip file of images into minio bucket

In [None]:

# Upload dataset.zip under the same object name and show the client's response.
zip_name = 'dataset.zip'
upload_result = minioClient.fput_object(minio_bucket, zip_name, zip_name)
print(upload_result)


## List objects within minio bucket

In [None]:
# List the bucket and remember the key of its first object for the download step.
model_response = minio_uploader.client.list_objects(Bucket=minio_bucket)
bucket_objects = model_response['Contents']
obj_key = bucket_objects[0]['Key']
print(obj_key)

## Download zip file from minio bucket

In [None]:
# Earlier variant that fetched separate covid/normal archives, kept for reference:
# minioClient.fget_object(minio_bucket, covid_obj_key, 'covid_dl.zip')
# minioClient.fget_object(minio_bucket, normal_obj_key, 'normal_dl.zip')
# Fetch the object listed above and store it locally as dataset_dl.zip.
minioClient.fget_object(minio_bucket, obj_key, 'dataset_dl.zip')

## Delete minio bucket & contents

In [None]:
# Remove every object from the bucket, then the bucket itself.
model_response = minio_uploader.client.list_objects(Bucket=minio_bucket)

# The listing of an empty bucket carries no 'Contents' key, so default to [].
obj_list = [{'Key': obj['Key']} for obj in model_response.get('Contents', [])]

# Issue the bulk delete only when there is something to delete.
if obj_list:
    minio_uploader.client.delete_objects(Bucket=minio_bucket, Delete={'Objects' : obj_list})
minio_uploader.client.delete_bucket(Bucket=minio_bucket)

In [None]:
# Look up the key of the first object in the bucket again.
# NOTE(review): this needs the bucket to still exist, so it appears to be meant
# for a run where the delete cell above was skipped -- confirm intended order.
list_response = minio_uploader.client.list_objects(Bucket=minio_bucket)
obj_key = list_response['Contents'][0]['Key']



In [None]:
# Fetch the dataset archive from the MinIO bucket into the working directory.
minioClient.fget_object(minio_bucket, obj_key, 'dataset_dl.zip')

# Unpack the whole archive into /home/jovyan/dataset.
with ZipFile('dataset_dl.zip', 'r') as archive:
    archive.extractall('/home/jovyan/dataset')
