In [1]:
import os
import pydicom
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Sample DICOM file used to check that the dataset can be read.
file=r"COVID-19-AR/manifest-1594658036421/COVID-19-AR/COVID-19-AR-16406488/02-14-2012-NA-CT PE CHEST-63916/2.000000-locator-16446/1-1.dcm"
# Alternative samples (paths under "XR CHEST AP PORTABLE" studies); uncomment one to inspect it instead.
# file=r"COVID-19-AR/manifest-1594658036421/COVID-19-AR/COVID-19-AR-16445151/03-08-2012-NA-XR CHEST AP PORTABLE-51919/1.000000-AP-45634/1-1.dcm"
# file=r"COVID-19-AR/manifest-1594658036421/COVID-19-AR/COVID-19-AR-16434395/03-03-2012-NA-XR CHEST AP PORTABLE-54576/1.000000-AP-50475/1-1.dcm"
# file=r"COVID-19-AR/manifest-1594658036421/COVID-19-AR/COVID-19-AR-16406545/01-17-2012-NA-XR CHEST AP PORTABLE-43479/1.000000-AP-36614/1-1.dcm"

# Parse the file; the last expression displays the shape of its pixel array.
ds = pydicom.dcmread(file)
ds.pixel_array.shape

(512, 512)

In [3]:
# plt.figure(figsize=(10, 10))
# plt.imshow(ds.pixel_array, cmap=plt.cm.bone)
# plt.show()

In [4]:
# Manifest CSV; later cells read its 'Subject ID', 'Manufacturer', 'Number of Images',
# 'File Location' and 'SOP Class Name' columns.
metadata=pd.read_csv('COVID-19-AR/manifest-1594658036421/metadata.csv')

In [5]:
# Clinical spreadsheet; supplies the 'PATIENT_ID' and 'ICU Admit' columns used for labelling.
clinical=pd.read_excel(r'COVID-19 Clinical.xlsx')

In [6]:
# Lookup table: patient ID -> 'ICU Admit' value. If a patient ID appears more than
# once in the sheet, the last row wins (plain dict semantics).
SubjectIDtoICUAdmit = dict(zip(clinical['PATIENT_ID'], clinical['ICU Admit']))

In [7]:
def readfiles(path):
    """Return the paths of all files found under ``path``, recursively.

    Each entry is the directory reported by ``os.walk`` joined with the file
    name, so results are rooted at ``path`` exactly as it was passed in.
    No filtering by extension is done here; callers filter as needed.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

In [8]:
# Group DICOM file paths by ICU label, keeping Philips series apart from
# series of every other manufacturer.
#   dataDic_PMS:    label -> list of .dcm paths from rows with Manufacturer == "Philips"
#   dataDic_others: label -> list of .dcm paths from all remaining rows
dataDic_PMS = {}
dataDic_others = {}
for _, series_row in metadata.iterrows():
    icu_label = SubjectIDtoICUAdmit[series_row['Subject ID']]  # ICU Admit 'Y' or 'N'
    expected_count = series_row['Number of Images']

    # Build the directory of this row relative to the notebook and convert any
    # backslashes to forward slashes.
    series_dir = 'COVID-19-AR/manifest-1594658036421/' + os.path.relpath(series_row['File Location'])
    series_dir = series_dir.replace('\\', '/')
    dicom_files = [p for p in readfiles(series_dir) if p.endswith('.dcm')]

    # Report rows whose on-disk file count disagrees with the manifest.
    if len(dicom_files) != expected_count:
        print("len(files)!=nums")
        print('csv_nums: ', expected_count, ', actual_nums: ', len(dicom_files))
        print(dicom_files)

    bucket = dataDic_PMS if series_row['Manufacturer'] == "Philips" else dataDic_others
    bucket.setdefault(icu_label, []).extend(dicom_files)

In [9]:
# Start a fresh path/label accumulation for the Philips subset.
# The position of each key in `labels` is used as its numeric class id.
dataPaths, num_labels = [], []
labels = list(dataDic_PMS)
print("labels: ", labels)

labels:  ['N', 'Y']


In [10]:
# Flatten the Philips groups into parallel lists of file paths and class ids.
# This appends to the existing lists, so re-running the cell on its own
# duplicates the entries.
for key, paths in dataDic_PMS.items():
    dataPaths.extend(paths)
    class_id = labels.index(key)
    num_labels.extend([class_id] * len(paths))

In [11]:
# One manifest line per file: "<path>------<class id>".
train_str_list = [
    '%s------%d' % (path, num_labels[pos])
    for pos, path in enumerate(dataPaths)
]

In [12]:
# Write the complete Philips listing, one entry per line (no trailing newline).
manifest_text = '\n'.join(train_str_list)
with open('PMS_dataset_all.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write(manifest_text)

## -----------------------------------------------------------------------------------

In [13]:
# Start a fresh path/label accumulation for the non-Philips subset.
# The position of each key in `labels` is used as its numeric class id.
dataPaths, num_labels = [], []
labels = list(dataDic_others)
print("labels: ", labels)

labels:  ['N', 'Y']


In [14]:
# Flatten the non-Philips groups into parallel lists of file paths and class ids.
# This appends to the existing lists, so re-running the cell on its own
# duplicates the entries.
for key, paths in dataDic_others.items():
    dataPaths.extend(paths)
    class_id = labels.index(key)
    num_labels.extend([class_id] * len(paths))

In [15]:
# One manifest line per file: "<path>------<class id>".
train_str_list = [
    '%s------%d' % (path, num_labels[pos])
    for pos, path in enumerate(dataPaths)
]

In [16]:
# Write the complete non-Philips listing, one entry per line (no trailing newline).
manifest_text = '\n'.join(train_str_list)
with open('others_dataset_all.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write(manifest_text)

## ------------------------------------------------------------------------

In [17]:
# Reset the accumulators and rebuild the Philips label order before splitting.
dataPaths, num_labels = [], []
labels = list(dataDic_PMS)
print("labels: ", labels)

labels:  ['N', 'Y']


In [18]:
# Flatten the Philips groups into parallel path / class-id lists for splitting.
# This appends to the existing lists, so re-running the cell on its own
# duplicates the entries.
for key, paths in dataDic_PMS.items():
    dataPaths.extend(paths)
    class_id = labels.index(key)
    num_labels.extend([class_id] * len(paths))

In [19]:
# Hold out 20% of the Philips files, then halve the hold-out into validation and
# test (80/10/10 overall). A fixed random_state keeps the partition, and the
# files written from it, identical across kernel restarts.
# NOTE(review): the split is over individual file paths, so files belonging to
# the same subject can end up in different partitions; consider a group-aware
# split if that matters downstream.
x_train, x_test, y_train, y_test = train_test_split(
    dataPaths, num_labels, test_size=0.2, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=0.5, random_state=42
)

In [20]:
# Per-class sample counts for the whole Philips set and for each partition.
print("PMS")
class_ids = range(len(labels))
for caption, split_labels in (
    ("all:   ", num_labels),
    ("train: ", y_train),
    ("test:   ", y_test),
    ("val:   ", y_val),
):
    print(caption, [split_labels.count(cid) for cid in class_ids])

PMS
all:    [20197, 8513]
train:  [16189, 6779]
test:    [2024, 847]
val:    [1984, 887]


In [21]:
# Serialise the Philips training partition as "<path>------<class id>" lines.
train_str_list = [
    '%s------%d' % (path, y_train[pos])
    for pos, path in enumerate(x_train)
]

with open('PMS_dataset_train.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(train_str_list))

In [22]:
# Serialise the Philips test partition as "<path>------<class id>" lines.
test_str_list = [
    '%s------%d' % (path, y_test[pos])
    for pos, path in enumerate(x_test)
]

with open('PMS_dataset_test.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(test_str_list))

In [23]:
# Serialise the Philips validation partition as "<path>------<class id>" lines.
val_str_list = [
    '%s------%d' % (path, y_val[pos])
    for pos, path in enumerate(x_val)
]

with open('PMS_dataset_val.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(val_str_list))

## ----------------------------------------------------------------

In [24]:
# Reset the accumulators and rebuild the non-Philips label order before splitting.
dataPaths, num_labels = [], []
labels = list(dataDic_others)
print("labels: ", labels)

labels:  ['N', 'Y']


In [25]:
# Flatten the non-Philips groups into parallel path / class-id lists for splitting.
# This appends to the existing lists, so re-running the cell on its own
# duplicates the entries.
for key, paths in dataDic_others.items():
    dataPaths.extend(paths)
    class_id = labels.index(key)
    num_labels.extend([class_id] * len(paths))

In [26]:
# Hold out 20% of the non-Philips files, then halve the hold-out into validation
# and test (80/10/10 overall). A fixed random_state keeps the partition, and the
# files written from it, identical across kernel restarts.
# NOTE(review): the split is over individual file paths, so files belonging to
# the same subject can end up in different partitions; consider a group-aware
# split if that matters downstream.
x_train, x_test, y_train, y_test = train_test_split(
    dataPaths, num_labels, test_size=0.2, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=0.5, random_state=42
)

In [27]:
# Per-class sample counts for the whole non-Philips set and for each partition.
print("others")
class_ids = range(len(labels))
for caption, split_labels in (
    ("all:   ", num_labels),
    ("train: ", y_train),
    ("test:   ", y_test),
    ("val:   ", y_val),
):
    print(caption, [split_labels.count(cid) for cid in class_ids])

others
all:    [2090, 1135]
train:  [1668, 912]
test:    [213, 110]
val:    [209, 113]


In [28]:
# Serialise the non-Philips training partition as "<path>------<class id>" lines.
train_str_list = [
    '%s------%d' % (path, y_train[pos])
    for pos, path in enumerate(x_train)
]

with open('others_dataset_train.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(train_str_list))

In [29]:
# Serialise the non-Philips test partition as "<path>------<class id>" lines.
test_str_list = [
    '%s------%d' % (path, y_test[pos])
    for pos, path in enumerate(x_test)
]

with open('others_dataset_test.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(test_str_list))

In [30]:
# Serialise the non-Philips validation partition as "<path>------<class id>" lines.
val_str_list = [
    '%s------%d' % (path, y_val[pos])
    for pos, path in enumerate(x_val)
]

with open('others_dataset_val.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(val_str_list))

# ---------------------------------------------------------------

In [31]:
# Reload the manifest and the clinical sheet and rebuild the patient -> 'ICU Admit'
# lookup. These are the same sources read at the top of the notebook.
metadata=pd.read_csv('COVID-19-AR/manifest-1594658036421/metadata.csv')
clinical=pd.read_excel('COVID-19 Clinical.xlsx')
SubjectIDtoICUAdmit=dict(zip(clinical['PATIENT_ID'], clinical['ICU Admit']))

In [32]:
# Collect DICOM file paths per ICU label, keeping only manifest rows whose
# SOP class name mentions "x-ray" (case-insensitive).
dataDic = {}
for _, series_row in metadata.iterrows():
    # Main difference from the manufacturer-based grouping: x-ray rows only.
    if 'x-ray' not in series_row['SOP Class Name'].lower():
        continue

    icu_label = SubjectIDtoICUAdmit[series_row['Subject ID']]
    expected_count = series_row['Number of Images']

    # Directory of this row relative to the notebook, with backslashes
    # converted to forward slashes.
    series_dir = 'COVID-19-AR/manifest-1594658036421/' + os.path.relpath(series_row['File Location'])
    series_dir = series_dir.replace('\\', '/')
    dicom_files = [p for p in readfiles(series_dir) if p.endswith('.dcm')]

    # Report rows whose on-disk file count disagrees with the manifest.
    if len(dicom_files) != expected_count:
        print("len(files) != nums")
        print('csv_nums: ', expected_count, ', actual_nums: ', len(dicom_files))
        print(dicom_files)

    dataDic.setdefault(icu_label, []).extend(dicom_files)

In [33]:
# Number of x-ray files collected for each ICU label.
for label_key, label_files in dataDic.items():
    print("%s: %d" % (label_key, len(label_files)))

N: 134
Y: 102


In [34]:
# Start a fresh path/label accumulation for the x-ray subset.
# The position of each key in `labels` is used as its numeric class id.
dataPaths, num_labels = [], []
labels = list(dataDic)
print("labels: ", labels)

labels:  ['N', 'Y']


In [35]:
# Flatten the x-ray groups into parallel lists of file paths and class ids.
# This appends to the existing lists, so re-running the cell on its own
# duplicates the entries.
for key, paths in dataDic.items():
    dataPaths.extend(paths)
    class_id = labels.index(key)
    num_labels.extend([class_id] * len(paths))

In [36]:
# Hold out 25% of the x-ray files, then halve the hold-out into validation and
# test. A fixed random_state keeps the partition, and the files written from
# it, identical across kernel restarts.
# NOTE(review): the split is over individual file paths, so files belonging to
# the same subject can end up in different partitions; consider a group-aware
# split if that matters downstream.
x_train, x_test, y_train, y_test = train_test_split(
    dataPaths, num_labels, test_size=0.25, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=0.5, random_state=42
)

In [37]:
# Per-class sample counts for the whole x-ray set and for each partition.
class_ids = range(len(labels))
for caption, split_labels in (
    ("all:   ", num_labels),
    ("train: ", y_train),
    ("test:   ", y_test),
    ("val:   ", y_val),
):
    print(caption, [split_labels.count(cid) for cid in class_ids])

all:    [134, 102]
train:  [98, 79]
test:    [19, 11]
val:    [17, 12]


In [38]:
# One line per x-ray training file: "<path>------<class id>".
train_str_list = [
    '%s------%d' % (path, y_train[pos])
    for pos, path in enumerate(x_train)
]

In [39]:
# Write the x-ray training listing, one entry per line (no trailing newline).
manifest_text = '\n'.join(train_str_list)
with open('x_ray_train.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write(manifest_text)

In [40]:
# One line per x-ray test file: "<path>------<class id>".
test_str_list = [
    '%s------%d' % (path, y_test[pos])
    for pos, path in enumerate(x_test)
]

In [41]:
# Write the x-ray test listing, one entry per line (no trailing newline).
manifest_text = '\n'.join(test_str_list)
with open('x_ray_test.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write(manifest_text)

In [42]:
# Serialise the x-ray validation partition as "<path>------<class id>" lines.
val_str_list = [
    '%s------%d' % (path, y_val[pos])
    for pos, path in enumerate(x_val)
]

# NOTE(review): this file name has a double underscore, unlike 'x_ray_train.txt'
# and 'x_ray_test.txt' — confirm what downstream readers expect before renaming.
with open('x_ray__val.txt', mode='w', encoding='utf-8') as manifest_file:
    manifest_file.write('\n'.join(val_str_list))

In [43]:
# Per-label file counts in `dataDic`. It was last filled by the x-ray grouping
# loop and has not been reassigned since, so these are the x-ray counts again
# rather than totals over every modality.
for label_key, label_files in dataDic.items():
    print("%s: %d" % (label_key, len(label_files)))

N: 134
Y: 102
