In [13]:
import os
import shutil
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
from sklearn.model_selection import GroupKFold

ref. https://www.kaggle.com/code/miladlink/nih-yolo-dataset-yolo-labels

In [2]:
lbls_dir = '~/ta-hsi-datacenter3/Medical/NIH_CXR8/BBox_List_2017.csv'
lbls = pd.read_csv (lbls_dir)
lbls = lbls.drop (lbls.iloc [:, 6:9], axis = 1)
lbls.head ()

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h]
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695


In [3]:
max(lbls.iloc[::,0])

'00030674_000.png'

In [4]:
print ('number of all boxes:', len (lbls))
print ('number of unique patients:',len (lbls ['Image Index'].unique ()))

number of all boxes: 984
number of unique patients: 880


In [5]:
pathology_list = []
i = 0
for class_name in lbls ['Finding Label']:
    if class_name not in pathology_list:
        pathology_list.append (class_name)
pathology_list

['Atelectasis',
 'Cardiomegaly',
 'Effusion',
 'Infiltrate',
 'Mass',
 'Nodule',
 'Pneumonia',
 'Pneumothorax']

### class to class_id

In [6]:
lbls ['class_id'] = lbls ['Finding Label'].apply (lambda x: pathology_list.index (x))
lbls.head ()

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h],class_id
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,0
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,0
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,0
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,0
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,0


In [7]:
lbls.groupby(["Finding Label","class_id"])["class_id"].count(), lbls["class_id"].count()

(Finding Label  class_id
 Atelectasis    0           180
 Cardiomegaly   1           146
 Effusion       2           153
 Infiltrate     3           123
 Mass           4            85
 Nodule         5            79
 Pneumonia      6           120
 Pneumothorax   7            98
 Name: class_id, dtype: int64,
 984)

### Normalize bounding box

In [8]:
lbls = lbls.rename (columns = {'Bbox [x': 'x', 'h]': 'h'})
lbls ['x'] = lbls ['x'].apply (lambda r: r / 1024)
lbls ['y'] = lbls ['y'].apply (lambda r: r / 1024)
lbls ['w'] = lbls ['w'].apply (lambda r: r / 1024)
lbls ['h'] = lbls ['h'].apply (lambda r: r / 1024)
lbls.head ()

Unnamed: 0,Image Index,Finding Label,x,y,w,h,class_id
0,00013118_008.png,Atelectasis,0.219809,0.534198,0.084746,0.077331,0
1,00014716_007.png,Atelectasis,0.670021,0.12846,0.181144,0.306144,0
2,00029817_009.png,Atelectasis,0.216631,0.309622,0.151483,0.211864,0
3,00014687_001.png,Atelectasis,0.709216,0.483351,0.137712,0.054025,0
4,00017877_001.png,Atelectasis,0.644597,0.556427,0.195975,0.076271,0


### Add path of images

In [10]:
# adding path to dataframe
all_image_paths = {os.path.basename(x): x for x in glob ('images/images_*/*.png')}
lbls ['path'] = lbls ['Image Index'].map(all_image_paths.get)
lbls.head ()

Unnamed: 0,Image Index,Finding Label,x,y,w,h,class_id,path
0,00013118_008.png,Atelectasis,0.219809,0.534198,0.084746,0.077331,0,images/images_06/00013118_008.png
1,00014716_007.png,Atelectasis,0.670021,0.12846,0.181144,0.306144,0,images/images_07/00014716_007.png
2,00029817_009.png,Atelectasis,0.216631,0.309622,0.151483,0.211864,0,images/images_12/00029817_009.png
3,00014687_001.png,Atelectasis,0.709216,0.483351,0.137712,0.054025,0,images/images_07/00014687_001.png
4,00017877_001.png,Atelectasis,0.644597,0.556427,0.195975,0.076271,0,images/images_08/00017877_001.png


Concat boxes and class ids and remove additional

In [11]:
lbls ['boxes'] = lbls.apply (lambda row: [row.class_id, row.x, row.y, row.w, row.h], axis = 1)
lbls = lbls.drop (['Image Index', 'Finding Label', 'x', 'y', 'w', 'h', 'class_id'], axis = 1)
lbls.head ()

Unnamed: 0,path,boxes
0,images/images_06/00013118_008.png,"[0, 0.2198093220338984, 0.5341984538708701, 0...."
1,images/images_07/00014716_007.png,"[0, 0.6700211864406778, 0.1284604476670088, 0...."
2,images/images_12/00029817_009.png,"[0, 0.21663135593220312, 0.3096221826844297, 0..."
3,images/images_07/00014687_001.png,"[0, 0.709216101694915, 0.48335099624375194, 0...."
4,images/images_08/00017877_001.png,"[0, 0.6445974576271182, 0.5564265493619238, 0...."


## Split

Use Groupfold to better split

In [14]:
gkf  = GroupKFold(n_splits = 10)
lbls ['fold'] = -1
for fold, (train_idx, val_idx) in enumerate (gkf.split (lbls, groups = lbls.path.tolist())):
    lbls.loc [val_idx, 'fold'] = fold
lbls.head ()

Unnamed: 0,path,boxes,fold
0,images/images_06/00013118_008.png,"[0, 0.2198093220338984, 0.5341984538708701, 0....",9
1,images/images_07/00014716_007.png,"[0, 0.6700211864406778, 0.1284604476670088, 0....",3
2,images/images_12/00029817_009.png,"[0, 0.21663135593220312, 0.3096221826844297, 0...",8
3,images/images_07/00014687_001.png,"[0, 0.709216101694915, 0.48335099624375194, 0....",1
4,images/images_08/00017877_001.png,"[0, 0.6445974576271182, 0.5564265493619238, 0....",9


Seprate unique patient ids

In [15]:
train_files = []
valid_files  = []
valid_files += list (lbls [lbls.fold == 9].path.unique())
train_files += list (lbls [lbls.fold != 9].path.unique())
print ('number of unique trian images:', len (train_files))
print ('number of unique valid images:', len (valid_files))
train_files [:3]

number of unique trian images: 792
number of unique valid images: 88


['images/images_07/00014716_007.png',
 'images/images_12/00029817_009.png',
 'images/images_07/00014687_001.png']

### Create labels folder and labels usig .txt file

In [16]:
!rm -r labels
!mkdir labels
for i in range (len (lbls)):
    fname = os.path.basename (lbls.iloc [i, 0]).replace ('png', 'txt')
    with open (f'labels/{fname}', 'a') as f:
        for j in range (5):
            f.write (str (lbls.iloc [i, 1][j]) + ' ')
        f.write ('\n')

rm: cannot remove 'labels': No such file or directory


### Create NIH Dataset for YOLO Detection

In [17]:
os.makedirs('NIH/train/images', exist_ok = True)
os.makedirs('NIH/train/labels', exist_ok = True)
os.makedirs('NIH/valid/images', exist_ok = True)
os.makedirs('NIH/valid/labels', exist_ok = True)

for files in [train_files, valid_files]:
    for file in tqdm (files):
        fname = file.split ('/')[-1].split ('.')[0]
        if files == train_files:
            shutil.copy (file, 'NIH/train/images/' + fname + '.png')
            shutil.copy (os.path.join ('labels', fname + '.txt'), 'NIH/train/labels/' + fname + '.txt')
        else:
            shutil.copy (file, 'NIH/valid/images/' + fname + '.png')
            shutil.copy (os.path.join ('labels', fname + '.txt'), 'NIH/valid/labels/' + fname + '.txt')

  0%|          | 0/792 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]