In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import torch

from fastai.conv_learner import *
from fastai.dataset import *

from pathlib import Path
import json
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects

import multiprocessing
from functools import partial
import shutil
import itertools

### Distributed bulk resizing of images

In [2]:
# This function loads a file, resize it and write in the output folder
def img_resize(fname, outdir, sz, in_dir):
    '''
    fname: image filename
    outdir: relative path to output directory
    sz: final size of image
    in_dir: relative path to the input directory
    '''
    os.makedirs(outdir, exist_ok=True)
    im = cv2.imread(in_dir + fname)
    small_im = cv2.resize(im, (sz, sz))
    cv2.imwrite(outdir + fname, small_im)


def parallel_runs(data_list, outdir, in_dir, sz=300, process=4):
    '''
    data_list: list of filenames of images stores in a list
    outdir: relative path to output directory
    sz: final size of image
    in_dir: relative path to the input directory
    process: num of threads in your cpu
    '''
    pool = multiprocessing.Pool(processes=process)
    img_resize_x = partial(img_resize, outdir=outdir, sz=sz, in_dir=in_dir)
    pool.map(img_resize_x, data_list)

In [3]:
inputdir = 'data/images/'

filelist = !ls {inputdir}

outdir = 'data/nih_resized/'

In [None]:
parallel_runs(filelist,outdir,inputdir,300,12) 

### Looking at the labels

In [4]:
path = 'data/isazi_data/nih/'

In [5]:
bbox = pd.read_csv(path+'BBox_List_2017.csv')

FileNotFoundError: File b'data/isazi_data/nih/BBox_List_2017.csv' does not exist

In [6]:
bbox.columns

NameError: name 'bbox' is not defined

In [10]:
filelist = !ls {path+'resize'}

In [11]:
bbox.label.unique()

array(['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate', 'Mass', 'Nodule', 'Pneumonia',
       'Pneumothorax'], dtype=object)

In [12]:
bbox.groupby(['label']).size()

label
Atelectasis     180
Cardiomegaly    146
Effusion        153
Infiltrate      123
Mass             85
Nodule           79
Pneumonia       120
Pneumothorax     98
dtype: int64

Copying files with bounding image information to a separate folder

In [13]:
bb_imlist = list(bbox.im_fname) #list of images with bounding box

In [103]:
os.path.exists('data/exp1/imgs')

True

In [104]:
def bulk_copyfiles(filelist, source, destination, overwrite = True):
    '''
    filelist: list of filenames you need to copy
    source: source directory
    destination: destination directory
    '''
    if os.path.exists(destination) and overwrite: shutil.rmtree(destination)
    os.makedirs(destination, exist_ok=True)
    for fname in filelist:
        if os.path.exists(source + fname):
            shutil.copy(os.path.join(source, fname), destination)

In [127]:
bulk_copyfiles(bb_imlist, path + 'resize/', path + 'img_bbox/')

In [131]:
bbfilter = !ls {path+'img_bbox'}

In [135]:
bbox.shape

(984, 6)

In [132]:
len(bbfilter)

847

In [134]:
bbox.loc[bbox.im_fname.isin(bbfilter)].shape

(944, 6)

In [139]:
bbox.loc[bbox.im_fname == '00000732_005.png']

Unnamed: 0,im_fname,label,bb_x,bb_y,bb_w,bb_h
202,00000732_005.png,Cardiomegaly,427.932203,464.0,412.20339,344.949153
918,00000732_005.png,Pneumothorax,613.831111,110.686823,172.942222,103.537778


In [137]:
bbox.groupby('im_fname').size().reset_inde

im_fname
00000032_037.png    1
00000072_000.png    1
00000147_001.png    1
00000149_006.png    1
00000150_002.png    1
00000181_061.png    1
00000193_019.png    1
00000211_010.png    1
00000211_016.png    1
00000211_019.png    1
00000211_041.png    1
00000344_003.png    1
00000377_004.png    1
00000398_003.png    1
00000457_004.png    1
00000468_017.png    1
00000468_033.png    1
00000468_041.png    1
00000506_013.png    1
00000583_008.png    1
00000643_002.png    1
00000661_000.png    1
00000732_005.png    2
00000740_000.png    1
00000744_006.png    1
00000756_001.png    1
00000808_002.png    1
00000830_000.png    2
00000845_000.png    1
00000865_006.png    1
                   ..
00029817_009.png    1
00029843_001.png    1
00029861_013.png    2
00029894_000.png    1
00029906_000.png    1
00029909_003.png    1
00029940_007.png    1
00030039_008.png    1
00030106_008.png    1
00030111_007.png    1
00030128_002.png    1
00030162_026.png    1
00030162_029.png    2
00030206_013.png    1
0

In [15]:
filelist[0]

'00000001_000.png'

In [16]:
img = cv2.imread('data/isazi_data/nih/resize/00000001_000.png')

In [17]:
img.shape

(300, 300, 3)

## Image labels

In [188]:
pd.options.display.max_rows = 999

In [189]:
labels = pd.read_csv('data/Data_Entry_2017.csv')

In [9]:
labels.head(1000)

Unnamed: 0,img_id,labels,follow_up,patient_id,age,gender,view,width,height,pix_spacing_x,pix_spacing_y,labels_split,labels_cnt
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143000,0.143000,['Cardiomegaly'],1
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143000,0.143000,"['Cardiomegaly', 'Emphysema']",2
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168000,0.168000,"['Cardiomegaly', 'Effusion']",2
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171000,0.171000,['No Finding'],1
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143000,0.143000,['Hernia'],1
5,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168000,0.168000,['Hernia'],1
6,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168000,0.168000,['Hernia'],1
7,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143000,0.143000,"['Hernia', 'Infiltration']",2
8,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168000,0.168000,['Hernia'],1
9,00000003_005.png,Hernia,5,3,78,F,PA,2686,2991,0.143000,0.143000,['Hernia'],1


In [190]:
pathology = {'Atelectasis': 11559,  'Cardiomegaly': 2776,  'Consolidation': 4667,  'Edema': 2303,  'Effusion': 13317,  'Emphysema': 2516,  'Fibrosis': 1686,  'Hernia': 227,  'Infiltration': 19894,  'Mass': 5782,  'Nodule': 6331,  'Pleural_Thickening': 3385,  'Pneumonia': 1431,  'Pneumothorax': 5302}

In [191]:
pathology.keys()

dict_keys(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax'])

In [192]:
label_cnt = labels.groupby('labels').size().reset_index()

In [193]:
label_cnt.columns

Index(['labels', 0], dtype='object')

In [194]:
label_cnt = label_cnt.assign(labels_split = label_cnt['labels'].apply(lambda x: x.split('|')))

In [195]:
label_cnt = label_cnt.assign(label_count = label_cnt.labels_split.apply(lambda x: len(x)))

In [196]:
label_cnt.head()

Unnamed: 0,labels,0,labels_split,label_count
0,Atelectasis,4215,[Atelectasis],1
1,Atelectasis|Cardiomegaly,88,"[Atelectasis, Cardiomegaly]",2
2,Atelectasis|Cardiomegaly|Consolidation,10,"[Atelectasis, Cardiomegaly, Consolidation]",3
3,Atelectasis|Cardiomegaly|Consolidation|Edema,1,"[Atelectasis, Cardiomegaly, Consolidation, Edema]",4
4,Atelectasis|Cardiomegaly|Consolidation|Edema|E...,2,"[Atelectasis, Cardiomegaly, Consolidation, Ede...",6


In [197]:
label_cnt.loc[label_cnt.label_count == 1].sort_values(by = 0, ascending= False)

Unnamed: 0,labels,0,labels_split,label_count
813,No Finding,60361,[No Finding],1
772,Infiltration,9547,[Infiltration],1
0,Atelectasis,4215,[Atelectasis],1
591,Effusion,3955,[Effusion],1
814,Nodule,2705,[Nodule],1
831,Pneumothorax,2194,[Pneumothorax],1
800,Mass,2139,[Mass],1
414,Consolidation,1310,[Consolidation],1
825,Pleural_Thickening,1126,[Pleural_Thickening],1
309,Cardiomegaly,1093,[Cardiomegaly],1


In [198]:
pathology_list = list(pathology.keys())

In [199]:
pathology_list

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

In [200]:
labels.shape

(112120, 13)

In [201]:
import collections

In [202]:
co_occur_dict = collections.defaultdict()
for i in itertools.combinations(pathology_list,2): 
    co_occur_dict[i] = labels['labels'].apply(lambda x: 1 if i[0] in x and i[1] in x else 0).sum()  

In [203]:
for i in sorted(co_occur_dict,key=lambda x: co_occur_dict[x], reverse=True): print(i,co_occur_dict[i])

('Effusion', 'Infiltration') 4000
('Atelectasis', 'Effusion') 3275
('Atelectasis', 'Infiltration') 3264
('Infiltration', 'Nodule') 1546
('Consolidation', 'Effusion') 1287
('Effusion', 'Mass') 1254
('Atelectasis', 'Consolidation') 1223
('Consolidation', 'Infiltration') 1221
('Infiltration', 'Mass') 1159
('Cardiomegaly', 'Effusion') 1063
('Effusion', 'Pneumothorax') 996
('Edema', 'Infiltration') 981
('Infiltration', 'Pneumothorax') 946
('Effusion', 'Nodule') 912
('Mass', 'Nodule') 906
('Effusion', 'Pleural_Thickening') 849
('Atelectasis', 'Pneumothorax') 774
('Infiltration', 'Pleural_Thickening') 750
('Emphysema', 'Pneumothorax') 747
('Atelectasis', 'Mass') 739
('Consolidation', 'Mass') 610
('Infiltration', 'Pneumonia') 605
('Edema', 'Effusion') 593
('Atelectasis', 'Nodule') 590
('Cardiomegaly', 'Infiltration') 587
('Atelectasis', 'Pleural_Thickening') 496
('Mass', 'Pleural_Thickening') 452
('Emphysema', 'Infiltration') 449
('Mass', 'Pneumothorax') 431
('Consolidation', 'Nodule') 428
('A

In [204]:
labels = labels.assign(labels_split = labels['labels'].apply(lambda x: x.split('|')))

In [205]:
labels.head()

Unnamed: 0,img_id,labels,follow_up,patient_id,age,gender,view,width,height,pix_spacing_x,pix_spacing_y,labels_split,labels_cnt
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,[Cardiomegaly],1
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,"[Cardiomegaly, Emphysema]",2
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,"[Cardiomegaly, Effusion]",2
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,[No Finding],1
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,[Hernia],1


In [206]:
labels = labels.assign(labels_cnt = labels['labels_split'].apply(lambda x: len(x)))

In [207]:
labels.head()

Unnamed: 0,img_id,labels,follow_up,patient_id,age,gender,view,width,height,pix_spacing_x,pix_spacing_y,labels_split,labels_cnt
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,[Cardiomegaly],1
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,"[Cardiomegaly, Emphysema]",2
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,"[Cardiomegaly, Effusion]",2
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,[No Finding],1
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,[Hernia],1


In [208]:
labels.to_csv('data/Data_Entry_2017.csv',index = False)

In [209]:
labels.shape

(112120, 13)

all the images with single pathology or no indication

In [252]:
labels_one = labels.loc[labels.labels_cnt==1] #labels with one pathology

In [253]:
labels_one.shape

(91324, 13)

In [254]:
labels_one.patient_id.nunique()

29564

In [255]:
pathology_list

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

In [256]:
path_to_keep = ['Atelectasis','Cardiomegaly']
# 'Atelectasis'
#  'Cardiomegaly',
#  'Consolidation',
#  'Effusion',
#  'Emphysema',
#  'Infiltration',
#  'Mass',
#  'Nodule',
#  'Pleural_Thickening',
#  'Pneumothorax'

dropping images with very rare disease

In [257]:
labels_one.groupby('labels').size()

labels
Atelectasis            4215
Cardiomegaly           1093
Consolidation          1310
Edema                   628
Effusion               3955
Emphysema               892
Fibrosis                727
Hernia                  110
Infiltration           9547
Mass                   2139
No Finding            60361
Nodule                 2705
Pleural_Thickening     1126
Pneumonia               322
Pneumothorax           2194
dtype: int64

In [258]:
labels_one = labels_one.loc[labels_one['labels'].isin(path_to_keep)]

In [259]:
labels_one.columns

Index(['img_id', 'labels', 'follow_up', 'patient_id', 'age', 'gender', 'view',
       'width', 'height', 'pix_spacing_x', 'pix_spacing_y', 'labels_split',
       'labels_cnt'],
      dtype='object')

In [260]:
labels_one.groupby('view').size()

view
AP    2352
PA    2956
dtype: int64

In [261]:
labels_one.shape

(5308, 13)

In [262]:
tmp = labels_one.groupby('patient_id')[['follow_up']].count().reset_index()

In [263]:
tmp1 = tmp.loc[tmp.follow_up == 1]

In [264]:
tmp1.head()

Unnamed: 0,patient_id,follow_up
0,1,1
1,8,1
2,11,1
3,13,1
6,38,1


In [265]:
tmp1 = list(tmp1.patient_id)

In [266]:
tmp2 = labels_one.loc[labels_one.patient_id.isin(tmp1)]

In [267]:
tmp2.head()

Unnamed: 0,img_id,labels,follow_up,patient_id,age,gender,view,width,height,pix_spacing_x,pix_spacing_y,labels_split,labels_cnt
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,[Cardiomegaly],1
23,00000008_000.png,Cardiomegaly,0,8,69,F,PA,2048,2500,0.171,0.171,[Cardiomegaly],1
34,00000011_006.png,Atelectasis,6,11,75,M,PA,2992,2991,0.143,0.143,[Atelectasis],1
83,00000013_045.png,Cardiomegaly,45,13,56,M,PA,2992,2991,0.143,0.143,[Cardiomegaly],1
185,00000038_003.png,Cardiomegaly,3,38,76,M,AP,3056,2544,0.139,0.139,[Cardiomegaly],1


In [271]:
tmp2.shape

(2373, 13)

In [272]:
len(set(tmp2.img_id))

2373

In [273]:
tmp2.groupby('labels').size()

labels
Atelectasis     1768
Cardiomegaly     605
dtype: int64

In [274]:
sample_normal_idx = np.random.choice(tmp2.loc[tmp2['labels']=='No Finding'].index,500,replace=False)

ValueError: a must be non-empty

In [235]:
path_idx = tmp2.loc[tmp2['labels']!='No Finding'].index

In [236]:
len(sample_normal_idx)

500

In [237]:
tot_idx = [sample_normal_idx,path_idx]

In [238]:
tot_idx = list(itertools.chain.from_iterable(tot_idx))

In [239]:
tot_idx

[94332,
 27980,
 47774,
 5841,
 47752,
 16008,
 20133,
 41406,
 46787,
 35060,
 50410,
 89512,
 107920,
 93136,
 103224,
 95336,
 104175,
 2136,
 88902,
 7261,
 64588,
 102923,
 70239,
 101155,
 49498,
 106944,
 65897,
 66112,
 34119,
 31836,
 91222,
 20045,
 95278,
 103412,
 53874,
 82308,
 4468,
 107988,
 84779,
 21164,
 64544,
 46384,
 49277,
 13448,
 97123,
 65306,
 92077,
 1058,
 106434,
 94963,
 94296,
 70748,
 70996,
 68208,
 20142,
 35154,
 45191,
 88022,
 95060,
 73661,
 957,
 106152,
 40196,
 19931,
 85876,
 107623,
 67229,
 105769,
 88219,
 32853,
 3757,
 42621,
 106446,
 80300,
 98448,
 31825,
 95217,
 54678,
 94405,
 29959,
 16325,
 24900,
 23998,
 64936,
 49943,
 100578,
 20955,
 52052,
 45357,
 32017,
 94357,
 97074,
 47751,
 111910,
 92090,
 96351,
 95177,
 57069,
 110875,
 51314,
 61300,
 58947,
 70721,
 94869,
 26591,
 79558,
 29552,
 8000,
 110325,
 14867,
 85310,
 50187,
 41972,
 111360,
 35680,
 66246,
 98787,
 40679,
 61926,
 95322,
 112020,
 73004,
 69392,
 10743

In [240]:
tmp3 = tmp2.loc[tot_idx]

In [241]:
tmp3.shape

(826, 13)

copying these files to another directory

In [275]:
filelist = list(tmp2.img_id)

In [276]:
len(filelist)

2373

In [277]:
filelist[0]

'00000001_000.png'

In [278]:
bulk_copyfiles(filelist,'data/nih_resized/','data/exp1/imgs/',True)

In [279]:
len(filelist)

2373

In [280]:
labels_one.head()

Unnamed: 0,img_id,labels,follow_up,patient_id,age,gender,view,width,height,pix_spacing_x,pix_spacing_y,labels_split,labels_cnt
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,[Cardiomegaly],1
23,00000008_000.png,Cardiomegaly,0,8,69,F,PA,2048,2500,0.171,0.171,[Cardiomegaly],1
34,00000011_006.png,Atelectasis,6,11,75,M,PA,2992,2991,0.143,0.143,[Atelectasis],1
83,00000013_045.png,Cardiomegaly,45,13,56,M,PA,2992,2991,0.143,0.143,[Cardiomegaly],1
111,00000030_000.png,Atelectasis,0,30,74,M,PA,2992,2991,0.143,0.143,[Atelectasis],1


In [281]:
exp1_label = tmp2[['img_id','labels']]

In [282]:
exp1_label.head()

Unnamed: 0,img_id,labels
0,00000001_000.png,Cardiomegaly
23,00000008_000.png,Cardiomegaly
34,00000011_006.png,Atelectasis
83,00000013_045.png,Cardiomegaly
185,00000038_003.png,Cardiomegaly


In [283]:
exp1_label['labels'] = exp1_label['labels'].apply(lambda x: "No_Finding" if x == "No Finding" else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [284]:
exp1_label.to_csv('data/exp1/exp1_labels.csv',index = False)