In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import torch

from fastai.conv_learner import *
from fastai.dataset import *

from pathlib import Path
import json
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects

In [2]:
import multiprocessing
from functools import partial
import shutil

### Parallel bulk resizing of images

In [5]:
# This function loads an image file, resizes it, and writes it to the output folder
def img_resize(fname, outdir, sz, in_dir):
    '''
    Read one image, resize it to a sz x sz square and save it under the
    same filename in the output directory.

    fname: image filename (relative to in_dir)
    outdir: relative path to output directory (created if missing)
    sz: final width and height of the image
    in_dir: relative path to the input directory

    Raises IOError if the image cannot be read or the result cannot be written.
    '''
    os.makedirs(outdir, exist_ok=True)
    # os.path.join gives the same path whether or not the directory
    # arguments end with a separator.
    src = os.path.join(in_dir, fname)
    im = cv2.imread(src)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with the offending path rather than with an opaque cv2.resize error.
    if im is None:
        raise IOError('could not read image: ' + src)
    small_im = cv2.resize(im, (sz, sz))
    dst = os.path.join(outdir, fname)
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(dst, small_im):
        raise IOError('could not write image: ' + dst)


def parallel_runs(data_list, outdir, in_dir, sz=300, process=4):
    '''
    Resize every image named in data_list using a pool of worker processes.

    data_list: list of image filenames (relative to in_dir)
    outdir: relative path to output directory
    in_dir: relative path to the input directory
    sz: final size of image
    process: number of worker processes to start
    '''
    img_resize_x = partial(img_resize, outdir=outdir, sz=sz, in_dir=in_dir)
    # pool.map blocks until every file has been processed, so the workers can
    # be shut down as soon as the with-block exits; without this the worker
    # processes would stay alive after each call.
    with multiprocessing.Pool(processes=process) as pool:
        pool.map(img_resize_x, data_list)

In [6]:
inputdir = 'data/isazi_data/nih/images/'

# List the input folder from Python rather than shelling out to `ls`: this
# works on any platform, is unaffected by spaces or shell metacharacters in
# the path, and raises if the folder is missing instead of returning the
# shell's error text as if it were a filename. Dot-files are skipped, as
# `ls` does by default.
filelist = sorted(f for f in os.listdir(inputdir) if not f.startswith('.'))

outdir = 'data/isazi_data/nih/resize/'

In [82]:
# in_dir comes before sz in the signature of parallel_runs, so pass both by
# keyword to make sure the input path and the target size cannot be swapped.
parallel_runs(filelist, outdir, in_dir=inputdir, sz=300, process=4)

### Looking at the labels

In [7]:
# Dataset root; the annotation CSV and the image folders used below are
# located relative to it.
path = 'data/isazi_data/nih/'

In [8]:
# Load the bounding-box annotation table from the dataset root.
bbox = pd.read_csv(path+'BBox_List_2017.csv')

In [9]:
# Column names of the annotation table.
bbox.columns

Index(['im_fname', 'label', 'bb_x', 'bb_y', 'bb_w', 'bb_h'], dtype='object')

In [10]:
# Names of the files in the resize folder, listed from Python rather than via
# a shell `ls` so the cell is portable and safe for paths containing spaces.
# Dot-files are skipped, as `ls` does by default.
filelist = sorted(f for f in os.listdir(path + 'resize') if not f.startswith('.'))

In [11]:
# Distinct values of the `label` column.
bbox.label.unique()

array(['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate', 'Mass', 'Nodule', 'Pneumonia',
       'Pneumothorax'], dtype=object)

In [12]:
# Number of annotation rows per label.
bbox.groupby(['label']).size()

label
Atelectasis     180
Cardiomegaly    146
Effusion        153
Infiltrate      123
Mass             85
Nodule           79
Pneumonia       120
Pneumothorax     98
dtype: int64

Copying the images that have bounding-box information to a separate folder

In [13]:
# One entry per annotation row, so an image with several boxes appears
# several times in this list.
bb_imlist = list(bbox.im_fname) # list of images with a bounding box

In [14]:
def bulk_copyfiles(filelist, source, destination):
    '''
    Copy the named files from the source directory into the destination
    directory.

    filelist: list of filenames you need to copy (repeated names are copied once)
    source: source directory, with or without a trailing separator
    destination: destination directory (created if it does not exist)

    Names that are not regular files in the source directory are skipped.
    '''
    # shutil.copy needs an existing directory as its target: without one it
    # either fails or writes every file to a single file named `destination`.
    os.makedirs(destination, exist_ok=True)
    # dict.fromkeys drops repeated names so each file is copied only once.
    for fname in dict.fromkeys(filelist):
        # Build the path once so the existence check and the copy always
        # look at the same file.
        src = os.path.join(source, fname)
        if os.path.isfile(src):
            shutil.copy(src, destination)

In [127]:
# Copy the images named in bb_imlist from the resize folder into img_bbox.
bulk_copyfiles(bb_imlist, path + 'resize/', path + 'img_bbox/')

In [131]:
# Names of the files that ended up in img_bbox, listed from Python rather than
# via a shell `ls` so the cell is portable and safe for paths containing
# spaces. Dot-files are skipped, as `ls` does by default.
bbfilter = sorted(f for f in os.listdir(path + 'img_bbox') if not f.startswith('.'))

In [135]:
# (rows, columns) of the full annotation table.
bbox.shape

(984, 6)

In [132]:
# Number of entries listed in the img_bbox folder.
len(bbfilter)

847

In [134]:
# Shape of the subset of annotation rows whose filename appears in the
# img_bbox listing.
bbox.loc[bbox.im_fname.isin(bbfilter)].shape

(944, 6)

In [139]:
# Show every annotation row recorded for one image filename.
bbox.loc[bbox.im_fname == '00000732_005.png']

Unnamed: 0,im_fname,label,bb_x,bb_y,bb_w,bb_h
202,00000732_005.png,Cardiomegaly,427.932203,464.0,412.20339,344.949153
918,00000732_005.png,Pneumothorax,613.831111,110.686823,172.942222,103.537778


In [137]:
# Number of annotation rows per image filename; a count above 1 means the
# image has more than one row in the table.
bbox.groupby('im_fname').size()

im_fname
00000032_037.png    1
00000072_000.png    1
00000147_001.png    1
00000149_006.png    1
00000150_002.png    1
00000181_061.png    1
00000193_019.png    1
00000211_010.png    1
00000211_016.png    1
00000211_019.png    1
00000211_041.png    1
00000344_003.png    1
00000377_004.png    1
00000398_003.png    1
00000457_004.png    1
00000468_017.png    1
00000468_033.png    1
00000468_041.png    1
00000506_013.png    1
00000583_008.png    1
00000643_002.png    1
00000661_000.png    1
00000732_005.png    2
00000740_000.png    1
00000744_006.png    1
00000756_001.png    1
00000808_002.png    1
00000830_000.png    2
00000845_000.png    1
00000865_006.png    1
                   ..
00029817_009.png    1
00029843_001.png    1
00029861_013.png    2
00029894_000.png    1
00029906_000.png    1
00029909_003.png    1
00029940_007.png    1
00030039_008.png    1
00030106_008.png    1
00030111_007.png    1
00030128_002.png    1
00030162_026.png    1
00030162_029.png    2
00030206_013.png    1
0

In [15]:
# First filename in the current listing.
filelist[0]

'00000001_000.png'

In [16]:
# Read one image from the resize folder with OpenCV.
img = cv2.imread('data/isazi_data/nih/resize/00000001_000.png')

In [17]:
# Array dimensions of the loaded image.
img.shape

(300, 300, 3)