In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
!pip install glob2

In [None]:
!pip install pydicom kornia opencv-python scikit-image nbdev

In [None]:
# Install GDCM from conda-forge; `gdcm` is imported further down alongside pydicom.
# NOTE(review): presumably needed so pydicom can decode compressed pixel data — confirm.
!conda install -c conda-forge gdcm -y

In [None]:
# Load the three competition tables in one pass: image-level annotations,
# study-level labels and the sample submission.
image_df, study_df, sub_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        '../input/siim-covid19-detection/train_image_level.csv',
        '../input/siim-covid19-detection/train_study_level.csv',
        '../input/siim-covid19-detection/sample_submission.csv',
    )
)

In [None]:
# (rows, columns) of each loaded table, in load order.
for frame in (image_df, study_df, sub_df):
    print(frame.shape)

In [None]:
# Preview the first rows of the image-level table.
image_df.head()

In [None]:
# Append '_study' to every study UID (presumably to match the id format of the
# study-level table it is merged with later — confirm against study_df).
# The suffix is only added when missing, so re-running this cell cannot produce
# '..._study_study' ids that would silently match nothing in the merge.
image_df['StudyInstanceUID'] = [
    uid if uid.endswith('_study') else uid + '_study'
    for uid in image_df['StudyInstanceUID']
]
image_df.head()

In [None]:
# Preview the first rows of the study-level table.
study_df.head()

In [None]:
# Give the study table the same key column name as the image table so the two
# can be merged on it.
study_df = study_df.rename({'id': 'StudyInstanceUID'}, axis='columns')
study_df.head()

In [None]:
# Number of key values on each side of the upcoming merge.
for frame in (image_df, study_df):
    print(len(frame['StudyInstanceUID']))

In [None]:
# Inner-join image rows with their study-level labels on the shared study key.
merger = image_df.merge(study_df, on='StudyInstanceUID')
merger.head()

In [None]:
# How many image rows each study contributes.
merger['StudyInstanceUID'].value_counts()

In [None]:
# Same per-study tally as a two-column table, largest studies first.
study_counts = merger['StudyInstanceUID'].value_counts()
count = pd.DataFrame({
    'StudyInstanceUID': study_counts.index,
    'number of imgs': study_counts.values,
}).sort_values(by='number of imgs', ascending=False)
count.head()

In [None]:
# Break the raw `boxes` string into one fragment per box (split on '},').
# Rows without boxes get the placeholder string '0 0 1 1' instead of a list.
split_boxes = merger['boxes'].str.split('},').fillna('0 0 1 1')
merger['Nb_boxes'] = split_boxes
# Box count per row; the placeholder counts as a single entry.
merger['len'] = [len(entry) if isinstance(entry, list) else 1 for entry in split_boxes]
merger.head()

In [None]:
# Inspect the split result for the row labelled 0.
merger.loc[0, 'Nb_boxes']

In [None]:
# One row per bounding box: explode() repeats the other columns for every
# fragment in `Nb_boxes`. (The former index.repeat() assignment was overwritten
# by this line before ever being used, so it has been removed.)
new = merger.explode('Nb_boxes').reset_index(drop=True)
new.head()


In [None]:
# Strip the list/dict punctuation and the key names so each fragment becomes
# four whitespace-separated numbers.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every fragment unchanged. The raw string
# avoids invalid-escape warnings, and the stray empty alternative was dropped.
new['Nb_boxes'] = new['Nb_boxes'].str.replace(
    r"\[\{|\{|\}\]|'x':|'y':|'width':|'height':|,", '', regex=True)
new.head()

In [None]:

# Each cleaned fragment holds four numbers. They are labelled x, y, w, h here,
# i.e. in the order the cleaning step names the keys (x, y, width, height).
# NOTE(review): confirm the raw `boxes` dicts really list width before height.
# The previous labelling (x, y, h, w) swapped the two sizes, so xmax was
# x + height and ymax was y + width.
coords = pd.DataFrame(
    [fragment.split() for fragment in new['Nb_boxes']],
    index=new.index,
    columns=['x', 'y', 'w', 'h'],
).astype('float')
# Columns are written in the original x, y, h, w order so the saved CSV layout
# does not change; only the values under `h` and `w` are corrected.
new[['x', 'y', 'h', 'w']] = coords[['x', 'y', 'h', 'w']]
# Corner format expected downstream: (x, y) top-left, (xmax, ymax) bottom-right.
new['xmax'] = coords['x'] + coords['w']
new['ymax'] = coords['y'] + coords['h']
new.head()

In [None]:
import re

# Reduce the label string to its class word(s):
#   1. remove digits, dots and hyphens (the numeric parts of the label string),
#   2. collapse a word repeated back-to-back into a single occurrence.
# Both patterns are regular expressions, so regex=True is required; with the
# pandas >= 2.0 default (regex=False) neither call would change anything.
new['label'] = (
    new['label']
    .str.replace(r"[0-9]|[.]|-", '', regex=True)
    .str.replace(r'\b(\w+)(\s+\1)+\b', r'\1', regex=True)
)

new.head()

In [None]:
# Distribution of the cleaned labels.
new['label'].value_counts()

In [None]:
# The raw box string and its intermediate forms are no longer needed.
new = new.drop(['boxes', 'Nb_boxes', 'len'], axis=1)
new.head()

In [None]:
# Turn image ids into DICOM file names and strip the study suffix again.
for column, old_text, new_text in (
    ('id', '_image', '.dcm'),
    ('StudyInstanceUID', '_study', ''),
):
    new[column] = new[column].str.replace(old_text, new_text)
new.head()

In [None]:
# Remove every space from the label (plain-text replacement).
new['label'] = new['label'].str.replace(' ', '', regex=False)

In [None]:
import glob2

# Collect the path of every .dcm file below the training folder, at any depth.
img_path = glob2.glob('../input/siim-covid19-detection/train/**/*.dcm')

In [None]:
# Index the DICOM paths by file name once, so each id is a dictionary lookup
# instead of a scan over every path (the nested scan was rows x files).
path_by_filename = {str(path).split('/')[-1]: path for path in img_path}

# Fail with a readable message if an annotation has no file on disk; the old
# comprehension surfaced this only as a length-mismatch error.
missing_files = [file for file in new['id'].unique() if file not in path_by_filename]
if missing_files:
    raise ValueError(
        f'{len(missing_files)} image ids have no DICOM file, e.g. {missing_files[:3]}'
    )

new['path'] = [path_by_filename[file] for file in new['id']]
new.head()

In [None]:
# Save the flattened table (one row per box, no index column) to the working
# directory, then add the file to the notebook's input data.
new.to_csv('siim_final2.csv', index=False)

In [None]:
## from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from fastai.medical.imaging import *

import pydicom
import gdcm



In [None]:
# Reload the per-box annotation table.
# NOTE(review): this reads from an input folder, not from the working directory
# the CSV was written to earlier — presumably the saved file was uploaded as the
# `siim-competition` dataset; confirm it is attached and up to date.
df = pd.read_csv('../input/siim-competition/siim_final2.csv')
df.head()

In [None]:
# Build three parallel lists, one entry per image file:
#   imgs    - file name (`id`)
#   bbox    - list of [x, y, xmax, ymax] boxes for that image
#   lbl_box - list of labels, one per box
imgs, bbox, lbl_box = [], [], []

# A single groupby pass replaces re-filtering the whole frame for every id.
# sort=False keeps images in order of first appearance, as `unique()` did.
for image_id, rows in df.groupby('id', sort=False):
    imgs.append(image_id)
    bbox.append(rows[['x', 'y', 'xmax', 'ymax']].values.tolist())
    lbl_box.append(rows['label'].values.tolist())

In [None]:
# File name of the first collected image.
imgs[0]

In [None]:
# Pair each image's boxes with its labels: one (boxes, labels) tuple per image.
lbl_bbox = [(boxes_for_img, labels_for_img) for boxes_for_img, labels_for_img in zip(bbox, lbl_box)]
lbl_bbox[0]

In [None]:
# Lookup table: file name -> (boxes, labels).
img2bbox = {file_name: target for file_name, target in zip(imgs, lbl_bbox)}

In [None]:
# Spot check: the boxes (element 0 of the pair) stored for one specific file.
img2bbox['fed009ec0e3f.dcm'][0]

In [None]:
# One-entry dictionary holding just the first item of img2bbox, for inspection.
first = dict(list(img2bbox.items())[:1])
first

In [None]:
class PILDicom(PILBase):
    "PIL image type that reads a DICOM file and rescales its pixels to 8 bits."
    _open_args,_tensor_cls,_show_args = {},TensorDicom,TensorDicom._show_args

    @classmethod
    def create(cls, fn:(Path,str,bytes), mode='RGB')->'PILDicom':
        """Open a `DICOM file` from path `fn` or bytes `fn` and load it as a `PIL Image`.

        Pixel values are divided by the image maximum and scaled to 0-255
        before conversion to `uint8`.

        Args:
            fn: path to the DICOM file, or the raw bytes of one.
            mode: PIL mode to convert to; a falsy value skips the conversion.

        Returns:
            An instance of `cls` wrapping the loaded image.

        Raises:
            TypeError: if `fn` is neither a path, a string nor bytes.
        """
        # Both branches yield the raw pixel array; the bytes branch no longer
        # wraps it in a PIL image before the arithmetic below.
        if isinstance(fn, bytes):
            pixels = pydicom.dcmread(pydicom.filebase.DicomBytesIO(fn)).pixel_array
        elif isinstance(fn, (Path, str)):
            pixels = pydicom.dcmread(fn).pixel_array
        else:
            raise TypeError(f'fn must be a Path, str or bytes, got {type(fn).__name__}')

        # NOTE(review): PhotometricInterpretation is not consulted here, so
        # inverted-greyscale files would stay inverted — confirm if that matters.
        peak = np.max(pixels)
        if peak == 0:
            # A blank image would otherwise divide by zero and yield NaNs.
            scaled = np.zeros_like(pixels)
        else:
            scaled = (pixels / peak) * 255

        im = Image.fromarray(scaled.astype(np.uint8))
        im.load()
        im = im._new(im.im)
        return cls(im.convert(mode) if mode else im)


In [None]:
def get_bboxes(o):
    "Boxes for DICOM file `o`, looked up in `img2bbox` by file name."
    return img2bbox[o.name][0]


def get_bbox_labels(o):
    "Box labels for DICOM file `o`, looked up in `img2bbox` by file name."
    return img2bbox[o.name][1]


# Detection data pipeline: one DICOM image in, (boxes, labels) out.
# Named getter functions replace the lambdas so the pipeline can be pickled,
# and the splitter is seeded so the train/validation split is the same on
# every run.
pneumothorax = DataBlock(
    blocks=(ImageBlock(cls=PILDicom), BBoxBlock, BBoxLblBlock),
    get_items=get_dicom_files,
    splitter=RandomSplitter(seed=42),
    get_y=[get_bboxes, get_bbox_labels],
    item_tfms=Resize(224),
    batch_tfms=aug_transforms(),
    n_inp=1,
)


In [None]:
# Build the DataLoaders from the training folder and display up to 16 samples.
dls = pneumothorax.dataloaders('../input/siim-covid19-detection/train')
dls.show_batch(max_n=16)


In [None]:
# Pull a single batch from the first loader and unpack its three parts.
batch_iter = iter(dls[0])
x, y, z = next(batch_iter)

In [None]:
#x,y,z