In [1]:
import pandas as pd
import datetime

import os
from tqdm import tqdm

from PIL import Image

import numpy as np

from glob import glob
import random

In [2]:
import json

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps`` so that
    values pulled out of NumPy arrays or pandas frames can be serialized.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy booleans become ``bool``, NumPy integers become ``int``,
        NumPy floats become ``float`` and arrays become (nested) lists.
        Anything else is delegated to ``json.JSONEncoder.default``, which
        raises ``TypeError``.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it a NumPy boolean would fall through and raise TypeError.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Create Train JSON

In [3]:
# Read the positive and negative training tables in one pass.
train_pos, train_neg = (
    pd.read_csv(csv_path)
    for csv_path in ('../train_pos_revised.csv', '../train_neg_revised.csv')
)

In [5]:
# Stack the positive and negative rows into a single frame.
train = pd.concat([train_pos, train_neg])

# Order the rows by 'sid' (ascending) and renumber the index from 0.
train_set = (
    train
    .sort_values('sid')
    .reset_index(drop=True)
)

# Metadata table that the COCO-building cell filters by series_uid.
# NOTE(review): the validation/test cells read '../preprocess/nii_1018.csv'
# instead -- confirm that this path is the intended one.
data_info = pd.read_csv('../nii_1018.csv')

In [13]:
# Build a COCO-style detection JSON for the training split and write it to
# '../train_opencxr_only20.json'.
#
# Every row of `train_set` contributes at most one image entry (one per
# series_uid for rows that have a nodule mask) and, when `nod_path` is set,
# one annotation whose bbox is derived from the mask image.
drr_path = '../data/only_20_original/train'  # not referenced further in this cell
mask_path = '../data/only_20_original/mask_all'

coco_output = {}

coco_output['info'] = {
    "description": "LIDC-IDRI dataset with opencxr preprocessing",
    "url": "https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI",
    "version": "1.0.0",
    "year": 2022,
    "contributor": "Yisak Kim, Kyoungmin Jeon, Joohwan Lee, Dabin Seo",
    "date_created": datetime.datetime.utcnow().isoformat(' ')
}

coco_output['images'] = []
coco_output['annotations'] = []
check_double = []  # series_uids whose image entry has already been written

for _, row in tqdm(train_set.iterrows()):
    # pd.isna recognises every missing-value marker; an identity test against
    # np.nan only works when the cell happens to hold that exact object.
    has_nodule = not pd.isna(row.nod_path)

    if has_nodule:
        img_info = data_info[data_info.series_uid == row.series_uid]
        # Resolve the image id on EVERY nodule row. Doing this only for the
        # first row of a series would let later rows reuse whatever study_id
        # the previous loop iteration left behind.
        study_id = img_info.study_id.values[0]
        register_image = row.series_uid not in check_double
    else:
        # Rows without a mask always add an image entry and carry their own study_id.
        study_id = row.study_id
        register_image = True

    if register_image:
        check_double.append(row.series_uid)
        file_name = row.img_path.split('/')[-1].replace('nii.gz', 'png')

        coco_images_format = {
            "id": study_id,
            "width": 1024,   # image size is hard-coded, not read from the file
            "height": 1024,
            "file_name": file_name,
            "license": 0,
            "flickr_url": None,
            "coco_url": None,
            "date_captured": None
        }
        coco_output['images'].append(coco_images_format)

    if not has_nodule:
        continue

    mask_name = row.nod_path.split('/')[-1].replace('nii.gz', 'png')
    # Open inside a context manager so the file handle is released right away.
    with Image.open(os.path.join(mask_path, mask_name)) as mask_file:
        mask_img = np.array(mask_file)
    # Multi-channel masks: use the first channel. Single-channel masks are
    # already 2-D and are used as they are.
    if mask_img.ndim == 3:
        mask_img = mask_img[:, :, 0]

    # Pixels with value >= 10 are treated as nodule (hard-coded threshold).
    nod_pos = np.where(mask_img >= 10)
    if nod_pos[0].size == 0:
        raise ValueError(
            'mask {!r} has no pixel >= 10; cannot derive a bounding box'.format(mask_name)
        )

    x, y = int(nod_pos[1].min()), int(nod_pos[0].min())
    # NOTE(review): extents are max - min (no +1), so a one-pixel-wide mask
    # yields width/height 0 -- confirm this is the intended box convention.
    width = int(nod_pos[1].max()) - x
    height = int(nod_pos[0].max()) - y

    coco_annos_format = {
        "id": len(coco_output['annotations']),
        "image_id": study_id,
        "category_id": 0,
        "segmentation": None,
        "area": width * height,  # bounding-box area, not mask pixel count
        "bbox": [x, y, width, height],
        "iscrowd": 0
    }
    coco_output['annotations'].append(coco_annos_format)

coco_output['categories'] = [
    {
        "id": 0,
        "name": "Nodule",
        "supercategory": "Nodule"
    }
]

# NOTE(review): the COCO data format names this section "licenses" (a list);
# the key is kept as written so existing consumers of this file are unaffected.
coco_output['license'] = {
    "id": 0,
    "name": "Attribution-NonCommercial-ShareAlike License",
    "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/"
}

# ensure_ascii=False can emit non-ASCII characters, so pin the file encoding.
with open('../train_opencxr_only20.json', 'w', encoding='utf-8') as f:
    json.dump(coco_output, f, ensure_ascii=False, indent='\t', cls=NumpyEncoder)

774it [00:02, 303.71it/s]


In [8]:
# Read the positive and negative validation tables.
valid_pos, valid_neg = (
    pd.read_csv(csv_path)
    for csv_path in ('../valid_pos_revised.csv', '../valid_neg_revised.csv')
)

# Stack both tables, order the rows by 'sid' (ascending) and renumber the index.
valid = pd.concat([valid_pos, valid_neg])
valid_set = (
    valid
    .sort_values('sid')
    .reset_index(drop=True)
)

# Metadata table that the COCO-building cell filters by series_uid.
data_info = pd.read_csv('../preprocess/nii_1018.csv')

In [9]:
# Build a COCO-style detection JSON for the validation split and write it to
# '../valid_revised_opencxr.json'.
#
# Every row of `valid_set` contributes at most one image entry (one per
# series_uid for rows that have a nodule mask) and, when `nod_path` is set,
# one annotation whose bbox is derived from the mask image.
drr_path = '../data/original_final/valid'  # not referenced further in this cell
mask_path = '../data/original_final/mask_all'

coco_output = {}

coco_output['info'] = {
    "description": "LIDC-IDRI dataset with opencxr preprocessing",
    "url": "https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI",
    "version": "1.0.0",
    "year": 2022,
    "contributor": "Yisak Kim, Kyoungmin Jeon, Joohwan Lee, Dabin Seo",
    "date_created": datetime.datetime.utcnow().isoformat(' ')
}

coco_output['images'] = []
coco_output['annotations'] = []
check_double = []  # series_uids whose image entry has already been written

for _, row in tqdm(valid_set.iterrows()):
    # pd.isna recognises every missing-value marker; an identity test against
    # np.nan only works when the cell happens to hold that exact object.
    has_nodule = not pd.isna(row.nod_path)

    if has_nodule:
        img_info = data_info[data_info.series_uid == row.series_uid]
        # Resolve the image id on EVERY nodule row. Doing this only for the
        # first row of a series would let later rows reuse whatever study_id
        # the previous loop iteration left behind.
        study_id = img_info.study_id.values[0]
        register_image = row.series_uid not in check_double
    else:
        # Rows without a mask always add an image entry and carry their own study_id.
        study_id = row.study_id
        register_image = True

    if register_image:
        check_double.append(row.series_uid)
        file_name = row.img_path.split('/')[-1].replace('nii.gz', 'png')

        coco_images_format = {
            "id": study_id,
            "width": 1024,   # image size is hard-coded, not read from the file
            "height": 1024,
            "file_name": file_name,
            "license": 0,
            "flickr_url": None,
            "coco_url": None,
            "date_captured": None
        }
        coco_output['images'].append(coco_images_format)

    if not has_nodule:
        continue

    mask_name = row.nod_path.split('/')[-1].replace('nii.gz', 'png')
    # Open inside a context manager so the file handle is released right away.
    with Image.open(os.path.join(mask_path, mask_name)) as mask_file:
        mask_img = np.array(mask_file)
    # Multi-channel masks: use the first channel. Single-channel masks are
    # already 2-D and are used as they are.
    if mask_img.ndim == 3:
        mask_img = mask_img[:, :, 0]

    # Pixels with value >= 10 are treated as nodule (hard-coded threshold).
    nod_pos = np.where(mask_img >= 10)
    if nod_pos[0].size == 0:
        raise ValueError(
            'mask {!r} has no pixel >= 10; cannot derive a bounding box'.format(mask_name)
        )

    x, y = int(nod_pos[1].min()), int(nod_pos[0].min())
    # NOTE(review): extents are max - min (no +1), so a one-pixel-wide mask
    # yields width/height 0 -- confirm this is the intended box convention.
    width = int(nod_pos[1].max()) - x
    height = int(nod_pos[0].max()) - y

    coco_annos_format = {
        "id": len(coco_output['annotations']),
        "image_id": study_id,
        "category_id": 0,
        "segmentation": None,
        "area": width * height,  # bounding-box area, not mask pixel count
        "bbox": [x, y, width, height],
        "iscrowd": 0
    }
    coco_output['annotations'].append(coco_annos_format)

coco_output['categories'] = [
    {
        "id": 0,
        "name": "Nodule",
        "supercategory": "Nodule"
    }
]

# NOTE(review): the COCO data format names this section "licenses" (a list);
# the key is kept as written so existing consumers of this file are unaffected.
coco_output['license'] = {
    "id": 0,
    "name": "Attribution-NonCommercial-ShareAlike License",
    "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/"
}

# ensure_ascii=False can emit non-ASCII characters, so pin the file encoding.
with open('../valid_revised_opencxr.json', 'w', encoding='utf-8') as f:
    json.dump(coco_output, f, ensure_ascii=False, indent='\t', cls=NumpyEncoder)

195it [00:01, 186.65it/s]


In [10]:
# Only the positive CSV is read for the test split; the second table is the
# metadata frame that the COCO-building cell filters by series_uid.
test_set, data_info = map(
    pd.read_csv,
    ('../test_pos_revised.csv', '../preprocess/nii_1018.csv'),
)

In [11]:
# Build a COCO-style detection JSON for the test split and write it to
# '../test_revised_opencxr.json'.
#
# Every row of `test_set` contributes at most one image entry (one per
# series_uid for rows that have a nodule mask) and, when `nod_path` is set,
# one annotation whose bbox is derived from the mask image.
drr_path = '../data/original_final/test'  # not referenced further in this cell
mask_path = '../data/original_final/mask_all'

coco_output = {}

coco_output['info'] = {
    "description": "LIDC-IDRI dataset with opencxr preprocessing",
    "url": "https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI",
    "version": "1.0.0",
    "year": 2022,
    "contributor": "Yisak Kim, Kyoungmin Jeon, Joohwan Lee, Dabin Seo",
    "date_created": datetime.datetime.utcnow().isoformat(' ')
}

coco_output['images'] = []
coco_output['annotations'] = []
check_double = []  # series_uids whose image entry has already been written

for _, row in tqdm(test_set.iterrows()):
    # pd.isna recognises every missing-value marker; an identity test against
    # np.nan only works when the cell happens to hold that exact object.
    has_nodule = not pd.isna(row.nod_path)

    if has_nodule:
        img_info = data_info[data_info.series_uid == row.series_uid]
        # Resolve the image id on EVERY nodule row. Doing this only for the
        # first row of a series would let later rows reuse whatever study_id
        # the previous loop iteration left behind.
        study_id = img_info.study_id.values[0]
        register_image = row.series_uid not in check_double
    else:
        # Rows without a mask always add an image entry and carry their own study_id.
        study_id = row.study_id
        register_image = True

    if register_image:
        check_double.append(row.series_uid)
        file_name = row.img_path.split('/')[-1].replace('nii.gz', 'png')

        coco_images_format = {
            "id": study_id,
            "width": 1024,   # image size is hard-coded, not read from the file
            "height": 1024,
            "file_name": file_name,
            "license": 0,
            "flickr_url": None,
            "coco_url": None,
            "date_captured": None
        }
        coco_output['images'].append(coco_images_format)

    if not has_nodule:
        continue

    mask_name = row.nod_path.split('/')[-1].replace('nii.gz', 'png')
    # Open inside a context manager so the file handle is released right away.
    with Image.open(os.path.join(mask_path, mask_name)) as mask_file:
        mask_img = np.array(mask_file)
    # Multi-channel masks: use the first channel. Single-channel masks are
    # already 2-D and are used as they are.
    if mask_img.ndim == 3:
        mask_img = mask_img[:, :, 0]

    # Pixels with value >= 10 are treated as nodule (hard-coded threshold).
    nod_pos = np.where(mask_img >= 10)
    if nod_pos[0].size == 0:
        raise ValueError(
            'mask {!r} has no pixel >= 10; cannot derive a bounding box'.format(mask_name)
        )

    x, y = int(nod_pos[1].min()), int(nod_pos[0].min())
    # NOTE(review): extents are max - min (no +1), so a one-pixel-wide mask
    # yields width/height 0 -- confirm this is the intended box convention.
    width = int(nod_pos[1].max()) - x
    height = int(nod_pos[0].max()) - y

    coco_annos_format = {
        "id": len(coco_output['annotations']),
        "image_id": study_id,
        "category_id": 0,
        "segmentation": None,
        "area": width * height,  # bounding-box area, not mask pixel count
        "bbox": [x, y, width, height],
        "iscrowd": 0
    }
    coco_output['annotations'].append(coco_annos_format)

coco_output['categories'] = [
    {
        "id": 0,
        "name": "Nodule",
        "supercategory": "Nodule"
    }
]

# NOTE(review): the COCO data format names this section "licenses" (a list);
# the key is kept as written so existing consumers of this file are unaffected.
coco_output['license'] = {
    "id": 0,
    "name": "Attribution-NonCommercial-ShareAlike License",
    "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/"
}

# ensure_ascii=False can emit non-ASCII characters, so pin the file encoding.
with open('../test_revised_opencxr.json', 'w', encoding='utf-8') as f:
    json.dump(coco_output, f, ensure_ascii=False, indent='\t', cls=NumpyEncoder)

18it [00:00, 27.35it/s]
