# Data Query Script

This script queries the data from the LIDC dataset using PyLIDC. It then resizes the data and saves it for further preprocessing in the ML_preprocessing script. 

In [1]:
import pylidc as pl
import numpy as np
import matplotlib.pyplot as plt
import cv2
from scipy.ndimage import zoom

The functions below are called to resize and label the data. The labels are 0 for benign and 1 for malignant. 

compress2d takes in a list of nodule slices and returns a list with one [resized image, label] pair per slice. Each image is resized to 227x227. 

compress3d takes in the list of slices belonging to a single nodule, resizes each slice to 111x111, and resamples the stack to 20 slices along the depth axis. It returns a two-element list containing the resized nodule volume and its label. It is called once per nodule. 

In [4]:
def compress2d(slices, cancer=True, size=(227, 227)):
    """Resize every 2D nodule slice and pair it with a class label.

    Parameters
    ----------
    slices : iterable of array-like
        The 2D images to resize.
    cancer : bool, optional
        True labels every slice 1 (malignant); False labels every slice 0
        (benign). Defaults to True.
    size : tuple of int, optional
        Target size passed to ``cv2.resize`` as its ``dsize`` argument,
        i.e. (width, height). Defaults to (227, 227).

    Returns
    -------
    list
        One ``[resized_image, label]`` pair per input slice, in input order.
        An empty input yields an empty list.
    """
    label = 1 if cancer else 0
    target = tuple(size)
    # np.array() mirrors the original conversion so list-like slices are
    # accepted as well as ndarrays.
    return [[cv2.resize(np.array(image), target), label] for image in slices]

def compress3d(slices, cancer=True, size=(111, 111), depth=20):
    """Resize one nodule's slices and resample the stack to a fixed depth.

    Parameters
    ----------
    slices : iterable of array-like
        The 2D slices that make up a single nodule, in stacking order.
    cancer : bool, optional
        True labels the nodule 1 (malignant); False labels it 0 (benign).
        Defaults to True.
    size : tuple of int, optional
        Target in-plane size passed to ``cv2.resize`` as ``dsize``,
        i.e. (width, height). Defaults to (111, 111).
    depth : int, optional
        Number of slices requested along the stacking axis after
        resampling with ``scipy.ndimage.zoom``. Defaults to 20.

    Returns
    -------
    list
        ``[volume, label]`` where ``volume`` is the resampled stack with the
        slice axis first and ``label`` is 0 or 1.

    Raises
    ------
    ValueError
        If ``slices`` is empty; a depth scale factor cannot be derived from
        zero slices.
    """
    target = tuple(size)
    resized = [cv2.resize(np.array(image), target) for image in slices]
    if not resized:
        raise ValueError("compress3d needs at least one slice to build a volume")

    label = 1 if cancer else 0
    # Only the first (slice) axis is rescaled; the in-plane axes were already
    # brought to `size` by cv2.resize above.
    depth_factor = depth / len(resized)
    volume = zoom(np.stack(resized), (depth_factor, 1, 1))
    return [volume, label]

The malignancy estimate here refers to the LIDC annotation labels as follows:

1:  Highly Unlikely	
2:	Moderately Unlikely	
3:	Indeterminate	
4:	Moderately Suspicious	
5:	Highly Suspicious		

is_cancer is set to True when querying labels 4 and 5, and False when querying 1 and 2.

The data is queried and filtered by malignancy label.

In [5]:
# LIDC malignancy rating to query in this run (1-5).
malignancy_estimate = 1

# Ratings 1-2 are labelled benign and 4-5 malignant. Any other value
# (including 3, "Indeterminate") has no binary label, so stop here instead
# of silently labelling those nodules as cancer.
if malignancy_estimate in (1, 2):
    is_cancer = False
elif malignancy_estimate in (4, 5):
    is_cancer = True
else:
    raise ValueError(
        "malignancy_estimate must be 1, 2, 4 or 5 to derive a benign/malignant "
        "label; got {0!r}".format(malignancy_estimate)
    )

# All annotations whose malignancy rating equals the selected value.
ann = pl.query(pl.Annotation).filter(pl.Annotation.malignancy == malignancy_estimate)

# Passed as `pad` to Annotation.bbox() by the query cells.
# NOTE(review): presumably per-axis (before, after) padding - confirm against
# the pylidc documentation.
padding = [(30,10), (10,25), (0,0)]

# 2D

When querying data for 2D CNN training, follow the next cell up until the "3D" heading. 

In [6]:
dimension = "2d"
start_at = 0
# if querying until the end of the query size, use ann.count() for stop_at
stop_at = 300

nodule_slices_2d = []

# querying of the nodules is done in batches of around 300 due to the memory limitations
for nodule_slice in ann[start_at:stop_at]:
    vol = nodule_slice.scan.to_volume()
    bbox = nodule_slice.bbox(pad=padding)
    # Index the volume once and copy the crop. Without the copy, each stored
    # slice may be a view that keeps the whole scan volume alive in memory
    # (assuming bbox is a tuple of slices, i.e. NumPy basic indexing).
    crop = vol[bbox].copy()
    nodule_slices_2d.extend(crop[:, :, i] for i in range(crop.shape[2]))

# The crops differ in size from nodule to nodule, so the slices are stored in
# a 1-D object array filled element by element. np.array() on such a ragged
# list raises on recent NumPy versions.
np_nodules_2d = np.empty(len(nodule_slices_2d), dtype=object)
for index, image in enumerate(nodule_slices_2d):
    np_nodules_2d[index] = image

Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a 

In [7]:
# Resize every queried slice and attach the benign/malignant label
compressed_slices_2d = compress2d(
    np_nodules_2d,
    cancer=is_cancer,
)

In [12]:
# Save the data, and if needed, restart the querying process with a new batch
# NOTE(review): ':' is not allowed in Windows file names; the name is left
# unchanged here because other scripts may load the file by this exact name.
file_name = "{0}_Malignancy_{1}_{2}:{3}.npy".format(dimension, malignancy_estimate, start_at, stop_at)

# Each entry is an [image, label] pair, which is not a rectangular numeric
# array. Build the (N, 2) object array explicitly: handing the nested list
# straight to np.save raises ValueError on NumPy >= 1.24.
labelled_2d = np.empty((len(compressed_slices_2d), 2), dtype=object)
for row, (image, label) in enumerate(compressed_slices_2d):
    labelled_2d[row, 0] = image
    labelled_2d[row, 1] = label

# Object arrays are pickled by np.save; load with np.load(..., allow_pickle=True).
np.save(file_name, labelled_2d)

# 3D

When querying data for the 3D model, begin at the next cell and run to the end of the script.

In [4]:
dimension = "3d"
start_at = 0
# if querying until the end of the query size, use ann.count() for stop_at
stop_at = 300

nodules_3d = []

# querying of the nodules is done in batches of around 300 due to the memory limitations. All nodule slices corresponding to the
# same nodule are grouped together in a single list. Thus, by the end of this loop, nodules_3d will be a list of lists
# with each list holding all slices of a particular nodule
for nodule in ann[start_at:stop_at]:
    vol = nodule.scan.to_volume()
    bbox = nodule.bbox(pad=padding)
    # Index the volume once and copy the crop. Without the copy, each stored
    # slice may be a view that keeps the whole scan volume alive in memory
    # (assuming bbox is a tuple of slices, i.e. NumPy basic indexing).
    crop = vol[bbox].copy()
    nodules_3d.append([crop[:, :, i] for i in range(crop.shape[2])])

# each list of nodule slices corresponding to a single nodule is converted into a numpy array for further preprocessing
# (slice axis first, because the per-slice 2D arrays are stacked along axis 0)
np_nodules_3d = [np.array(nodule_slices) for nodule_slices in nodules_3d]

Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a moment.
Loading dicom files ... This may take a 

In [6]:
# Resize each nodule volume and attach the benign/malignant label
compressed_nodules_3d = [
    compress3d(nodule_volume, cancer=is_cancer)
    for nodule_volume in np_nodules_3d
]

In [9]:
# save the data
# NOTE(review): ':' is not allowed in Windows file names; the name is left
# unchanged here because other scripts may load the file by this exact name.
file_name = "{0}_Malignancy_{1}_{2}:{3}.npy".format(dimension, malignancy_estimate, start_at, stop_at)

# Each entry is a [volume, label] pair, which is not a rectangular numeric
# array. Build the (N, 2) object array explicitly: handing the nested list
# straight to np.save raises ValueError on NumPy >= 1.24.
labelled_3d = np.empty((len(compressed_nodules_3d), 2), dtype=object)
for row, (volume, label) in enumerate(compressed_nodules_3d):
    labelled_3d[row, 0] = volume
    labelled_3d[row, 1] = label

# Object arrays are pickled by np.save; load with np.load(..., allow_pickle=True).
np.save(file_name, labelled_3d)