In [1]:
import pandas as pd
import os, json
import time
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import random

In [34]:
# !pip install pydicom
!pip install pylidc



In [35]:
import pydicom
import pylidc as pl
from pylidc.utils import consensus

In [36]:
myProjectID = "w210-capstone-2021"
# BigQuery
from google.cloud import bigquery
bq = bigquery.Client()
bigquery_client = bigquery.Client(project='w210-capstone-2021')

In [37]:
extract_metadata = """
WITH
  all_lidc_ct_series AS (
  SELECT
    DISTINCT(SeriesInstanceUID),
    StudyInstanceUID,  
    PatientID,
    SliceThickness,
    ARRAY_TO_STRING(PixelSpacing,"/") as pixelspa 
  FROM
    `canceridc-data.idc_views.dicom_all`
  WHERE
    Modality = "CT"
    AND collection_id = "lidc_idri"
    AND cast(SliceThickness as decimal) < 3.0
    )
SELECT PatientID,SliceThickness,pixelspa,StudyInstanceUID,SeriesInstanceUID FROM
  all_lidc_ct_series
ORDER BY
  PatientID
  """
patientList = bq.query(extract_metadata).to_dataframe() #Get a dataframe from the query data
print("dataset shape",patientList.shape)
patientList.head() #Take a peek at the data

dataset shape (897, 5)


Unnamed: 0,PatientID,SliceThickness,pixelspa,StudyInstanceUID,SeriesInstanceUID
0,LIDC-IDRI-0001,2.5,0.703125/0.703125,1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288...,1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636...
1,LIDC-IDRI-0002,1.25,0.681641/0.681641,1.3.6.1.4.1.14519.5.2.1.6279.6001.490157381160...,1.3.6.1.4.1.14519.5.2.1.6279.6001.619372068417...
2,LIDC-IDRI-0003,2.5,0.820312/0.820312,1.3.6.1.4.1.14519.5.2.1.6279.6001.101370605276...,1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...
3,LIDC-IDRI-0004,1.25,0.822266/0.822266,1.3.6.1.4.1.14519.5.2.1.6279.6001.191425307197...,1.3.6.1.4.1.14519.5.2.1.6279.6001.323541312620...
4,LIDC-IDRI-0005,2.5,0.664062/0.664062,1.3.6.1.4.1.14519.5.2.1.6279.6001.190188259083...,1.3.6.1.4.1.14519.5.2.1.6279.6001.129007566048...


In [39]:
base_gs_uri = 'gs://idc-tcia-lidc-idri/dicom/'
patientList['image_link'] = base_gs_uri + patientList['StudyInstanceUID'] + '/' + patientList['SeriesInstanceUID']
patientList['image_link'][0:5].to_csv("gcs_paths_3.txt",header=False, index=False)
!head gcs_paths_3.txt

gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178/1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192
gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.490157381160200744295382098329/1.3.6.1.4.1.14519.5.2.1.6279.6001.619372068417051974713149104919
gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.101370605276577556143013894866/1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615202213033480003264
gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.191425307197546732281885591780/1.3.6.1.4.1.14519.5.2.1.6279.6001.323541312620128092852212458228
gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.190188259083742759886805142125/1.3.6.1.4.1.14519.5.2.1.6279.6001.129007566048223160327836686225


In [23]:
!mkdir downloaded_cohort_3
!cat gcs_paths_3.txt | gsutil -u $myProjectID -m cp -Ir ./downloaded_cohort_3

Copying gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178/1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192/1.3.6.1.4.1.14519.5.2.1.6279.6001.117899712006236812875624866487.dcm...
Copying gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178/1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192/1.3.6.1.4.1.14519.5.2.1.6279.6001.115976726221266363067740350040.dcm...
Copying gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178/1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192/1.3.6.1.4.1.14519.5.2.1.6279.6001.100954823835603369147775570297.dcm...
Copying gs://idc-tcia-lidc-idri/dicom/1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178/1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192/1.3.6.1.4.1.14519.5.2.1.6279.6001.142652236215375821016559311870.dcm...
Copying gs://idc-tcia-lidc-idri/dicom/1.

In [24]:
!git clone https://github.com/pieper/dicomsort.git
!pip install pydicom
!python dicomsort/dicomsort.py --help

Cloning into 'dicomsort'...
remote: Enumerating objects: 126, done.[K
remote: Total 126 (delta 0), reused 0 (delta 0), pack-reused 126[K
Receiving objects: 100% (126/126), 37.03 KiB | 131.00 KiB/s, done.
Resolving deltas: 100% (63/63), done.

% dicomsort.py --help
dicomsort [options...] sourceDir targetDir/<patterns>

 where [options...] can be:
    [-z,--compressTargets] - create a .zip file in the target directory
    [-d,--deleteSource] - remove source files/directories after sorting
    [-f,--forceDelete] - remove source without confirmation
    [-k,--keepGoing] - report but ignore dupicate target files
    [-v,--verbose] - print diagnostics while processing
    [-s,--symlink] - create a symlink to dicom files in sourceDir instead of copying them
    [-t,--test] - run the built in self test (requires internet)
    [-u,--unsafe] - do not replace unsafe characters with '_' in the path
    [--help] - print this message

 where sourceDir is directory to be scanned or "" (null string)

In [25]:
!python dicomsort/dicomsort.py -u downloaded_cohort_3 cohort_sorted_3/%PatientID/%StudyInstanceUID/%SeriesInstanceUID/%SOPInstanceUID.dcm

100%|████████████████████████████████████████| 908/908 [00:01<00:00, 610.84it/s]
Files sorted


In [26]:
data_folder = os.getcwd() + '/cohort_sorted_3/'
data_folder

'/home/jupyter/w210finalproject/cohort_sorted_3/'

In [27]:
f = open ('.pylidcrc','w')            #For GCP
# f = open ('/root/.pylidcrc','w')    #For Colab
f.write('[dicom]'+'\n')
f.write('path =' + data_folder +'\n')
f.write('warn = True')
f.close()

In [30]:
patient_list = os.listdir(data_folder)
patient_list

['LIDC-IDRI-0001',
 'LIDC-IDRI-0005',
 'LIDC-IDRI-0002',
 'LIDC-IDRI-0004',
 'LIDC-IDRI-0003']

In [32]:
pa=0
scan = pl.query(pl.Scan).filter(pl.Scan.patient_id.in_(patient_list))
nodules_annotation = scan[pa].cluster_annotations()
vol = scan[pa].to_volume()
vol.shape

Loading dicom files ... This may take a moment.


RuntimeError: Could not establish path to dicom files. Have you specified the `path` option in the configuration file /home/jupyter/.pylidcrc?

In [17]:
for pa in range(len(patient_list)):
    print ("Current patient ID =", patient_list[pa])
    
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id.in_(patient_list))
    nodules_annotation = scan[pa].cluster_annotations()
    vol = scan[pa].to_volume()
    images = []
    for i in range(vol.shape[2]):
        images.append(vol[:,:,i])
    images = np.vstack(images)
    np.save('data_npfiles/'+patient_list[pa]+'_image.npy', images)
    # CT_image_data.append(vol)
    
    cmask_CT = []
    cbbox_CT = []
    masks_CT = []
    for nodule_idx, nodule in enumerate(nodules_annotation):
        cmask, cbbox, masks = consensus(nodule)
        cmask_CT.append(cmask)
        cbbox_CT.append(cbbox)
        masks_CT.append(masks)   

    CT_mask = np.zeros_like(vol)    
    nodule_num = len(cmask_CT)
    
    for i in range(nodule_num):
        cmask = cmask_CT[i]
        cbbox = cbbox_CT[i]
        masks = masks_CT[i]    
        CT_mask[cbbox] += cmask
    masks = []
    for i in range(CT_mask.shape[2]):
        masks.append(CT_mask[:,:,i])
    masks = np.vstack(masks)
    np.save('data_npfiles/'+patient_list[pa]+'_mask.npy', masks)
    
    # CT_mask_data.append(CT_mask)

Current patient ID = LIDC-IDRI-0001
Loading dicom files ... This may take a moment.




RuntimeError: Could not establish path to dicom files. Have you specified the `path` option in the configuration file /home/jupyter/.pylidcrc?