In [1]:
"""
Import libraries and setup DicomMetaParser object.
"""

import sys
from pathlib import Path

import pandas as pd

# Later cells write their results into ./output, so create it up front.
Path("output").mkdir(parents=True, exist_ok=True)

# Add the parent directory to the module search path before importing the
# project-local packages below.
sys.path.append("../")

from lib.dicom import DicomMetaParser
from lib.utils import load
from lidc.utils import parse_subject

# Parser shared by the following cells; reads from /data/lidc with 4 workers.
parser = DicomMetaParser("/data/lidc", parse_subject, n_workers=4)

In [2]:
"""
Run this cell to re-parse DICOM metadata.
"""

# Location of the cached metadata dataframe.
metadata_pkl = "output/metadata.pkl"

df = parser.parse()
df.to_pickle(metadata_pkl)

# Uncomment to check dataframe column properties
# parser.check_df(df)

Parsing DICOM metadata:   0%|          | 0/244527 [00:00<?, ?it/s]

Field: AnatomicRegionSequence is unhashable, setting to nan...
Field: VOILUTSequence is unhashable, setting to nan...
Field: AnatomicRegionSequence is unhashable, setting to nan...Field: AnatomicRegionSequence is unhashable, setting to nan...

Field: AnatomicRegionSequence is unhashable, setting to nan...
Field: VOILUTSequence is unhashable, setting to nan...Field: VOILUTSequence is unhashable, setting to nan...

Field: RequestAttributesSequence is unhashable, setting to nan...
Field: RequestAttributesSequence is unhashable, setting to nan...
Field: RequestAttributesSequence is unhashable, setting to nan...Field: RequestAttributesSequence is unhashable, setting to nan...

Field: RequestedProcedureCodeSequence is unhashable, setting to nan...
Field: PatientOrientationCodeSequence is unhashable, setting to nan...
Field: PatientOrientationCodeSequence is unhashable, setting to nan...
Field: ViewCodeSequence is unhashable, setting to nan...
Field: ViewCodeSequence is unhashable, setting to

In [3]:
"""
Run this cell to generate the DICOM paths used to create the dataset, based on the metadata.pkl generated above.
"""

df = pd.read_pickle("output/metadata.pkl")
subject_info = load("input/subject_info.pkl")

# Remove subjects listed as bad (blurry / movement artifacts).
# NOTE(review): this comment previously also mentioned negative subjects, but
# only subject_info["bad"] is excluded here -- confirm whether that is intended.
df = df[~df["Subject"].isin(subject_info["bad"])]

# Keep only rows with PatientPosition 'FFS' and MONOCHROME2 pixel data.
df = df.query("PatientPosition == 'FFS' & PhotometricInterpretation == 'MONOCHROME2'")

# Keep only ORIGINAL / PRIMARY images. The check needs nothing but the
# ImageType column, so map over that column instead of using a row-wise
# DataFrame.apply(axis=1), which builds a Series for every row.
# astype(bool) keeps the mask boolean even when no rows are left.
is_primary = (
    df["ImageType"]
    .map(lambda image_type: image_type[0] == "ORIGINAL" and image_type[1] == "PRIMARY")
    .astype(bool)
)
df = df[is_primary].reset_index(drop=True)

# Select DICOM series (based on SeriesNumber) with minimum slice thickness from each CT Type
df = parser.select_min(df, sub_category=[])

# Check that specified fields are consistent within each DICOM Series
df = parser.check_series_consistency(
    df,
    [
        "ImageOrientationPatient",
        "SeriesDescription",
        "PixelSpacing",
    ],
)

# Tag every remaining row with Type "lung". assign() returns a new frame, so
# this does not write into whatever object the helper above returned.
df = df.assign(Type="lung")

# Extract file paths for each unique DICOM series (based on Subject and Type)
paths_df = parser.to_path_list(df, sort_by="Path")
paths_df.to_pickle("output/ct_paths.pkl")
