In [1]:
from lib.dicom import DicomMetaParser
import pandas as pd
from typing import Dict, Any, List
import numpy as np

def parse_subject(path: str, *_: Any) -> str:
    return path.split("/")[-2].replace("subject_", "")

def classify_images(df: pd.DataFrame) -> pd.DataFrame:
    """
    Classify images into localizer, lung and mediastinum. Filter out other types
    """
    is_localizer = df["ScanOptions"] == "SURVIEW"
    is_lung = df["SeriesDescription"].str.contains("lung", na=False)
    is_med = df["SeriesDescription"].str.contains("med", na=False)
    df["Type"] = np.where(
        is_localizer,
        "localizer",
        np.where(is_lung, "lung", np.where(is_med, "med", "others")),
    )
    df = df.query("Type != 'others'").reset_index(drop=True)
    return df


def filter_all_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Select subjects with lung and med and localizer views
    """
    has_all_types = df.groupby("Subject").apply(lambda x: x["Type"].nunique() == 3)
    subjects = list(has_all_types[has_all_types].index)
    print(
        f"{len(subjects)}/{df['Subject'].nunique()} subjects has lung + med + localizer"
    )
    df = df[df["Subject"].isin(subjects)]
    return df.reset_index(drop=True)

In [2]:
parser = DicomMetaParser("/data/covid_ct", parse_subject)
# df = parser.parse()
# df.to_pickle("output/covid_ct_new.pkl")
df = pd.read_pickle("output/covid_ct_new.pkl")

In [12]:
from ct_positive import ct_train, ct_test
subject_map = pd.read_pickle("output/covid_ct_train_meta.pkl")["subject"]
positives = [str(subject_map[i]) for i in ct_train]

In [16]:
indices = pickle.load(open("output/pneumonia.pkl", "rb"))
len(indices["train"])

345

In [3]:
df = df.query("Subject != 133")
df = df[(~df["Subject"].isin(BAD)) & (~df["Subject"].isin(MISALIGNED))]

In [None]:
def select_dcms():
    # Read data and remove weird entry
    
    df = df.query("Subject != 133")
    df = df[(~df["Subject"].isin(BAD)) & (~df["Subject"].isin(MISALIGNED))]
    df = check_unique(df)
    df = check_series_consistency(
        df,
        [
            "ImagePositionPatient0",
            "ImagePositionPatient1",
            "ImageOrientationPatient",
            "SeriesDescription",
            "PixelSpacing0",
            "PixelSpacing1",
            "ImageType3",
        ],
    )
    df = classify_images(df)
    df = select_min(df)
    df = filter_all_types(df)

    # Save info for each subject
    groups = df.groupby(["Subject"])
    img_types = ["lung", "med", "localizer"]
    paths: List[Dict[str, Union[List[str], str]]] = []
    for i, (subject, group_df) in enumerate(groups):
        sorted_df = group_df.sort_values(by="SliceLocation")
        info = {
            "overall_index": i,
            "subject": subject,
        }
        for img_type in img_types:
            img_paths = sorted_df[sorted_df["Type"] == img_type]["Path"].tolist()
            info[img_type] = tuple(img_paths)

        paths.append(info)

    # # Shuffle paths
    Random(1035).shuffle(paths)
    print(f"Found {len(paths)} valid subjects.")

    train_df = pd.DataFrame.from_records(paths[:680])
    train_df.to_pickle(str(CONFIG.OUTPUT_DIR / "covid_ct_train_meta.pkl"))

    test_df = pd.DataFrame.from_records(paths[680:780])
    test_df.to_pickle(str(CONFIG.OUTPUT_DIR / "covid_ct_test_meta.pkl"))


In [None]:
parser = DicomMetaParser("/data/covid_ct", parse_subject)
df = parser.parse()

In [17]:
import pickle

In [19]:
indices = pickle.load(open("output/pneumonia.pkl", "rb"))
train_subject_map = pd.read_pickle("output/covid_ct_train_meta.pkl")["subject"]
test_subject_map = pd.read_pickle("output/covid_ct_test_meta.pkl")["subject"]
trains = [str(train_subject_map[i]) for i in set(indices["train"] + ct_train)]
tests = [str(test_subject_map[i]) for i in set(indices["test"] + ct_test)]
print(len(trains), len(tests))

640 95


In [22]:
trains

[284,
 693,
 816,
 719,
 55,
 505,
 322,
 262,
 712,
 977,
 989,
 843,
 1056,
 550,
 560,
 306,
 1088,
 653,
 807,
 858,
 18,
 829,
 761,
 607,
 502,
 98,
 311,
 242,
 999,
 552,
 1100,
 79,
 298,
 688,
 337,
 673,
 203,
 598,
 874,
 730,
 1037,
 413,
 280,
 602,
 379,
 190,
 400,
 664,
 1064,
 707,
 216,
 292,
 507,
 138,
 410,
 514,
 625,
 640,
 338,
 540,
 1060,
 1023,
 300,
 106,
 622,
 700,
 524,
 161,
 265,
 20,
 467,
 867,
 665,
 1011,
 856,
 173,
 498,
 188,
 666,
 42,
 1079,
 1019,
 784,
 10,
 359,
 1076,
 227,
 871,
 561,
 1086,
 506,
 405,
 346,
 837,
 975,
 87,
 401,
 1025,
 223,
 313,
 973,
 39,
 539,
 170,
 286,
 739,
 1103,
 1007,
 587,
 266,
 661,
 376,
 382,
 52,
 651,
 490,
 1069,
 348,
 399,
 1038,
 296,
 304,
 776,
 148,
 75,
 30,
 230,
 773,
 134,
 978,
 440,
 14,
 678,
 971,
 1001,
 58,
 635,
 31,
 810,
 610,
 140,
 221,
 327,
 511,
 146,
 765,
 979,
 404,
 737,
 491,
 178,
 351,
 789,
 334,
 714,
 370,
 594,
 168,
 728,
 1058,
 149,
 801,
 683,
 762,
 86,
 435,
 

In [None]:
pd.DataFrame({"train": trains, "test": tests})