In [8]:
import glob
import h5py
import math
import matplotlib.pyplot as plt
import openslide
import os
import pandas as pd
import random

from data_processing import *
from PIL import Image

## Constants

In [9]:
# Paths to raw TMA files.
PATH_TO_RAW_DATA = "/deep/group/aihc-bootcamp-fall2021/lymphoma/raw"
PATH_TO_IMAGES = os.path.join(PATH_TO_RAW_DATA, "svs")

# Path to processed data (extracted TMA patches)
PATH_TO_PROCESSED_DATA = "/deep/group/aihc-bootcamp-fall2021/lymphoma/processed"
PATH_TO_TMA_PATCHES = os.path.join(PATH_TO_PROCESSED_DATA, "tma_patches")

# Path to annotations and labels (diagnoses)
PATH_TO_ANNOTATIONS_CSV = os.path.join(PATH_TO_RAW_DATA, "cores")
PATH_TO_DIAGNOSES = os.path.join(PATH_TO_RAW_DATA, "core_labels.csv")

In [10]:
# Get the list of paths to TMA svs files and TMA annotations csv files.
# We want the tma_slides_paths[i] to correspond to tma_annotations_paths[i] (hence, the calls to "sort").
tma_slides_paths = sorted(glob.glob(PATH_TO_IMAGES + "/*_TMA*.svs"), key= lambda s : s.split("_")[1])
tma_annotations_paths = sorted(glob.glob(PATH_TO_ANNOTATIONS_CSV + "/TMA*_annotations.csv"))

## Read Raw Data for all TMAs

In [11]:
tma_ids = [1, 2, 3, 4, 5, 6, 6, 8]
tma_slides = [openslide.OpenSlide(tma_slide) for tma_slide in tma_slides_paths]
tma_annotations = [pd.read_csv(filename) for filename in tma_annotations_paths]

In [12]:
tma_case_to_diagnosis = pd.read_csv(PATH_TO_DIAGNOSES)
tma_case_to_diagnosis.head()

Unnamed: 0.1,Unnamed: 0,TMA ID,CASE,2017 WHO DIAGNOSIS,CLPA Diagnostic Bin,label
0,0,1,E0001 B,NOT ON TMA,Excluded,-1
1,1,1,E0002 B,NON-DIAGNOSTIC,Excluded,-1
2,2,1,E0003 B,Classic Hodgkin Lymphoma,HL,1
3,3,1,E0004 B,"Follicular lymphoma, grade 1-2",FL,3
4,4,1,E0005 B,"Diffuse large B cell lymphoma, NOS",DLBCL,0


In [13]:
CASE = "CASE"
WHO_DIAGNOSIS = "2017 WHO DIAGNOSIS"
CLPA_DIAGNOSIS = "CLPA Diagnostic Bin"
LABEL = "label"

def get_field_by_patient_id(tma_id, patient_id, field):
    missing_ids = set(["placenta", "tonsil"])
    
    add_b_ids = set(["E0184", "E0147", "E0137"])
    add_a_ids = set(["E0307"])
    
    if patient_id in missing_ids:
        print(f"Could not find {field} for: {patient_id}")
        return None
    elif patient_id in add_b_ids:
        patient_id += " B"
    elif patient_id in add_a_ids:
        patient_id += " A"
    elif not patient_id.rstrip()[-2].isspace() and patient_id.rstrip()[-1].isalpha():  # Add space between alphabet and number: "E0456B" -> "E0456 B"
        patient_id = patient_id[:-1] + " " + patient_id[-1]
    
    condition = (tma_case_to_diagnosis[CASE] == patient_id) & (tma_case_to_diagnosis["TMA ID"] == tma_id)
    
    if len(tma_case_to_diagnosis[condition][field].values) == 0:
        print(f"Could not find {field} for: {patient_id}")
        return None
    
    return tma_case_to_diagnosis[condition][field].values[0]

print(get_field_by_patient_id(1, "E0090 C", WHO_DIAGNOSIS))
print(get_field_by_patient_id(4, "E0456 B", WHO_DIAGNOSIS))

NON-DIAGNOSTIC
Diffuse large B cell lymphoma, non-germinal center type


## Create HDF5 file for each TMA

In [14]:
def tma_to_hdf5_file(tma_id, tma_annotations, tma_slide):
    tma_hdf5_filename = f'tma{tma_id}.hdf5'
    if os.path.exists(tma_hdf5_filename):
        os.remove(tma_hdf5_filename)
    f = h5py.File(tma_hdf5_filename, "w")
    patient_ids = set()
    
    patient_id_repeats = {}
    
    for index, row in tma_annotations.iterrows():
        patient_id = row["Name"]
        
        if not isinstance(patient_id, str):
            continue
        
        name = patient_id
        
        who_diagnosis = get_field_by_patient_id(tma_id, patient_id, WHO_DIAGNOSIS)
        clpa_diagnosis = get_field_by_patient_id(tma_id, patient_id, CLPA_DIAGNOSIS)
        label = get_field_by_patient_id(tma_id, patient_id, LABEL)
        
        if who_diagnosis == None:
            continue
        
        # Deal with duplicate patients
        if (patient_id not in patient_ids):
            patient_id_repeats[patient_id] = 0

        patient_id_repeats[patient_id] += 1
        name += f"_v{patient_id_repeats[patient_id]}"
            
        xs, ys, width, height = int(row["X"]), int(row["Y"]), int(row["Width"]), int(row["Height"])
        xe, ye = xs + width, ys + height
        patches = get_patches_from_core(tma_slide, xs, ys, xe, ye)
        
        if patches.size == 0:
            print(f"No patches found for TMA: {tma_id}, Patient: {patient_id}")
            continue
        
        dset = f.create_dataset(name, data=patches, dtype='uint8')
        dset.attrs['tma_id'] = tma_id
        dset.attrs['patient_id'] = patient_id
        dset.attrs['who_diagnosis'] = who_diagnosis
        dset.attrs['clpa_diagnosis'] = clpa_diagnosis
        dset.attrs['label'] = label
        patient_ids.add(patient_id)
        
    f.close()

def extract_raw_data_to_hdf5(tma_ids, tma_slides, tma_annotations):
    assert(len(tma_slides) == len(tma_annotations))
    for i in range(len(tma_slides)):
        tma_id = tma_ids[i]
        print(f"Creating HDF5 file for TMA: {tma_id}\n")
        tma_to_hdf5_file(tma_id, tma_annotations[i], tma_slides[i])

extract_raw_data_to_hdf5(tma_ids, tma_slides, tma_annotations)

Creating HDF5 file for TMA: 1

Could not find 2017 WHO DIAGNOSIS for: placenta
Could not find CLPA Diagnostic Bin for: placenta
Could not find label for: placenta
Could not find 2017 WHO DIAGNOSIS for: tonsil
Could not find CLPA Diagnostic Bin for: tonsil
Could not find label for: tonsil
Could not find 2017 WHO DIAGNOSIS for: tonsil
Could not find CLPA Diagnostic Bin for: tonsil
Could not find label for: tonsil
Could not find 2017 WHO DIAGNOSIS for: placenta
Could not find CLPA Diagnostic Bin for: placenta
Could not find label for: placenta
Creating HDF5 file for TMA: 2

Could not find 2017 WHO DIAGNOSIS for: tonsil
Could not find CLPA Diagnostic Bin for: tonsil
Could not find label for: tonsil
Could not find 2017 WHO DIAGNOSIS for: tonsil
Could not find CLPA Diagnostic Bin for: tonsil
Could not find label for: tonsil
Could not find 2017 WHO DIAGNOSIS for: placenta
Could not find CLPA Diagnostic Bin for: placenta
Could not find label for: placenta
Could not find 2017 WHO DIAGNOSIS for:

OSError: Can't prepare for writing data (file write failed: time = Sun May  1 15:41:07 2022
, filename = 'tma2.hdf5', file descriptor = 58, errno = 5, error message = 'Input/output error', buf = 0x55c26fab79d0, total write size = 12644352, bytes this sub-write = 12644352, bytes actually written = 18446744073709551615, offset = 2886532352)