# Comparison Tool: database-dl & v-patients

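This notebook walks through every label map stored in v-patients and checks whether database-dl contains the corresponding row and all required entries for it. Additionally, it checks whether the data in v-patients is usable for deep learning, i.e. whether each image and its label map have matching affine matrices and shapes.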

In [1]:
import os
import pandas as pd
import re
import torchio as tio
import numpy as np
from py_topping.data_connection.sharepoint import lazy_SP365

In [2]:
# Local locations of the v-patients tree and of the downloaded database-dl workbook.
root_dir_vpatients = "/home/database/v-patients/"
database_dl_path = "/home/database/database-dl.xlsx"

# The SharePoint client secret must not live in the notebook (it ends up in
# version control and in every shared copy). It is read from the environment
# instead. NOTE(review): the secret that was previously committed here should
# be considered leaked and be rotated.
sharepoint_client_secret = os.environ.get("SHAREPOINT_CLIENT_SECRET")
if not sharepoint_client_secret:
    raise RuntimeError(
        "Environment variable SHAREPOINT_CLIENT_SECRET is not set; "
        "it is required to download database-dl.xlsx from SharePoint."
    )

sharepoint = lazy_SP365(
    site_url="https://virtonomyio.sharepoint.com/sites/DeepLearningLionel/",
    client_id="9192d1fd-3fd7-4f57-88be-f3b4b3d151b0",
    client_secret=sharepoint_client_secret,
)

# Fetch the current database-dl workbook so the checks below run against the latest version.
sharepoint.download(
    sharepoint_location="/sites/DeepLearningLionel/Freigegebene Dokumente/Data Annotation/database-dl.xlsx",
    local_location=database_dl_path,
)

Authen OK
Download OK File


In [3]:
# Read the downloaded database-dl workbook into a DataFrame.
with open(database_dl_path, mode="rb") as workbook:
    data = pd.read_excel(workbook)

# Represent every missing cell as None so later cells can test entries with `is None`.
data = data.where(pd.notnull(data), None).replace({np.nan: None})

# Entries of the v-patients root directory, ordered by name.
patients = os.listdir(root_dir_vpatients)
patients.sort()

In [4]:
# Display the sorted directory entries found under root_dir_vpatients.
patients

['00001',
 '00002',
 '00003',
 '00004',
 '00005',
 '00006',
 '00007',
 '00008',
 '00010',
 '00026',
 '00047',
 '00092',
 '00097',
 '00100',
 '00104',
 '00106',
 '00110',
 '00119',
 '00121',
 '00122',
 '00123',
 '00124',
 '00125',
 '00126',
 '00127',
 '00128',
 '00129',
 '00130',
 '00131',
 '00132',
 '00133',
 '00136',
 '00137',
 '00146',
 '00154',
 '00171',
 '00172',
 '00173',
 '00174',
 '00175',
 '00177',
 '00178',
 '00180',
 '00181',
 '00182',
 '00183',
 '00184',
 '00185',
 '00186',
 '00187',
 '00188',
 '00189',
 '00190',
 '00191',
 '00192',
 '00193',
 '00194',
 '00195',
 '00196',
 '00197',
 '00198',
 '00199',
 '00201',
 '00203',
 '00262',
 '00263',
 '00340',
 '00353',
 '00360',
 '00379',
 '00504',
 '00617',
 '00618',
 '00663',
 '00667',
 '00673',
 '00675',
 '00681',
 '00692',
 '00693',
 '00702',
 '00707',
 '00708',
 '00719',
 '00720',
 '00721',
 '00722',
 '00723',
 '00726',
 '00729',
 '00730',
 '00732',
 '00735',
 '00740',
 '00741',
 '00743',
 '00746',
 '00747',
 '00748',
 '00749',


In [5]:
# Display the database-dl table read from the Excel workbook (missing cells were replaced by None).
data

Unnamed: 0,PatientID,Image Modality,ImageRegion,Series,Phase,Timestamp,WholeHeart,WholeAorta,Thorax,Spine,Diaphragm,RA_SVC_IVC,Annotator,Cropped,ContrastLeftHeart,ContrastRightHeart,Notes
0,0.0,CT,chest,3,,30.0,,,,,,,,,high,medium,implants
1,1.0,CT,head to pelvis,5,ED,,yes,yes,,,,,Mahshad,yes,high,high,"femoral implant, significant aorta calcifications"
2,2.0,CT,head to pelvis,5,ED,,yes,yes,,,,,Mahshad,yes,high,low,"pacemaker, aortic valve replacement"
3,3.0,CT,head to pelvis,5,ED,,yes,yes,,,,,diogo,yes,high,medium,aortic valve implant
4,4.0,CT,head to pelvis,5,ED,,yes,yes,,,,,Mahshad,yes,medium,medium,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1763,948.0,CT,chest,5,ES,0.0,yes,yes,,,yes,yes,,,medium,high,medira-P02
1764,948.0,CT,chest,5,ED,100.0,yes,yes,,,yes,yes,,,medium,high,medira-P02
1765,949.0,CT,heart,6,ED,60.0,,no,no,no,,,,,medium,medium,"BerlinHeart Pilot Patient-3, Univentricular"
1766,949.0,CT,chest,7,ED,60.0,,,,,,,,,medium,medium,"BerlinHeart Pilot Patient-3, Univentricular"


In [23]:
# Counters reported in the summary at the end of the cell.
serious_problems = 0
soft_problems = 0

# Reporting switches:
#   serious_problems_only - when True, soft problems are counted but not printed.
#   phase_problems        - when True, a missing Phase entry is printed and counted as serious.
serious_problems_only = True
phase_problems = False

# Label files that are deliberately excluded from the check.
skipped_label_files = {
    "pat822-ser301-two_hearts-label.nii.gz",
    "pat941-ser005-uv_heart-label.nii.gz",
    "pat942-ser3002-uv_heart-label.nii.gz",
}

# Expected file name layout: pat<ID>-ser<SERIES>[-frame<FRAME>]-<structure>-label.nii.gz
# Parsing the fields by name (instead of "all numbers in order") prevents a frame
# number from being mistaken for the series number, e.g. in "pat353-ser_merged-frame080-...".
label_name_pattern = re.compile(r"pat([0-9]+)-ser([0-9]+)(?:-frame([0-9]+))?")

# Structure suffixes removed from a label file name to obtain the image file name.
# "-RA_SVC_IVC_CS" must be removed before "-RA_SVC_IVC" because the latter is a prefix of it.
structure_suffixes = ("-heart", "-thorax", "-RA_SVC_IVC_CS", "-RA_SVC_IVC")

for patient in patients:
    dl_data_dir = os.path.join(root_dir_vpatients, patient, "dl-data")
    for file in os.listdir(dl_data_dir):
        if "label" not in file or file in skipped_label_files:
            continue

        # Extract patient id, series number and (optional) frame from the file name.
        name_match = label_name_pattern.search(file)
        if name_match is None:
            # e.g. patient 353 uses "ser_merged" instead of a series number
            print("Please check series and frame number for: " + file)
            serious_problems += 1
            continue
        patient_id, series, frame = name_match.groups()

        # Select the database-dl row for this file. int() accepts leading zeros,
        # so "000" needs no special handling.
        dl_row = data.loc[data["PatientID"] == int(patient_id)]
        dl_row = dl_row.loc[dl_row["Series"] == int(series)]
        if frame is not None:
            dl_row = dl_row.loc[dl_row["Timestamp"] == int(frame)]
        elif dl_row.shape[0] > 0 and dl_row.iloc[0]["Timestamp"] is not None:
            # Not all files carry a frame; but then database-dl should not list one either.
            print(patient + " has no frame information, but there is an entry in database-dl")
            serious_problems += 1

        # No matching row: series and/or frame in the file name do not correspond to database-dl.
        if dl_row.shape[0] == 0:
            print("Please check series and frame number for: " + file)
            serious_problems += 1
            continue

        row = dl_row.iloc[0]

        # General entries
        if row["Image Modality"] is None:
            soft_problems += 1
            if not serious_problems_only:
                print("File: " + file + " is missing Image Modality entry")
        if row["ImageRegion"] is None:
            print("File: " + file + " is missing ImageRegion entry")
            serious_problems += 1
        if row["Phase"] is None and phase_problems:
            serious_problems += 1
            print("File: " + file + " is missing Phase entry")

        # Heart specific entries
        if "heart" in file:
            if row["WholeHeart"] is None:
                print("File: " + file + " is missing WholeHeart entry")
                serious_problems += 1
            for column in ("WholeAorta", "ContrastLeftHeart", "ContrastRightHeart"):
                if row[column] is None:
                    soft_problems += 1
                    if not serious_problems_only:
                        print("File: " + file + " is missing " + column + " entry")
            if "cropped" in file and row["Cropped"] != "yes":
                soft_problems += 1
                if not serious_problems_only:
                    print("File: " + file + " is missing Cropped entry")

        # Thorax specific entries
        if "thorax" in file and row["Thorax"] is None:
            print("File: " + file + " is missing Thorax entry")
            serious_problems += 1

        # RA_SVC_IVC specific entries
        if "RA_SVC_IVC" in file and row["RA_SVC_IVC"] is None:
            print("File: " + file + " is missing RA_SVC_IVC entry")
            serious_problems += 1

        #####################################################################################################
        # Now check if the data is usable for deep learning purposes:
        # the image and its label map must share affine matrix and shape.
        image_name = file.replace("-label", "")
        for suffix in structure_suffixes:
            image_name = image_name.replace(suffix, "")

        try:
            # Construction is inside the try block so that a missing or unreadable
            # file is reported for this file instead of aborting the whole run.
            image = tio.ScalarImage(os.path.join(dl_data_dir, image_name))
            labelmap = tio.LabelMap(os.path.join(dl_data_dir, file))

            if (image.affine != labelmap.affine).any():
                print("Affine matrices do not match for file: " + file)
                serious_problems += 1
            if image.shape != labelmap.shape:
                print("Labelmap and Image do not have the same size for file: " + file)
                serious_problems += 1
        except Exception as error:
            # `Exception` (not a bare except) keeps the kernel interruptible.
            print("Could not compare image and labelmap for file " + file + ": " + repr(error))
            serious_problems += 1
        #####################################################################################################


print("")
print("Found " + str(serious_problems) + " serious problems in v-patients/database-dl.")
print("Found " + str(soft_problems) + " soft problems in v-patients/database-dl.")

Please check series and frame number for: pat353-ser_merged-frame080-heart-label.nii.gz
Please check series and frame number for: pat353-ser_merged-frame030-heart-label.nii.gz
Please check series and frame number for: pat353-ser_merged-frame080-thorax-label.nii.gz
Please check series and frame number for: pat353-ser_merged-frame030-thorax-label.nii.gz
File: pat707-ser301-frame000-heart-label.nii.gz is missing WholeHeart entry
Affine matrices do not match for file: pat720-ser004-frame078-RA_SVC_IVC_CS-label.nii.gz
Labelmap and Image do not have the same size for file: pat720-ser004-frame078-RA_SVC_IVC_CS-label.nii.gz
Affine matrices do not match for file: pat722-ser004-frame085-RA_SVC_IVC_CS-label.nii.gz
Labelmap and Image do not have the same size for file: pat722-ser004-frame085-RA_SVC_IVC_CS-label.nii.gz
File: pat840-ser009-RA_SVC_IVC-label.nii.gz is missing RA_SVC_IVC entry
File: pat842-ser014-RA_SVC_IVC-label.nii.gz is missing RA_SVC_IVC entry
File: pat843-ser009-RA_SVC_IVC-label.n

### Additional check: can every RA_SVC_IVC label map be loaded, and does it contain the RA_SVC_IVC label?

In [29]:

# For every RA_SVC* STL model in a patient's dec_models folder, load the
# corresponding label map from dl-data and report label maps that cannot be
# loaded or that appear to lack the RA_SVC_IVC label.
patients = sorted(os.listdir(root_dir_vpatients))

# NOTE(review): `valid` is never filled or read in this cell; it is kept only
# in case another cell relies on the name existing.
valid = []

# STL model that is deliberately excluded from this check.
skipped_model_file = "pat692-ser303-frame075-RA_SVC_IVC.stl"

# A label map with exactly this many distinct values is reported as missing RA_SVC_IVC.
# NOTE(review): presumably background plus the four other structures - confirm.
label_count_without_ra_svc_ivc = 5

for patient in patients:
    files = os.listdir(os.path.join(root_dir_vpatients, patient, "dec_models"))
    for file in files:
        if "-RA_SVC" not in file or file == skipped_model_file:
            continue

        label_path = os.path.join(
            root_dir_vpatients, patient, "dl-data", file.replace(".stl", "-label.nii.gz")
        )
        try:
            labelmap = tio.LabelMap(label_path)[tio.DATA]
            if len(np.unique(labelmap)) == label_count_without_ra_svc_ivc:
                print("RA_SVC_IVC not in labelmap for file " + file)
        except Exception as error:
            # `Exception` (not a bare except) keeps the kernel interruptible;
            # the error is shown so the cause (missing file, corrupt data, ...) is visible.
            print("cant load labelmap for file " + file + ": " + repr(error))

cant load labelmap for file pat707-ser301-frame000-RA_SVC_IVC_CS.stl
cant load labelmap for file pat732-ser007-frame075-RA_SVC_IVC_CS.stl
cant load labelmap for file pat746-ser011-frame075-RA_SVC_IVC.stl
cant load labelmap for file pat947-ser005-frame079-RA_SVC_IVC.stl
cant load labelmap for file pat947-ser006-frame032-RA_SVC_IVC.stl
cant load labelmap for file pat948-ser005-frame000-RA_SVC_IVC.stl
cant load labelmap for file pat948-ser005-frame100-RA_SVC_IVC.stl
