# Read excel files

In [1]:
import os
import re
import glob
import numpy as np
import pandas as pd
from pydicom.filereader import dcmread

In this notebook we show how to read the excel files containing all the information of the GT segmentations.

In [2]:
output_folder = "../../lvnc-dataset/original-data"
min_score = 4

In [3]:
# excel_scores = "../../lvnc-dataset/resumen-datos-qlvthc.xlsx"
# excel_outputs = "../../lvnc-dataset/salidas-datos-qlvthc.xlsx"
excel_scores = "../../lvnc-dataset/resumen-datos-npP1-P307-v3.xlsx"
excel_outputs = "../../lvnc-dataset/salidas-datosP1-P307-v3.xlsx"

We define the criteria to discard or keep a given slice:

In [4]:
def convert_cell(x, first_char: str):
    """
    Function to process and transform the cells of the column containing the patients or the slices
    """
    if isinstance(x, str):
        if len(x)>0:
            if x[0]==first_char:
                if "®" in x:
                    return x.split(" ")[0] + "-REVERSED"
                return x.split(" ")[0]
    return np.NaN   

In [5]:
def read_excel_scores(sheet_name: str, first_char_patient: str):
    df_excel = pd.read_excel(excel_scores, sheet_name=sheet_name, skiprows=17, usecols="A,C,D",
                             header=None, names=['patient', 'slice', 'score'], #index_col=[0,1],
                             converters={
                                 'patient': lambda x: convert_cell(x, first_char_patient),
                                 'slice': lambda x: convert_cell(x, "c")
                             })#.reset_index()
    df_excel['patient'] = df_excel['patient'].ffill()
    df_excel = df_excel.dropna(subset=['slice'])
    print(f"Total number of slices: {len(df_excel)}")
    df_excel["slice"] = df_excel["slice"].apply(lambda x: int(x[1:])) # Remove initial 'c'
    # Invert the order of "reversed" slices
    aux_dfs = []
    for patient, group in df_excel.groupby('patient'):
        if "-REVERSED" in patient:
            n_patient = patient.replace("-REVERSED", "")
            max_slice = max(group["slice"])
            group["patient"]=n_patient
            #group["slice"]=group["slice"].apply(lambda x: 1 + max_slice - x)
            #group.sort_values(by=["slice"], inplace=True)
            group["reversed"] = True
        else:
            group["reversed"] = False
            
        aux_dfs.append(group)
    
    df_excel = pd.concat(aux_dfs)
        
    # Remove slices with NaN or non-numeric score
    # https://stackoverflow.com/questions/55722919/python-how-to-drop-all-the-non-numeric-values-from-a-pandas-column
    df_excel = df_excel[pd.to_numeric(df_excel['score'],errors='coerce').notna()]
    print(f"Slices with not NaN score: {len(df_excel)}")
    return df_excel 

In [6]:
def read_excel_outputs(sheet_name: str, first_char_patient: str):
    df_excel = pd.read_excel(excel_outputs, sheet_name=sheet_name, skiprows=16, 
                             usecols="A,B,E", header=None, names=['patient', 'slice', 'pta'],
                             converters={
                                 'pta': lambda x: x.split(" ")[0] if isinstance(x, str) else 100*x,
                                 'patient': lambda x: convert_cell(x, first_char_patient),
                                 'slice': lambda x: convert_cell(x, "c")
                             })
    df_excel['patient'] = df_excel['patient'].ffill()
    df_excel = df_excel.dropna(subset=['slice'])
    print(f"Total number of slices: {len(df_excel)}")
    df_excel["slice"] = df_excel["slice"].apply(lambda x: int(x[1:])) # Remove initial 'c'
    # Remove slices with NaN PTA
    df_excel = df_excel.dropna(subset=['pta'])
    print(f"Slices with not NaN PTA: {len(df_excel)}")
    return df_excel

## Patients P

In [7]:
print("Reading excel with scores")
df_scores_p = read_excel_scores("P1-P307", "P").set_index(["patient", "slice"])
print("Reading excel with PTA%")
df_output_p = read_excel_outputs("P1-P307", "P").set_index(["patient", "slice"])
#df_p = pd.merge(df_scores_p, df_scores_p, on=['patient','slice'], how='outer', indicator=True)
df_p = pd.merge(df_scores_p, df_output_p, on=['patient','slice'], how='outer', indicator=True)
df_p["set"] = "HCM"

Reading excel with scores
Total number of slices: 2939
Slices with not NaN score: 2438
Reading excel with PTA%
Total number of slices: 2934
Slices with not NaN PTA: 2452


In [8]:
df_p

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,4,False,48.66,both,HCM
P1,2,5,False,31.42,both,HCM
P1,3,5,False,27.58,both,HCM
P1,4,4,False,31.57,both,HCM
P1,5,5,False,33.27,both,HCM
...,...,...,...,...,...,...
P251,5,,,2036,right_only,HCM
P251,6,,,2226,right_only,HCM
P260,7,,,2829,right_only,HCM
P260,12,,,3915,right_only,HCM


Data per patient:

In [9]:
df_p.groupby('patient').count()

Unnamed: 0_level_0,score,reversed,pta,_merge,set
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,7,7,7,7,7
P10,9,9,9,9,9
P100,6,6,6,6,6
P101,8,8,8,8,8
P102,7,7,7,7,7
...,...,...,...,...,...
P95,8,8,8,8,8
P96,8,8,8,8,8
P97,9,9,9,9,9
P98,9,9,8,9,9


Some slices are present in one of the excel files but not in the other. That is, some slices have PTA but no score or viceversa... It looks like a problem reading the excel file.

In [10]:
print(sum(df_p['_merge'] == 'right_only'))
print(sum(df_p['_merge'] == 'left_only'))

22
8


In [11]:
df_p.loc[df_p['_merge'] == 'right_only']

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P5,10,,,41.98,right_only,HCM
P6,1,,,52.14,right_only,HCM
P29,3,,,7.01,right_only,HCM
P56,10,,,7.22,right_only,HCM
P61,1,,,34.47,right_only,HCM
P67,1,,,22.6,right_only,HCM
P250,1,,,3804.0,right_only,HCM
P250,4,,,5860.0,right_only,HCM
P250,5,,,4261.0,right_only,HCM
P250,6,,,4030.0,right_only,HCM


In [12]:
df_p.loc[df_p['_merge'] == 'left_only']

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P152,1,5,False,,left_only,HCM
P17,1,5,False,,left_only,HCM
P18,1,5,False,,left_only,HCM
P24,1,5,False,,left_only,HCM
P25,1,5,False,,left_only,HCM
P27,10,5,False,,left_only,HCM
P277,13,5,False,,left_only,HCM
P98,1,5,False,,left_only,HCM


We need to discard some slices or add their score based on other data not available in the excel files we are using:

In [13]:
# Discard all the slices for which we do not have PTA. The score for this slices appears due to an error.
df_p = df_p[df_p['_merge']!='left_only']
# Discard some slices for which we only have the PTA score
df_p = df_p.drop(["P250", "P251", "P260"], level='patient', errors='ignore')
# Manually set other values
df_p.loc[('P5', 10), 'score'] = 5
df_p.loc[('P6', 1), 'score'] = 5
df_p.loc[('P29', 3), 'score'] = 5
df_p.loc[('P56', 10), 'score'] = 5
df_p.loc[('P61', 1), 'score'] = 4
df_p.loc[('P67', 1), 'score'] = 5

# Now we have to solve some issues due to excel creation problems


In [14]:
df_p.loc[df_p['_merge'] == 'right_only']

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P5,10,5,,41.98,right_only,HCM
P6,1,5,,52.14,right_only,HCM
P29,3,5,,7.01,right_only,HCM
P56,10,5,,7.22,right_only,HCM
P61,1,4,,34.47,right_only,HCM
P67,1,5,,22.6,right_only,HCM


In [15]:
df_p.loc[pd.IndexSlice["P48", :], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P48,1,5,False,29.72,both,HCM
P48,2,5,False,27.95,both,HCM
P48,3,5,False,30.96,both,HCM
P48,4,5,False,30.29,both,HCM
P48,5,5,False,26.38,both,HCM
P48,6,5,False,28.2,both,HCM


## Patients X

In [16]:
print("Reading excel with scores")
df_scores_x = read_excel_scores("X1-X67", "X").set_index(["patient", "slice"])
print("Reading excel with PTA%")
df_output_x = read_excel_outputs("X1-X67", "X").set_index(["patient", "slice"])
df_x = pd.merge(df_scores_x, df_output_x, on=['patient','slice'], how='outer', indicator=True)
df_x["set"]="X"

Reading excel with scores
Total number of slices: 564
Slices with not NaN score: 470
Reading excel with PTA%
Total number of slices: 667
Slices with not NaN PTA: 470


In [17]:
df_x

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
X1,1,5.0,False,37.73,both,X
X1,2,5.0,False,33.88,both,X
X1,3,5.0,False,31.78,both,X
X1,4,5.0,False,36.64,both,X
X1,5,5.0,False,38.80,both,X
...,...,...,...,...,...,...
X9,4,5.0,False,35.32,both,X
X9,5,5.0,False,30.86,both,X
X9,6,5.0,False,26.88,both,X
X9,7,5.0,False,25.68,both,X


Data per patient

In [18]:
df_x.groupby('patient').count()

Unnamed: 0_level_0,score,reversed,pta,_merge,set
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X1,7,7,7,7,7
X11,10,10,10,10,10
X12,8,8,8,8,8
X14,9,9,9,9,9
X15,8,8,8,8,8
X16,8,8,8,8,8
X17,8,8,8,8,8
X18,7,7,7,7,7
X19,8,8,8,8,8
X2,10,10,10,10,10


In [19]:
print(sum(df_x['_merge'] == 'right_only'))
print(sum(df_x['_merge'] == 'left_only'))

0
0


In [20]:
df_x[df_x['_merge'] == 'left_only']

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


We need to discard some slices or add their score based on other data not available in the excel files we are using:

In [21]:
df_x[df_x.isna().any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


## Patients ValldHebron

In [22]:
print("Reading excel with scores")
df_scores_h = read_excel_scores("ValldHbron", "P")
print("Reading excel with PTA%")
df_output_h = read_excel_outputs("ValldHbron", "P")
df_h = pd.merge(df_scores_h, df_output_h, on=['patient','slice'], how='outer', indicator=True)
df_h["set"]="Hebron"
df_h["patient"]=df_h["patient"].apply(lambda x: x.split("-")[1])
df_h = df_h.set_index(["patient", "slice"])

Reading excel with scores
Total number of slices: 283
Slices with not NaN score: 198
Reading excel with PTA%
Total number of slices: 292
Slices with not NaN PTA: 199


In [23]:
df_h

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ANH,4,5,False,4692,both,Hebron
ANH,5,5,False,4933,both,Hebron
ANH,6,4.5,False,3939,both,Hebron
ANH,7,4.5,False,3708,both,Hebron
ANH,8,5,False,3522,both,Hebron
...,...,...,...,...,...,...
VPM,4,5,False,3824,both,Hebron
VPM,5,5,False,2197,both,Hebron
VPM,6,5,False,2618,both,Hebron
VPM,7,5,False,2349,both,Hebron


Data per patient

In [24]:
df_h.groupby('patient').count()

Unnamed: 0_level_0,score,reversed,pta,_merge,set
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ANH,6,6,6,6,6
BPM,8,8,8,8,8
CAC,11,11,11,11,11
CAP,7,7,7,7,7
CHG,8,8,8,8,8
DGA,10,10,10,10,10
FAAJ,9,9,9,9,9
FDC,9,9,9,9,9
FGD,6,6,6,6,6
GAB,6,6,6,6,6


In [25]:
print(sum(df_h['_merge'] == 'right_only'))
print(sum(df_h['_merge'] == 'left_only'))

1
0


In [26]:
df_h = df_h.drop(("MJGV", 6), errors='ignore')
df_h.loc[df_h['_merge'] == 'right_only']

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [27]:
df_h.loc[df_h['_merge'] == 'left_only']

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,_merge,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


## Creating a single dataframe

In [28]:
df_excel = pd.concat([df_p, df_x, df_h])
print(f"Number of slices with possible NaN values: {len(df_excel)}")
df_excel.sort_values(by=["set", "patient", "slice"], inplace=True)
df_excel["reversed"]=df_excel.groupby("patient")["reversed"].transform(lambda x: x.any())
# TODO: esto puede fallar df_excel['reversed'] = df_excel['reversed'].ffill()
# We discard slices for which some information is lacking
df_excel = df_excel.dropna()
print(f"Number of slices: {len(df_excel)}")
# We discard slices with scores below the given threshold
df_excel = df_excel[df_excel["score"]>=min_score]
print(f"Number of slices with score greater than or equal to {min_score}: {len(df_excel)}")

Number of slices with possible NaN values: 3104
Number of slices: 3104
Number of slices with score greater than or equal to 4: 3098


In [29]:
del df_excel["_merge"]
df_excel

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,4,False,48.66,HCM
P1,2,5,False,31.42,HCM
P1,3,5,False,27.58,HCM
P1,4,4,False,31.57,HCM
P1,5,5,False,33.27,HCM
...,...,...,...,...,...
X9,4,5,False,35.32,X
X9,5,5,False,30.86,X
X9,6,5,False,26.88,X
X9,7,5,False,25.68,X


## Iterate over data folders

**Important**: given that the folder "MLSV" does not exist in the "Hebron" set, we have assumed that "MLOSV"="MLSV". Thus, we have changed the name of the folder

In [30]:
original_data_folder = "../../lvnc-dataset/original-data"

In [31]:
def get_dicom_paths(patient_folder: str, set_name: str, patient_name: str):
    """

    :param exec_file: Path to "Contador.exe" file
    :param patient_folder:
    :param set_name: Name to identify the set images that is being processed. It would be added to the generated DataFrame
    :param patient_name: 
    :return: Dataframe in which each row contains the dicam path of a given patient and slice
    """
    
    if set_name == "HCM":
        regex_slice_number = r"(.*)I(?P<slice>\d{1,2})(.*)\.dcm"
    elif set_name=="X":
        regex_slice_number = r"X(?P<patient>\d{1,2})I(?P<slice>\d{1,2})(.*)\.dcm"
    elif set_name=="Hebron":
        regex_slice_number = r"(.*)(?P<slice>\d{2})\.dcm"

    results = []
    # Iterate over files in the patients folder
    files = os.listdir(patient_folder)
    for filename in files:
        if filename.endswith(".dcm"):
            if filename.replace(".dcm", "Params.txt") in files:
                re_file_match = re.search(regex_slice_number, filename)
                if re_file_match:
                    slc = int(re_file_match.group("slice"))
                    file_abs_path = os.path.abspath(os.path.join(patient_folder, filename))

                    new_record = {
                        "patient": patient_name,
                        "slice": slc,
                        "set": set_name,
                        "dicom_path": file_abs_path
                    }
                    results.append(new_record)

                else:
                    # print("File {} does not match the indicated regex".format(filename))
                    pass
            else:
                # print("Output file {} could not be found".format(filename.replace(".dcm", "Sal.jpg")))
                pass
    if results:
        return pd.DataFrame(results).set_index(["patient", "slice"])
    else:
        return None

Rename `X51`, `P1` files to avoid problems:

In [32]:
x51_folder = os.path.join(original_data_folder, "X/X51")
offset = 12

files_x51 = os.listdir(x51_folder)
for f in files_x51:
    if "IMG" in f:
        str_split = f.split("_")
        pref = str_split[0]
        suf = str_split[1]
        num_slice = int(pref.split("-")[1]) - offset
        os.rename(os.path.join(x51_folder, f), os.path.join(x51_folder, f"X51I{num_slice}_{suf}"))

In [33]:
p1_folder = os.path.join(original_data_folder, "HCM/P1")

files_p1 = os.listdir(p1_folder)
for f in files_p1:
    new_name = f.replace("img800x800", "")
    os.rename(os.path.join(p1_folder, f), os.path.join(p1_folder, new_name))

for i in [2, 3, 6, 15, 22]:
    pi_folder = os.path.join(original_data_folder, f"HCM/P{i}")
    files_pi = os.listdir(pi_folder)
    for f in files_pi:
        new_name = f.replace("img", "")
        os.rename(os.path.join(pi_folder, f), os.path.join(pi_folder, new_name))

In [34]:
df_paths_aux = []

for set_name,group in df_excel.groupby('set'):
    print(f"Processing set {set_name}")
    set_folder = os.path.join(original_data_folder, set_name)
    patient_folders = [ item for item in os.listdir(set_folder) if os.path.isdir(os.path.join(set_folder, item)) ]
    for patient, slices in group.groupby('patient'):
        #print(f"-- Processing patient {patient}")
        candidates = [i for i in patient_folders if i.startswith(patient)]
        if len(candidates)==0:
            print("E: Folder not found")
            continue
        elif len(candidates)==1:
            patient_folder = os.path.join(set_folder, candidates[0])
        elif (patient+"bis") in candidates:
            patient_folder = os.path.join(set_folder, patient+"bis")
            dicom_files = get_dicom_paths(patient_folder, set_name, patient)
            if dicom_files is None:
                patient_folder = os.path.join(set_folder, patient)
        elif patient in candidates:
            patient_folder = os.path.join(set_folder, patient)
        
        # Get the list of files in the patient's folder
        dicom_files = get_dicom_paths(patient_folder, set_name, patient)
        try:
            df_merged = pd.merge(slices, dicom_files, on=['patient','slice'], how='outer', indicator=True)
            df_paths_aux.append(df_merged)
            #print(df_merged)
        except Exception as e:
            slices['dicom_path']=patient_folder
            df_paths_aux.append(slices)
            print(f"-- Patient {patient}")
            print(e)

df_paths = pd.concat(df_paths_aux)

Processing set HCM
Processing set Hebron
Processing set X


In [35]:
df_paths

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,set_x,set_y,dicom_path,_merge
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P1,1,4,False,48.66,HCM,HCM,/root/lvnc-dataset/original-data/HCM/P1/I1.dcm,both
P1,2,5,False,31.42,HCM,HCM,/root/lvnc-dataset/original-data/HCM/P1/I2.dcm,both
P1,3,5,False,27.58,HCM,HCM,/root/lvnc-dataset/original-data/HCM/P1/I3.dcm,both
P1,4,4,False,31.57,HCM,HCM,/root/lvnc-dataset/original-data/HCM/P1/I4.dcm,both
P1,5,5,False,33.27,HCM,HCM,/root/lvnc-dataset/original-data/HCM/P1/I5.dcm,both
...,...,...,...,...,...,...,...,...
X9,4,5,False,35.32,X,X,/root/lvnc-dataset/original-data/X/X9/X09I4.dcm,both
X9,5,5,False,30.86,X,X,/root/lvnc-dataset/original-data/X/X9/X09I5.dcm,both
X9,6,5,False,26.88,X,X,/root/lvnc-dataset/original-data/X/X9/X09I6.dcm,both
X9,7,5,False,25.68,X,X,/root/lvnc-dataset/original-data/X/X9/X09I7.dcm,both


There are 67 slices that are considered to be valid in the excel files but that are not present in the data folder:

In [36]:
print(sum(df_paths['_merge'] == 'left_only'))

0


In [37]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    print(df_paths[df_paths['_merge'] == 'left_only']["score"])

Series([], Name: score, dtype: object)


In [38]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    print(len(df_paths[df_paths['_merge'] == 'right_only']))
    print(df_paths[df_paths['_merge'] == 'right_only']["score"])

21
patient  slice
P158     11       NaN
P160     11       NaN
P162     10       NaN
         11       NaN
P165     9        NaN
P171     14       NaN
P172     11       NaN
         12       NaN
         13       NaN
         14       NaN
P191     10       NaN
P232     1        NaN
P34      1        NaN
P38      1        NaN
P50      6        NaN
P54      10       NaN
         9        NaN
         11       NaN
P72      4        NaN
MHSML    2        NaN
MJGV     6        NaN
Name: score, dtype: object


In [39]:
df_paths = df_paths[df_paths["_merge"]=="both"]
df_paths["set"] = df_paths["set_x"]
del df_paths["_merge"]
del df_paths["set_x"]
del df_paths["set_y"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paths["set"] = df_paths["set_x"]


In [40]:
df_paths

Unnamed: 0_level_0,Unnamed: 1_level_0,score,reversed,pta,dicom_path,set
patient,slice,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,4,False,48.66,/root/lvnc-dataset/original-data/HCM/P1/I1.dcm,HCM
P1,2,5,False,31.42,/root/lvnc-dataset/original-data/HCM/P1/I2.dcm,HCM
P1,3,5,False,27.58,/root/lvnc-dataset/original-data/HCM/P1/I3.dcm,HCM
P1,4,4,False,31.57,/root/lvnc-dataset/original-data/HCM/P1/I4.dcm,HCM
P1,5,5,False,33.27,/root/lvnc-dataset/original-data/HCM/P1/I5.dcm,HCM
...,...,...,...,...,...,...
X9,4,5,False,35.32,/root/lvnc-dataset/original-data/X/X9/X09I4.dcm,X
X9,5,5,False,30.86,/root/lvnc-dataset/original-data/X/X9/X09I5.dcm,X
X9,6,5,False,26.88,/root/lvnc-dataset/original-data/X/X9/X09I6.dcm,X
X9,7,5,False,25.68,/root/lvnc-dataset/original-data/X/X9/X09I7.dcm,X


## Save final dataframe

In [41]:
df_paths.to_pickle(os.path.join(output_folder, "df_excel.pick"))