## Creating a dataframe with ID -> Label

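This notebook walks a root directory containing one folder per patient, scans the text (`txt`) annotation files inside each folder for the keyword "fibrosis", and writes the resulting patient ID → label table (1 = keyword found, 0 = not found) to a CSV file.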


Imports

In [2]:
import os
import pandas as pd
import pickle

# Export dataframe as .csv if only ID -> Label, .pkl otherwise

In [4]:
# Setting variables

# Root directory that holds one sub-folder per patient.
# The location differs between machines, so it can be overridden through the
# ILD_DB_ROOT environment variable instead of editing this cell
# (e.g. ILD_DB_ROOT=C:\Users\compe\Desktop\Main). When the variable is not
# set, the original network-drive path is used.
root_dir = os.environ.get('ILD_DB_ROOT', 'X:\\RafaelAndre\\MedGIFT\\ILD_DB_txtROIs')

In [5]:
# Function to correctly update dictionary

def update_feature_label(featureLabel, id, num):
    """Store ``num`` under ``id`` unless a non-zero label is already recorded.

    A key that is missing, or whose stored value equals 0, is (over)written
    with ``num``. Any other stored value is left untouched, so once a key
    holds a non-zero label later calls cannot reset it.

    Parameters
    ----------
    featureLabel : dict
        Mapping that is modified in place.
    id : hashable
        Key to update (the name shadows the builtin ``id`` but is kept so
        existing callers keep working).
    num : int
        Label to store.

    Returns
    -------
    None
    """
    already_stored = featureLabel.get(id, 0)
    if already_stored != 0:
        # A non-zero label is final; nothing to do.
        return
    featureLabel[id] = num

In [6]:
# Identify every folder available

# Keep only the sub-directories of root_dir (plain files are skipped).
# os.listdir makes no promise about ordering, so the names are sorted to keep
# the row order of the exported table the same on every machine.
folderIdList = sorted(
    name for name in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, name))
)
print(folderIdList)

# TODO: if a validation folder is present it should be removed from this list
# and processed separately with the same logic.
# NOTE(review): the recorded output of this cell includes the non-numeric
# folder 'HRCT_pilot', which is then treated like a patient ID - confirm that
# this is intended.

['101', '105', '107', '108', '109', '112', '116', '118', '119', '12', '120', '121', '122', '123', '124', '126', '127', '128', '129', '130', '131', '132', '134', '135', '136', '137', '138', '140', '142', '143', '144', '147', '149', '150', '152', '153', '154', '155', '157', '158', '159', '160', '162', '163', '164', '165', '166', '167', '168', '169', '17', '171', '172', '173', '174', '175', '177', '179', '180', '181', '182', '183', '184', '185', '19', '21', '23', '3', '34', '35', '36', '37', '38', '39', '41', '45', '46', '47', '48', '51', '53', '56', '57', '62', '65', '66', '7', '70', '74', '76', '77', '78', '8', '80', '81', '82', '83', '84', '89', '90', '92', '94', 'HRCT_pilot']


In [7]:
# File analysis loop

def iterateFiles(folderIdList, featureLabel, verbose=0, base_dir=None):
    """Label each patient folder by scanning its ``.txt`` files for "fibrosis".

    Every folder named in ``folderIdList`` is walked recursively with
    ``os.walk``. Each ``.txt`` file found is read, and the result (1 if the
    text contains the case-sensitive keyword "fibrosis", otherwise 0) is
    passed to ``update_feature_label`` under the top-level folder name.
    Files in nested scan folders are therefore attributed to the patient
    folder, not to the individual scan.

    Parameters
    ----------
    folderIdList : iterable of str
        Names of the folders located directly under the base directory.
    featureLabel : dict
        Mapping of folder name -> label; updated in place.
    verbose : int or bool, optional
        When truthy, print every directory visited and every file selected.
    base_dir : str, optional
        Directory that contains the folders. Defaults to the notebook-level
        ``root_dir`` variable.

    Returns
    -------
    dict
        The same ``featureLabel`` object that was passed in. Folders that
        contain no ``.txt`` file get no entry.
    """
    if base_dir is None:
        base_dir = root_dir

    for patientID in folderIdList:
        # os.walk also descends into nested folders; every file found below
        # the patient folder counts towards that patient (simple, although
        # inefficient).
        for dirpath, _dirnames, filenames in os.walk(os.path.join(base_dir, patientID)):
            if verbose: print(f"Current directory ID: {patientID}")

            for filename in filenames:
                # Select by extension only: a plain substring test would also
                # pick up files that merely contain "txt" in their name.
                if not filename.lower().endswith('.txt'):
                    continue
                if verbose: print(f"Selected: {filename}")

                file_path = os.path.join(dirpath, filename)
                # Only an ASCII keyword is searched for, so undecodable bytes
                # are replaced instead of aborting the whole run.
                with open(file_path, 'r', errors='replace') as handle:
                    content = handle.read()

                # If it contains keyword "fibrosis" set label to 1, otherwise 0
                has_fibrosis = 1 if "fibrosis" in content else 0
                update_feature_label(featureLabel, patientID, has_fibrosis)

    return featureLabel

In [8]:
# Build dictionary (guarantees correct assignment)

# Name of the validation folder; defined here but not used by this cell.
validationFolder = "val"

# Start from an empty mapping; iterateFiles fills it in place and returns it.
# verbose=1 prints every visited directory and selected file.
featureLabel = iterateFiles(folderIdList, {}, verbose=1)


Current directory ID: 101
Selected: CT-HR-0002.txt
Current directory ID: 105
Selected: CT-INSPIRIUM-5662.txt
Current directory ID: 107
Selected: CT-INSPIRIUM-3684.txt
Current directory ID: 108
Selected: CT-INSPIRIUM-6415.txt
Current directory ID: 109
Selected: CT-series-6216.txt
Current directory ID: 112
Selected: CT-INSPIRIUM-0916.txt
Current directory ID: 116
Selected: CT-INSPIRIUM-2148.txt
Current directory ID: 118
Selected: CT-INSPIRIUM-0002.txt
Current directory ID: 119
Selected: CT-INSPIRIUM-6238.txt
Current directory ID: 12
Selected: CT-INSPIRIUM-0921.txt
Current directory ID: 120
Selected: CT-SANS-IV-0002.txt
Current directory ID: 121
Selected: CT-INSPIRIUM-0844.txt
Current directory ID: 122
Selected: CT-INSPIRIUM-0002.txt
Current directory ID: 123
Selected: CT--0002.txt
Current directory ID: 124
Selected: CT-INSPIRIUM-2051.txt
Current directory ID: 126
Selected: CT-INSPIRIUM-8852.txt
Current directory ID: 127
Selected: CT-0002.txt
Current directory ID: 128
Selected: CT-Thx-HR-

In [9]:
# Data processing verification

# Keys and values of a dict iterate in the same order, so X[i] pairs with y[i].
X = list(featureLabel)
y = list(featureLabel.values())

print(X, y, sep="\n")

['101', '105', '107', '108', '109', '112', '116', '118', '119', '12', '120', '121', '122', '123', '124', '126', '127', '128', '129', '130', '131', '132', '134', '135', '136', '137', '138', '140', '142', '143', '144', '147', '149', '150', '152', '153', '154', '155', '157', '158', '159', '160', '162', '163', '164', '165', '166', '167', '168', '169', '17', '171', '172', '173', '174', '175', '177', '179', '180', '181', '182', '183', '184', '185', '19', '21', '23', '3', '34', '35', '36', '37', '38', '39', '41', '45', '46', '47', '48', '51', '53', '56', '57', '62', '65', '66', '7', '70', '74', '76', '77', '78', '8', '80', '81', '82', '83', '84', '89', '90', '92', '94', 'HRCT_pilot']
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]


In [10]:
# Creates dataframe

# One row per folder ID, with its 0/1 label in the 'Class' column.
df = pd.DataFrame(X, columns=['PatientID']).assign(Class=y)

In [11]:
# Visualizing dataframe and contents: preview the first five rows

df.head(5)

Unnamed: 0,PatientID,Class
0,101,1
1,105,0
2,107,0
3,108,0
4,109,0


In [12]:
# Column names, non-null counts and dtypes of the table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PatientID  103 non-null    object
 1   Class      103 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [13]:
# Summary statistics of the numeric 'Class' column; since the labels are 0/1,
# the mean is the share of rows labelled 1
df.describe()

Unnamed: 0,Class
count,103.0
mean,0.349515
std,0.479148
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [14]:
# Write the ID -> label table as comma-separated text, without the index column.
# NOTE(review): the output name "id_label" has no ".csv" extension - confirm
# that downstream readers expect exactly this file name before renaming it.
df.to_csv("id_label", index=False)