# Imports

In [20]:
import os
import pandas as pd
import re

# Utilities

In [44]:
def extract_metadata(base_dir):
    """Build a metadata table of every ``.nii.gz`` file found under ``base_dir``.

    The directory tree is walked recursively. For each matching file the
    patient ID and appointment number are taken from the two directory names
    directly above the file (``.../<PatientID>/<AppointmentNumber>/<file>``),
    and the scan type and instance are parsed from the file name, which is
    expected to look like ``<ScanType>[_<Instance>].nii.gz``.

    An instance suffix must start with a digit (``_1``, ``_1a``, ``_12b``).
    This keeps multi-part scan types without a suffix intact, e.g.
    ``T1CE_GE.nii.gz`` is scan type ``T1CE_GE`` with instance ``'0'`` rather
    than scan type ``T1CE`` with instance ``GE``.

    Parameters
    ----------
    base_dir : str
        Root directory to search.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``PatientID``, ``AppointmentNumber``,
        ``ScanType``, ``Instance`` and ``FilePath``, sorted by the first four
        and re-indexed from 0. ``Instance`` is ``'0'`` when the file name has
        no instance suffix; ``ScanType`` is ``'Unknown'`` when the file name
        does not match the expected pattern. If no files are found, an empty
        frame with the same columns is returned.
    """
    columns = ["PatientID", "AppointmentNumber", "ScanType", "Instance", "FilePath"]

    # Scan type is matched lazily so that a trailing "_<digit...>" chunk, when
    # present, is captured as the instance instead of being absorbed into it.
    name_pattern = re.compile(r'([A-Za-z0-9_]+?)(?:_([0-9][0-9a-zA-Z]*))?\.nii\.gz')

    data = []
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            if not file.endswith(".nii.gz"):
                continue

            file_path = os.path.join(root, file)
            path_parts = file_path.split(os.sep)
            # The two directory levels directly above the file.
            patient_id = path_parts[-3]
            appointment_number = path_parts[-2]

            # Extract the scan type and instance number (including cases like _1a, _1b)
            match = name_pattern.match(file)
            if match:
                scan_type = match.group(1)
                instance = match.group(2) if match.group(2) else '0'
            else:
                scan_type = 'Unknown'
                instance = '0'

            data.append({
                "PatientID": patient_id,
                "AppointmentNumber": appointment_number,
                "ScanType": scan_type,
                "Instance": instance,
                "FilePath": file_path
            })

    # Passing the columns explicitly keeps the frame well-formed (and the code
    # below working) even when no matching files were found.
    df = pd.DataFrame(data, columns=columns)
    # Ensure AppointmentNumber is stored as an object (string)
    df['AppointmentNumber'] = df['AppointmentNumber'].astype(str)
    # Sort by PatientID, AppointmentNumber, ScanType, and Instance. All four are
    # strings, so the ordering is lexicographic (e.g. '10' sorts before '2').
    df = df.sort_values(by=['PatientID', 'AppointmentNumber', 'ScanType', 'Instance']).reset_index(drop=True)
    return df

# Dataset

In [3]:
# Set up project directories.
# dir_base is the project root; dir_data is the 'data/NIFTI' folder beneath it.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on another system.
dir_base = '/data/qte4288/Tumor-Risk-Prediction/'
dir_data = os.path.join(dir_base, 'data/NIFTI')

In [45]:
# Build the metadata table by scanning the NIFTI directory tree.
metadata_df = extract_metadata(dir_data)
# Print the column dtypes and non-null counts, then show the first rows as the
# cell's displayed output.
metadata_df.info()
metadata_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3917 entries, 0 to 3916
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          3917 non-null   object
 1   AppointmentNumber  3917 non-null   object
 2   ScanType           3917 non-null   object
 3   Instance           3917 non-null   object
 4   FilePath           3917 non-null   object
dtypes: object(5)
memory usage: 153.1+ KB


Unnamed: 0,PatientID,AppointmentNumber,ScanType,Instance,FilePath
0,PT0001,1,FLAIR,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
1,PT0001,1,T1CE_GE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
2,PT0001,1,T1CE_SE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
3,PT0001,1,T1_SE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
4,PT0001,1,T2,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...


In [46]:
# Display the table as the cell output (pandas truncates the rendering of long frames).
metadata_df

Unnamed: 0,PatientID,AppointmentNumber,ScanType,Instance,FilePath
0,PT0001,01,FLAIR,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
1,PT0001,01,T1CE_GE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
2,PT0001,01,T1CE_SE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
3,PT0001,01,T1_SE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
4,PT0001,01,T2,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
...,...,...,...,...,...
3912,PT0096,07,FLAIR,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
3913,PT0096,07,T1,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
3914,PT0096,07,T1CE_GE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...
3915,PT0096,07,T1_GE,1,/data/qte4288/Tumor-Risk-Prediction/data/NIFTI...


In [47]:
# Save the metadata table to a CSV file if needed; index=False leaves the
# row index out of the written file.
# NOTE(review): the destination is a hard-coded absolute path under /home,
# unlike the /data root used for dir_base — confirm this is the intended location.
metadata_df.to_csv("/home/qte4288/Projects/Tumor-Risk-Prediction/data/metadata.csv", index=False)