In [2]:
import sys
sys.path.append('/home/qte4288/Projects/Tumor-Risk-Prediction/scripts')
from setup import create_experiment_folders, create_model_folders

In [8]:
import pandas as pd
import os
from IPython.display import display, HTML
import os
import nibabel as nib
import pandas as pd
from datetime import datetime


In [30]:
# setup project directory
dir_base = '/data/qte4288/Tumor-Risk-Prediction/'

dir_data = os.path.join(dir_base, 'data/sample')
pth_metadata = os.path.join(dir_data, 'metadata.csv')
pth_metadata_mri = os.path.join(dir_data, 'metadata_mri.csv')


In [18]:
df_metadata = pd.read_csv(pth_metadata, dtype={'scan_folder': str})
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   patient_id   18 non-null     object
 1   time         18 non-null     object
 2   scan_folder  18 non-null     object
 3   label        18 non-null     bool  
dtypes: bool(1), object(3)
memory usage: 582.0+ bytes


In [19]:
df_metadata

Unnamed: 0,patient_id,time,scan_folder,label
0,PT0001,13-03-19,1,True
1,PT0001,13-03-20,2,False
2,PT0001,13-03-21,3,False
3,PT0001,13-03-22,4,False
4,PT0001,13-03-23,5,True
5,PT0001,13-03-24,6,True
6,PT0002,24-09-10,1,True
7,PT0002,24-09-11,2,False
8,PT0002,24-09-12,3,False
9,PT0002,24-09-13,4,False


In [31]:
def read_metadata(pth_metadata, base_dir):
    df_metadata = pd.read_csv(pth_metadata, dtype={'scan_folder': str})
    
    # Convert label to numerical
    df_metadata['label'] = df_metadata['label'].apply(lambda x: 1 if x == 'TRUE' else 0)
    
    # Convert time to datetime
    df_metadata['time'] = pd.to_datetime(df_metadata['time'], format='%d-%m-%y')
    
    # Calculate days since first scan
    df_metadata['days'] = df_metadata.groupby('patient_id')['time'].transform(lambda x: (x - x.min()).dt.days)
    
    # Create dir_scan column
    df_metadata['dir_scan'] = df_metadata.apply(
        lambda row: os.path.join(base_dir, row['patient_id'], row['scan_folder']), axis=1
    )
    
    return df_metadata

In [35]:
def load_scan(scan_path):
    """Load an individual MRI scan using nibabel."""
    return nib.load(scan_path).get_fdata()

def load_patient_data(dir_scan):
    """Load all MRI scans for a given scan directory."""
    scans = {}
    for scan in os.listdir(dir_scan):
        scan_type = os.path.splitext(scan)[0]
        scan_path = os.path.join(dir_scan, scan)
        scans[scan_type] = load_scan(scan_path)
    return scans

def load_all_data(metadata):
    """Load data for all patients based on the metadata CSV."""
    data = {}
    for _, row in metadata.iterrows():
        patient_id = row['patient_id']
        if patient_id not in data:
            data[patient_id] = []
        data[patient_id].append(load_patient_data(row['dir_scan']))
    return data

def create_metadata_mri_df(metadata):
    """Create a new DataFrame with MRI scan types included."""
    new_rows = []
    
    for _, row in metadata.iterrows():
        scan_dir = row['dir_scan']
        if os.path.exists(scan_dir):
            for scan_file in os.listdir(scan_dir):
                scan_type = os.path.splitext(scan_file)[0]
                scan_type = scan_type.split('_')[0]  # Extract the base scan type (e.g., 'FLAIR' from 'FLAIR_1.nii')
                new_row = row.copy()
                new_row['mri'] = scan_type
                new_rows.append(new_row)
    
    new_df = pd.DataFrame(new_rows)
    
    # Reorder columns
    cols = list(new_df.columns)
    cols.remove('dir_scan')
    cols.remove('label')
    cols.append('label')
    cols.append('dir_scan')
    new_df = new_df[cols]
    
    return new_df

def save_metadata_mri_df(df, save_path):
    """Save the new metadata DataFrame with MRI scan types to a CSV file."""
    df.to_csv(save_path, index=False)

In [36]:
# Process data
df_metadata = read_metadata(pth_metadata, dir_data)
df_metadata_mri = create_metadata_mri_df(df_metadata)

# Save the new metadata DataFrame with MRI scan types
save_metadata_mri_df(df_metadata_mri, pth_metadata_mri)

# Output to verify data loading
print(df_metadata_mri.head())
print(f"Saved metadata with MRI scan types to {pth_metadata_mri}")

  patient_id       time scan_folder  days    mri  label  \
0     PT0001 2019-03-13          01     0  FLAIR      0   
0     PT0001 2019-03-13          01     0   T1CE      0   
0     PT0001 2019-03-13          01     0   T1CE      0   
0     PT0001 2019-03-13          01     0     T1      0   
0     PT0001 2019-03-13          01     0     T2      0   

                                            dir_scan  
0  /data/qte4288/Tumor-Risk-Prediction/data/sampl...  
0  /data/qte4288/Tumor-Risk-Prediction/data/sampl...  
0  /data/qte4288/Tumor-Risk-Prediction/data/sampl...  
0  /data/qte4288/Tumor-Risk-Prediction/data/sampl...  
0  /data/qte4288/Tumor-Risk-Prediction/data/sampl...  
Saved metadata with MRI scan types to /data/qte4288/Tumor-Risk-Prediction/data/sample/metadata_mri.csv
