In [15]:
import pandas as pd
import numpy as np
import pickle
import os
from dataset import TissueDataset, LungDataset
# Dataset root and prediction target shared by both LungDataset instances below.
LUNG_DATA_ROOT = "../data/Lung"
TARGET_OUTCOME = "Progression"

# One dataset object per split, selected through the `train` flag.
train_dataset = LungDataset(LUNG_DATA_ROOT, TARGET_OUTCOME, train=True)
test_dataset = LungDataset(LUNG_DATA_ROOT, TARGET_OUTCOME, train=False)

Seed set to 42
Seed set to 42


Target prediction: Progression
processed_paths ['../data/Lung/processed/data_Progression_train.pt']
Target prediction: Progression
processed_paths ['../data/Lung/processed/data_Progression_test.pt']


In [17]:
def create_stratified_cv_folds_fixed(dataset, clinical_outcome_label, n_folds=5, random_state=42):
    """
    Create class-balanced, patient-grouped stratified cross-validation folds.
    FIXED VERSION - No duplicate indices.

    The function works in three stages:
      1. Iterate over ``dataset`` and record, for every sample, its position,
         ``sample_id``, the patient ID returned by ``extract_patient_id`` and
         its outcome (``data.y``, unwrapped with ``.item()`` when available).
      2. Downsample every class that is larger than the smallest class to the
         smallest class size (sampling without replacement, seeded with
         ``random_state``).
      3. Split the *patients* of the balanced subset with ``StratifiedKFold``
         (stratified on each patient's first recorded outcome) and map the
         patient split back to sample indices, so all retained samples of a
         patient land on the same side of a fold.

    Parameters:
    -----------
    dataset : PyTorch Dataset
        Iterable of samples exposing ``sample_id`` and ``y`` attributes.
    clinical_outcome_label : str
        The clinical outcome of interest (e.g., "Progression"). Kept for
        interface compatibility; the body does not read it -- outcomes are
        taken from each sample's ``y``.
    n_folds : int, default=5
        Number of cross-validation folds
    random_state : int, default=42
        Random seed used for both the downsampling and the fold shuffling

    Returns:
    --------
    dict : Dictionary where keys are fold numbers (0 to n_folds-1) and values are
           lists containing [train_indices, validation_indices]. Indices are
           positions in ``dataset``; samples dropped by the balancing step
           appear in no fold.

    Raises:
    -------
    ValueError
        If ``dataset`` yields no samples.
    """
    # NOTE(review): `extract_patient_id` and `StratifiedKFold` are not defined or imported in
    # this block; both must already be in scope when the function is called.

    # Extract sample information
    sample_data = []
    for idx, data in enumerate(dataset):
        sample_id = data.sample_id
        sample_data.append({
            'sample_idx': idx,
            'sample_id': sample_id,
            'patient_id': extract_patient_id(sample_id),
            'outcome': data.y.item() if hasattr(data.y, 'item') else data.y
        })

    # Fail early with a clear message instead of a KeyError on the missing 'outcome' column.
    if not sample_data:
        raise ValueError("Cannot create cross-validation folds: dataset contains no samples")

    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame(sample_data)
    print(df)

    # Count class distribution
    class_counts = df['outcome'].value_counts()
    print("Original class distribution:")
    for class_val, count in class_counts.items():
        print(f"  Class {class_val}: {count} samples")

    # Find the minority class count
    minority_class_count = int(class_counts.min())
    print(f"Minority class count: {minority_class_count}")

    # Downsample every class to the minority size. Classes are visited in sorted label
    # order, so for labels {0, 1} the row order matches concatenating class 0 then class 1.
    balanced_parts = []
    for class_val in sorted(class_counts.index):
        class_samples = df[df['outcome'] == class_val]
        if len(class_samples) > minority_class_count:
            class_samples = class_samples.sample(n=minority_class_count, random_state=random_state)
        balanced_parts.append(class_samples)

    # Combine balanced samples
    balanced_df = pd.concat(balanced_parts).reset_index(drop=True)

    print("Balanced class distribution:")
    balanced_class_counts = balanced_df['outcome'].value_counts()
    for class_val, count in balanced_class_counts.items():
        print(f"  Class {class_val}: {count} samples")

    # Create patient-level stratification:
    # one row per patient, labelled with that patient's first outcome in the balanced set.
    patient_outcomes = balanced_df.groupby('patient_id')['outcome'].first().reset_index()

    # Create stratified folds at patient level
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    patient_splits = skf.split(patient_outcomes['patient_id'], patient_outcomes['outcome'])

    folds_dict = {}

    for fold_idx, (train_patient_indices, val_patient_indices) in enumerate(patient_splits):
        # Get patient IDs for this fold
        train_patients = patient_outcomes.iloc[train_patient_indices]['patient_id'].tolist()
        val_patients = patient_outcomes.iloc[val_patient_indices]['patient_id'].tolist()

        # Rows of the balanced set belonging to each side of the fold; reused below for
        # both the index lists and the per-fold statistics.
        train_rows = balanced_df[balanced_df['patient_id'].isin(train_patients)]
        val_rows = balanced_df[balanced_df['patient_id'].isin(val_patients)]

        train_sample_indices = train_rows['sample_idx'].tolist()
        val_sample_indices = val_rows['sample_idx'].tolist()

        folds_dict[fold_idx] = [train_sample_indices, val_sample_indices]

        # Print fold statistics
        print(f"\nFold {fold_idx}:")
        print(f"  Training: {len(train_sample_indices)} samples from {len(train_patients)} patients")
        print(f"  Validation: {len(val_sample_indices)} samples from {len(val_patients)} patients")
        print(f"  Training class distribution: {train_rows['outcome'].value_counts().to_dict()}")
        print(f"  Validation class distribution: {val_rows['outcome'].value_counts().to_dict()}")

    return folds_dict


In [18]:
# Class balance of the training split: samples whose first label entry equals 1
# versus every other sample.
num_of_ones = 0
num_of_zeros = 0
for data in train_dataset:
    if data["y"][0] == 1:
        num_of_ones += 1
    else:
        num_of_zeros += 1

total_samples = len(train_dataset)
print(num_of_ones, num_of_zeros)
# Fraction of each class, printed once each.
print(num_of_ones / total_samples)
print(num_of_zeros / total_samples)


236 1308
0.15284974093264247
0.8471502590673575
0.15284974093264247
0.8471502590673575


In [19]:
def validate_folds(dataset, folds):
    """
    Check each fold for patient-level leakage between training and validation.

    For every fold, the patient IDs behind the training indices and behind the
    validation indices are collected (via ``extract_patient_id`` applied to each
    sample's ``sample_id``). A fold fails if any patient appears on both sides.
    Checking stops at the first failing fold.

    Parameters:
    -----------
    dataset : PyTorch Dataset
        Indexable dataset whose items expose a ``sample_id`` attribute
    folds : dict
        Maps fold number to a ``(train_indices, val_indices)`` pair

    Returns:
    --------
    bool : True if no fold shares a patient between train and validation,
           False as soon as one does
    """
    def _patients_behind(indices):
        # Resolve each dataset position to the patient it belongs to.
        return {extract_patient_id(dataset[position].sample_id) for position in indices}

    print("Validating folds...")

    for fold_idx, (train_indices, val_indices) in folds.items():
        shared_patients = _patients_behind(train_indices).intersection(_patients_behind(val_indices))

        if not shared_patients:
            print(f"Fold {fold_idx}: No data leakage detected")
            continue

        # A patient on both sides of the split invalidates the whole fold set.
        print(f"Data leakage detected in fold {fold_idx}: {shared_patients}")
        return False

    print("All folds validated successfully")
    return True

# Test the validation function
# NOTE(review): `folds` is not assigned in any cell visible in this notebook; it must
# already exist in the kernel (presumably from an earlier fold-creation run) for this
# line to work after a restart -- confirm and define it explicitly.
validation_result = validate_folds(train_dataset, folds)


Validating folds...
Fold 0: No data leakage detected
Fold 1: No data leakage detected
Fold 2: No data leakage detected
Fold 3: No data leakage detected
Fold 4: No data leakage detected
All folds validated successfully


In [20]:
# Example usage: How to use the folds for cross-validation
def example_cross_validation_usage(dataset, folds):
    """
    Print a per-fold walkthrough of how the fold indices would feed a training loop.

    For each fold the function materialises the training and validation samples
    from ``dataset``, then prints their sizes, the number of samples labelled 0
    and 1 on each side, and the first three sample IDs of each side. Nothing is
    trained and nothing is returned.

    Parameters:
    -----------
    dataset : indexable collection of samples exposing ``y`` and ``sample_id``
    folds : dict mapping fold number to a ``(train_indices, val_indices)`` pair
    """
    def _label_of(sample):
        # Unwrap labels that expose .item(); otherwise use the raw value.
        return sample.y.item() if hasattr(sample.y, 'item') else sample.y

    def _binary_distribution(labels):
        # Only labels 0 and 1 are counted, in that key order.
        return {cls: labels.count(cls) for cls in (0, 1)}

    print("Example cross-validation usage:")

    for fold_idx, (train_indices, val_indices) in folds.items():
        print(f"\n--- Fold {fold_idx} ---")

        # Materialise both sides of the fold.
        train_samples = [dataset[position] for position in train_indices]
        val_samples = [dataset[position] for position in val_indices]

        print(f"Training samples: {len(train_samples)}")
        print(f"Validation samples: {len(val_samples)}")

        # Class distribution on each side of this fold.
        train_class_dist = _binary_distribution([_label_of(s) for s in train_samples])
        val_class_dist = _binary_distribution([_label_of(s) for s in val_samples])

        print(f"Training class distribution: {train_class_dist}")
        print(f"Validation class distribution: {val_class_dist}")

        # A real run would at this point:
        # 1. Create DataLoaders for train and val sets
        # 2. Train the model on train_samples
        # 3. Evaluate on val_samples
        # 4. Store the results

        # For demonstration, only show the leading sample IDs.
        head_train_ids = [sample.sample_id for sample in train_samples[:3]]
        head_val_ids = [sample.sample_id for sample in val_samples[:3]]
        print(f"First 3 training sample IDs: {head_train_ids}")
        print(f"First 3 validation sample IDs: {head_val_ids}")

# Run the example
# NOTE(review): `folds` is not assigned in any cell visible in this notebook; it must
# already exist in the kernel for this call to succeed after a restart -- confirm.
example_cross_validation_usage(train_dataset, folds)


Example cross-validation usage:

--- Fold 0 ---
Training samples: 379
Validation samples: 93
Training class distribution: {0: 187, 1: 192}
Validation class distribution: {0: 49, 1: 44}
First 3 training sample IDs: ['LUAD_D017_LUAD_D017', 'LUAD_D094lr_LUAD_D094', 'LUAD_D208ur_LUAD_D208']
First 3 validation sample IDs: ['LUAD_D035lr_LUAD_D035', 'LUAD_D140lr_LUAD_D140', 'LUAD_D193ll_LUAD_D193']

--- Fold 1 ---
Training samples: 372
Validation samples: 100
Training class distribution: {0: 185, 1: 187}
Validation class distribution: {0: 51, 1: 49}
First 3 training sample IDs: ['LUAD_D017_LUAD_D017', 'LUAD_D094lr_LUAD_D094', 'LUAD_D026ll_LUAD_D026']
First 3 validation sample IDs: ['LUAD_D208ur_LUAD_D208', 'LUAD_D181ur_LUAD_D181', 'LUAD_D361ul_LUAD_D361']

--- Fold 2 ---
Training samples: 375
Validation samples: 97
Training class distribution: {0: 185, 1: 190}
Validation class distribution: {0: 51, 1: 46}
First 3 training sample IDs: ['LUAD_D017_LUAD_D017', 'LUAD_D094lr_LUAD_D094', 'LUAD_D208

In [11]:
# Build 5 folds on the training split with the fixed fold-creation function.
print("Testing the FIXED function...")
folds_fixed = create_stratified_cv_folds_fixed(
    dataset=train_dataset,
    clinical_outcome_label="Progression",
    n_folds=5,
    random_state=42,
)

# Confirm no patient is shared between train and validation in any fold.
validation_result = validate_folds(dataset=train_dataset, folds=folds_fixed)


Testing the FIXED function...
      sample_idx              sample_id patient_id  outcome
0              0  LUAD_D352ur_LUAD_D352  LUAD_D352        0
1              1  LUAD_D397ll_LUAD_D397  LUAD_D397        0
2              2  LUAD_D164lr_LUAD_D164  LUAD_D164        0
3              3  LUAD_D198lr_LUAD_D198  LUAD_D198        1
4              4  LUAD_D394ll_LUAD_D394  LUAD_D394        0
...          ...                    ...        ...      ...
1539        1539  LUAD_D275ur_LUAD_D275  LUAD_D275        0
1540        1540    LUAD_D260_LUAD_D260  LUAD_D260        0
1541        1541  LUAD_D212lr_LUAD_D212  LUAD_D212        0
1542        1542  LUAD_D363ur_LUAD_D363  LUAD_D363        0
1543        1543  LUAD_D036ul_LUAD_D036  LUAD_D036        0

[1544 rows x 4 columns]
Original class distribution:
  Class 0: 1308 samples
  Class 1: 236 samples
Minority class count: 236
Balanced class distribution:
  Class 0: 236 samples
  Class 1: 236 samples

Fold 0:
  Training: 379 samples from 192 patien

In [12]:
# Re-run fold creation (same arguments as for `folds_fixed`) to check the result once
# patient ID extraction has been corrected; kept under its own name.
print("Testing the FIXED function with corrected patient ID extraction...")
folds_corrected = create_stratified_cv_folds_fixed(
    dataset=train_dataset,
    clinical_outcome_label="Progression",
    n_folds=5,
    random_state=42,
)

# Confirm no patient is shared between train and validation in any fold.
validation_result = validate_folds(dataset=train_dataset, folds=folds_corrected)


Testing the FIXED function with corrected patient ID extraction...
      sample_idx              sample_id patient_id  outcome
0              0  LUAD_D352ur_LUAD_D352  LUAD_D352        0
1              1  LUAD_D397ll_LUAD_D397  LUAD_D397        0
2              2  LUAD_D164lr_LUAD_D164  LUAD_D164        0
3              3  LUAD_D198lr_LUAD_D198  LUAD_D198        1
4              4  LUAD_D394ll_LUAD_D394  LUAD_D394        0
...          ...                    ...        ...      ...
1539        1539  LUAD_D275ur_LUAD_D275  LUAD_D275        0
1540        1540    LUAD_D260_LUAD_D260  LUAD_D260        0
1541        1541  LUAD_D212lr_LUAD_D212  LUAD_D212        0
1542        1542  LUAD_D363ur_LUAD_D363  LUAD_D363        0
1543        1543  LUAD_D036ul_LUAD_D036  LUAD_D036        0

[1544 rows x 4 columns]
Original class distribution:
  Class 0: 1308 samples
  Class 1: 236 samples
Minority class count: 236
Balanced class distribution:
  Class 0: 236 samples
  Class 1: 236 samples

Fold 0:
  

In [13]:
# Spot-check extract_patient_id against real sample IDs from the training split.
print("Testing patient ID extraction with actual dataset samples:")
print("First 10 samples from the dataset:")

preview_limit = min(10, len(train_dataset))
for i in range(preview_limit):
    sample_id = train_dataset[i].sample_id
    patient_id = extract_patient_id(sample_id)
    print(f"  Sample {i}: {sample_id} -> Patient: {patient_id}")

# Collect the distinct patient IDs among the leading samples (at most 50).
print("\nChecking patient ID patterns:")
patient_ids = set()
scan_limit = min(50, len(train_dataset))
for i in range(scan_limit):
    sample_id = train_dataset[i].sample_id
    patient_id = extract_patient_id(sample_id)
    patient_ids.add(patient_id)

print(f"Unique patient IDs found in first 50 samples: {len(patient_ids)}")
# Only the first 10 IDs in sorted order are shown.
print(f"Sample patient IDs: {sorted(patient_ids)[:10]}...")


Testing patient ID extraction with actual dataset samples:
First 10 samples from the dataset:
  Sample 0: LUAD_D352ur_LUAD_D352 -> Patient: LUAD_D352
  Sample 1: LUAD_D397ll_LUAD_D397 -> Patient: LUAD_D397
  Sample 2: LUAD_D164lr_LUAD_D164 -> Patient: LUAD_D164
  Sample 3: LUAD_D198lr_LUAD_D198 -> Patient: LUAD_D198
  Sample 4: LUAD_D394ll_LUAD_D394 -> Patient: LUAD_D394
  Sample 5: LUAD_D394ll_LUAD_D394 -> Patient: LUAD_D394
  Sample 6: LUAD_D244ur_LUAD_D244 -> Patient: LUAD_D244
  Sample 7: LUAD_D023ul_LUAD_D023 -> Patient: LUAD_D023
  Sample 8: LUAD_D254lr_LUAD_D254 -> Patient: LUAD_D254
  Sample 9: LUAD_D297lr_LUAD_D297 -> Patient: LUAD_D297

Checking patient ID patterns:
Unique patient IDs found in first 50 samples: 48
Sample patient IDs: ['LUAD_D001', 'LUAD_D002', 'LUAD_D008', 'LUAD_D023', 'LUAD_D028', 'LUAD_D032', 'LUAD_D041', 'LUAD_D042', 'LUAD_D043', 'LUAD_D053']...


In [None]:
img_pid = "LUAD_D001ll_LUAD_D001"
# Relative to the notebook's working directory -- the same "../data/Lung" root passed to
# LungDataset in the first cell -- rather than a machine-specific absolute path.
path = os.path.join("..", "data", "Lung")


def _load_raw_pickle(data_root, image_id, suffix):
    """Unpickle ``<data_root>/raw/<image_id>_<suffix>.pickle`` and return the stored object.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on trusted files.
    """
    with open(os.path.join(data_root, "raw", f'{image_id}_{suffix}.pickle'), 'rb') as handle:
        return pickle.load(handle)


# Per-image raw artefacts, each converted to a NumPy array after loading.
feature_arr = np.array(_load_raw_pickle(path, img_pid, "features"))

# This pickle stores a pair: (edge indices, edge lengths). Only the indices are converted.
edge_index_arr, edge_length_arr = _load_raw_pickle(path, img_pid, "edge_index_length")
edge_index_arr = np.array(edge_index_arr)

ct_class_arr = np.array(_load_raw_pickle(path, img_pid, "ct_class"))

coordinates_arr = np.array(_load_raw_pickle(path, img_pid, "coordinates"))

# NOTE(review): if this pickle holds a dict (as the variable name suggests), np.array wraps
# it in a 0-d object array instead of leaving a dict -- confirm this conversion is intended.
clinical_info_dict = np.array(_load_raw_pickle(path, img_pid, "clinical_info"))

In [None]:
# Display the clinical info object loaded (and wrapped with np.array) in the previous cell.
clinical_info_dict

In [None]:
from data_processing import data_processing_lung_pipeline
# Path is relative to the notebook's working directory (same "../data/Lung" root that
# LungDataset is given) so the cell does not depend on one user's home directory.
data_processing_lung_pipeline(os.path.join("..", "data", "Lung", "raw", "merged_preprocessed_dataset.csv"))