In [1]:
import pandas as pd
import re
import os
from pathlib import Path
import glob


In [2]:
# Convert a binned age range into a single value (the midpoint of the range)
def convert_age_range_to_midpoint(age_string):
    """
    Collapse a binned age label into one representative number.

    Every run of digits found in ``str(age_string)`` is extracted. When two
    or more runs are present, the mean of the first two is returned; when
    exactly one is present, that number is returned as a float.

    Parameters:
    -----------
    age_string : object
        Age label to convert; it is passed through ``str`` before the
        digit search, so non-string values are accepted.

    Returns:
    --------
    float or None
        The representative age, or None when the value is missing
        (according to ``pd.isna``) or contains no digits at all.
    """
    # Missing values carry no age information.
    if pd.isna(age_string):
        return None

    digit_runs = re.findall(r'\d+', str(age_string))

    if not digit_runs:
        return None
    if len(digit_runs) == 1:
        return float(digit_runs[0])

    # Only the first two numbers define the range; anything after is ignored.
    lower, upper = (int(token) for token in digit_runs[:2])
    return (lower + upper) / 2

# Combine Metadata_Controls_Release.csv and Metadata_Release_Anon.csv from the Subject folder into a single dataframe
def create_demographics_dataframe(subject_dir='Subject'):
    """
    Build one demographics table from the two metadata CSV files.

    Reads ``Metadata_Controls_Release.csv`` and ``Metadata_Release_Anon.csv``
    from ``subject_dir``, keeps the ``ID`` and ``Sex`` columns, and replaces
    ``Binned_Age_at_Scan`` with a numeric ``Age`` column computed by
    ``convert_age_range_to_midpoint``.

    Parameters:
    -----------
    subject_dir : str or os.PathLike
        Folder containing the two metadata files. Defaults to ``'Subject'``
        relative to the current working directory. Paths are assembled with
        ``pathlib`` so the lookup works on any operating system, not only
        with Windows-style separators.

    Returns:
    --------
    pd.DataFrame
        Columns ``ID``, ``Sex`` and ``Age``; one row per ``ID``, sorted by
        ``ID`` with a fresh 0..n-1 index.
    """
    subject_dir = Path(subject_dir)

    # Order matters: controls come first, so on a duplicated ID the controls
    # row is the one kept by drop_duplicates(keep='first') below.
    metadata_files = ('Metadata_Controls_Release.csv', 'Metadata_Release_Anon.csv')

    frames = []
    for filename in metadata_files:
        metadata = pd.read_csv(subject_dir / filename)
        subset = metadata[['ID', 'Sex', 'Binned_Age_at_Scan']].copy()
        subset['Age'] = subset['Binned_Age_at_Scan'].apply(convert_age_range_to_midpoint)
        # Only keep the features that are available for every patient.
        frames.append(subset[['ID', 'Sex', 'Age']])

    combined_df = pd.concat(frames, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=['ID'], keep='first')
    return combined_df.sort_values('ID').reset_index(drop=True)

In [3]:
# Build the combined demographics table (ID, Sex, Age) from the metadata CSVs.
demographics_df = create_demographics_dataframe()

# Persist it without the positional index; process_with_additional_data()
# later reads this same file name as its additional CSV.
demographics_df.to_csv('combined_demographics.csv', index=False)
print("\nDataframe saved as 'combined_demographics.csv'")


Dataframe saved as 'combined_demographics.csv'


In [4]:
#Read a FreeSurfer stats file and return a DataFrame
def read_freesurfer_stats(filepath):
    """
    Parse a FreeSurfer stats file into a DataFrame.

    The column names are taken from the first line starting with
    ``# ColHeaders``. Every later line that is neither blank nor a ``#``
    comment is split on whitespace and becomes one row.

    Parameters:
    -----------
    filepath : str
        Path to the stats file.

    Returns:
    --------
    pd.DataFrame
        One row per data line. Columns whose values all parse as numbers
        are converted to a numeric dtype; the others are left as text.

    Raises:
    -------
    ValueError
        If the file has no ``# ColHeaders`` line.
    """
    with open(filepath, 'r') as file:
        lines = file.readlines()

    for i, line in enumerate(lines):
        if line.startswith('# ColHeaders'):
            # Drop the leading '#' and 'ColHeaders' tokens.
            headers = line.strip().split()[2:]
            data_start_idx = i + 1
            break
    else:
        raise ValueError("No column headers found in the file.")

    data = [
        line.split()
        for line in lines[data_start_idx:]
        if not line.startswith('#') and line.strip()
    ]

    df = pd.DataFrame(data, columns=headers)

    # pd.to_numeric(errors='ignore') is deprecated and emits a FutureWarning;
    # catching the conversion error keeps the same "leave text columns
    # untouched" behaviour without relying on it.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            continue

    return df

def load_additional_data(csv_filepath, subject_id_column='Subject_ID'):
    """
    Read a per-subject CSV and index it by subject ID.

    Parameters:
    -----------
    csv_filepath : str
        Location of the CSV file holding the extra patient data
    subject_id_column : str
        Column of the CSV that holds the subject IDs

    Returns:
    --------
    pd.DataFrame or None
        The CSV content indexed by subject ID (IDs cast to ``str`` so they
        can be matched against string subject IDs), or None when anything
        goes wrong while reading or indexing; the error is printed.
    """
    try:
        table = pd.read_csv(csv_filepath)

        # String IDs line up with the directory-name IDs of the FreeSurfer data.
        table[subject_id_column] = table[subject_id_column].astype(str)
        table = table.set_index(subject_id_column)

        print(f"Successfully loaded additional data from: {csv_filepath}")
        print(f"Additional data shape: {table.shape}")
        print(f"Additional data columns: {list(table.columns)}")
    except Exception as e:
        # Best effort: callers treat None as "no additional data available".
        print(f"Error loading additional data from {csv_filepath}: {str(e)}")
        return None

    return table


In [5]:
def process_lh_aparc_a2009s(base_dir):
    """
    Collect ``lh.aparc.a2009s.stats`` measures for every subject folder.

    Looks in ``<base_dir>/Subject`` for sub-directories whose names consist
    only of digits, reads ``stats/lh.aparc.a2009s.stats`` from each with
    ``read_freesurfer_stats`` and flattens the per-region measures into one
    row per subject with columns named ``<measure>_<StructName>``.

    Parameters:
    -----------
    base_dir : str
        Directory that contains the ``Subject`` folder.

    Returns:
    --------
    pd.DataFrame
        One row per processed subject, ordered by numeric subject ID.
        ``Epileptic_Status`` is the first column, followed by the measure
        columns in alphabetical order.

    Raises:
    -------
    ValueError
        If the ``Subject`` folder does not exist, or if no subject could
        be processed.
    """
    subjects_root = os.path.join(base_dir, 'Subject')
    if not os.path.exists(subjects_root):
        raise ValueError(f"Subject directory not found: {subjects_root}")

    # Only folders with purely numeric names are treated as subjects.
    subject_dirs = []
    for entry in os.listdir(subjects_root):
        if os.path.isdir(os.path.join(subjects_root, entry)) and entry.isdigit():
            subject_dirs.append(entry)

    print(f"Found {len(subject_dirs)} subject directories: {sorted(subject_dirs)}")

    # TO CHANGE TO INCLUDE LESS FEATURES
    wanted_measures = ['SurfArea', 'GrayVol', 'ThickAvg', 'ThickStd',
                       'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd']

    features_by_subject = {}
    for subject_id in subject_dirs:
        stats_path = os.path.join(subjects_root, subject_id, 'stats', 'lh.aparc.a2009s.stats')
        if not os.path.exists(stats_path):
            print(f"Warning: {stats_path} not found for subject {subject_id}")
            continue

        try:
            stats = read_freesurfer_stats(stats_path)

            present = [name for name in wanted_measures if name in stats.columns]
            if not present:
                print(f"Warning: No standard measures found in {subject_id}")
                continue

            # One flat record per subject: every (measure, region) pair is a column.
            record = {}
            for _, region_row in stats.iterrows():
                region = region_row['StructName']
                record.update({f"{name}_{region}": region_row[name] for name in present})

            features_by_subject[subject_id] = record
        except Exception as e:
            # A broken subject is reported and skipped, not fatal.
            print(f"Error processing subject {subject_id}: {str(e)}")
            continue

    if not features_by_subject:
        raise ValueError("No data was successfully processed")

    result_df = pd.DataFrame.from_dict(features_by_subject, orient='index')
    result_df.index.name = 'Subject'

    # Subject IDs are strings; order them by their numeric value.
    result_df = result_df.reindex(sorted(result_df.index, key=int))

    def get_epileptic_status(subject_id):
        # The label is derived purely from the numeric ID range.
        subject_num = int(subject_id)
        if subject_num >= 4000:
            return 'Not Epileptic'
        if 1 <= subject_num <= 463:
            return 'Epileptic'
        return 'Unknown'

    result_df['Epileptic_Status'] = [get_epileptic_status(subj) for subj in result_df.index]

    # Label first, then the measure columns alphabetically.
    measure_cols = sorted(col for col in result_df.columns if col != 'Epileptic_Status')
    result_df = result_df[['Epileptic_Status'] + measure_cols]

    print(f"\nSuccessfully processed {len(features_by_subject)} subjects")
    print(f"Final dataset shape: {result_df.shape}")

    return result_df
    
def process_lh_aparc_a2009s_with_additional_data(base_dir, additional_csv_path=None,
                                                subject_id_column='Subject_ID'):
    """
    Build the FreeSurfer table and optionally left-join extra per-subject data.

    Parameters:
    -----------
    base_dir : str
        Passed to ``process_lh_aparc_a2009s``.
    additional_csv_path : str or None
        CSV with extra per-subject columns; when None the FreeSurfer table
        is returned unchanged.
    subject_id_column : str
        Name of the subject-ID column inside the CSV.

    Returns:
    --------
    pd.DataFrame or None
        The FreeSurfer table, with the CSV columns joined on the subject
        index when the CSV could be loaded. Column order after a merge is
        ``Epileptic_Status``, the CSV columns, then the remaining columns
        alphabetically. Subjects missing from the CSV keep their row with
        empty values in the CSV columns (left join).
    """
    def _numeric_then_rest(label):
        # Non-numeric IDs are pushed behind every numeric one.
        return int(label) if label.isdigit() else float('inf')

    freesurfer_data = process_lh_aparc_a2009s(base_dir)
    if freesurfer_data is None:
        print("Failed to process FreeSurfer data")
        return None

    # Without a CSV there is nothing to merge.
    if additional_csv_path is None:
        print("No additional CSV provided, returning FreeSurfer data only")
        return freesurfer_data

    additional_data = load_additional_data(additional_csv_path, subject_id_column)
    if additional_data is None:
        print("Failed to load additional data, returning FreeSurfer data only")
        return freesurfer_data

    fs_ids = freesurfer_data.index.tolist()
    extra_ids = additional_data.index.tolist()
    print(f"FreeSurfer data subjects: {sorted(fs_ids, key=int)}")
    print(f"Additional data subjects: {sorted(extra_ids, key=_numeric_then_rest)}")

    merged = freesurfer_data.join(additional_data, how='left')

    # Report how well the two subject sets overlap.
    fs_set = set(fs_ids)
    extra_set = set(extra_ids)
    in_both = fs_set & extra_set
    fs_without_extra = fs_set - extra_set
    extra_without_fs = extra_set - fs_set

    print(f"\nMerge Results:")
    print(f"  Subjects with both FreeSurfer and additional data: {len(in_both)}")
    print(f"  Subjects with only FreeSurfer data: {len(fs_without_extra)}")

    if fs_without_extra:
        print(f"  FreeSurfer-only subjects: {sorted(list(fs_without_extra), key=int)}")

    if extra_without_fs:
        print(f"  Additional-only subjects: {sorted(list(extra_without_fs), key=_numeric_then_rest)}")

    # Label first, then the CSV columns, then everything else alphabetically.
    label_cols = ['Epileptic_Status']
    csv_cols = [name for name in additional_data.columns if name in merged.columns]
    measure_cols = [name for name in merged.columns
                    if name not in label_cols and name not in csv_cols]

    return merged[label_cols + csv_cols + sorted(measure_cols)]


  
def process_with_additional_data():
    """
    Run the FreeSurfer + demographics pipeline and save the result.

    Calls ``process_lh_aparc_a2009s_with_additional_data`` with the
    settings defined at the top of the function body, writes the returned
    table to ``lh.csv`` in the current working directory and prints a
    per-column summary of missing values.

    Returns:
    --------
    pd.DataFrame or None
        The combined table, or None when the pipeline returned nothing or
        raised an error (the error message is printed).
    """
    # An empty base directory makes the 'Subject' folder resolve relative
    # to the current working directory.
    base_directory = r""

    # Path to your additional CSV file. No Windows-style '.\' prefix: on
    # other platforms the backslash would become part of the file name and
    # the demographics would silently not be merged.
    additional_csv_path = 'combined_demographics.csv'  # UPDATE THIS PATH

    subject_id_column = 'ID'  # UPDATE THIS IF THE COLUMN HAS A DIFFERENT NAME

    try:
        combined_data = process_lh_aparc_a2009s_with_additional_data(
            base_directory,
            additional_csv_path,
            subject_id_column
        )

        if combined_data is None:
            return None

        output_file = 'lh.csv'
        combined_data.to_csv(output_file)
        print(f"\nCombined data saved to: {output_file}")

        # Report every column that still has gaps after the merge.
        missing_summary = combined_data.isnull().sum()
        cols_with_missing = missing_summary[missing_summary > 0]

        if len(cols_with_missing) > 0:
            print(f"\nColumns with missing values:")
            for col, count in cols_with_missing.items():
                percentage = (count / len(combined_data)) * 100
                print(f"  {col}: {count} missing ({percentage:.1f}%)")
        else:
            print(f"\nNo missing values in the dataset!")

        return combined_data

    except Exception as e:
        # Best effort for interactive use: report the problem, return None.
        print(f"Error processing data: {str(e)}")
        return None



In [6]:
# Run the full pipeline; `data` is the combined DataFrame, or None when
# process_with_additional_data() reported an error.
data = process_with_additional_data()
print(data)

Found 542 subject directories: ['1', '10', '100', '101', '102', '103', '104', '105', '106', '108', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '14', '141', '142', '143', '144', '145', '146', '147', '148', '149', '15', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '16', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '17', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '18', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '19', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '2', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '21', '210', '211', '213', '214', '215', '216', '217', '218', '219', '22', '220', '221', '222', '223', '224', '225', '226', '227', '228', '229', '23'

  df[col] = pd.to_numeric(df[col], errors='ignore')



Successfully processed 542 subjects
Final dataset shape: (542, 593)
Successfully loaded additional data from: .\combined_demographics.csv
Additional data shape: (542, 2)
Additional data columns: ['Sex', 'Age']
FreeSurfer data subjects: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '74', '75', '76', '77', '78', '79', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129'