# Jupyter notebook sample

In [1]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Data Definition

This defined the feature_data_location

In [2]:
# Defining data for the dataframe
data_paths = {
    "aseg_volumes": "data/aseg_volumes.txt",
    "lh_area": "data/lh_area.txt",
    "lh_thickness": "data/lh_thickness.txt",
    "lh_volume": "data/lh_volume.txt",
    "rh_area": "data/rh_area.txt",
    "rh_thickness": "data/rh_thickness.txt",
    "rh_volume": "data/rh_volume.txt"
}
#
data_frames = {}

### Reading each file to dataframe

In [3]:
for key, path in data_paths.items():
    try:
        print(f'Reading {path}')
        df = pd.read_csv(path, sep="\t")
        df.columns = [col.lower().replace('-', '_') for col in df.columns]
        df.set_index('subject_id', inplace=True)
        data_frames[key] = df
    except Exception as e:
        print(e.with_traceback)

Reading data/aseg_volumes.txt
Reading data/lh_area.txt
Reading data/lh_thickness.txt
Reading data/lh_volume.txt
Reading data/rh_area.txt
Reading data/rh_thickness.txt
Reading data/rh_volume.txt


In [4]:
# Merge with remaining dataframes, specifying unique suffixes for each merge
suffixes_map = {
    'aseg_volumes': '_aseg',
    'lh_area': '_lh_area',
    'lh_thickness': '_lh_thick',
    'lh_volume': '_lh_vol',
    'rh_area': '_rh_area',
    'rh_thickness': '_rh_thick',
    'rh_volume': '_rh_vol'
}

combined_feature_df = list(data_frames.values())[0]

# Skip the first dataframe since it's already in combined_df
for key, df in list(data_frames.items())[1:]:
    try:
        # Use unique suffixes for each merge
        combined_feature_df = pd.merge(
            combined_feature_df,
            df,
            left_index=True,
            right_index=True,
            how='outer',
            suffixes=('', suffixes_map[key])  # Empty string for left, unique suffix for right
        )
        print(f"Merged {key} into combined dataframe")
    except Exception as e:
        print(f"Error merging {key}: {e}")

# Sort by subject_id
combined_feature_df = combined_feature_df.sort_values(by='subject_id')

Merged lh_area into combined dataframe
Merged lh_thickness into combined dataframe
Merged lh_volume into combined dataframe
Merged rh_area into combined dataframe
Merged rh_thickness into combined dataframe
Merged rh_volume into combined dataframe


In [5]:
combined_feature_df.head()

Unnamed: 0_level_0,left_lateral_ventricle,left_inf_lat_vent,left_cerebellum_white_matter,left_cerebellum_cortex,left_thalamus,left_caudate,left_putamen,left_pallidum,3rd_ventricle,4th_ventricle,...,rh_superiorfrontal_volume,rh_superiorparietal_volume,rh_superiortemporal_volume,rh_supramarginal_volume,rh_frontalpole_volume,rh_temporalpole_volume,rh_transversetemporal_volume,rh_insula_volume,brainsegvolnotvent_rh_vol,etiv_rh_vol
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sub-0010033,3530.3,362.4,14119.1,59849.2,7626.0,4328.3,5889.7,1710.6,778.9,2239.4,...,15798.0,13952.0,12314.0,1368.0,2102.0,1080.0,6485.0,1154368.0,1555280.0,
sub-0010039,3788.8,608.4,14945.3,57459.8,6921.0,3779.0,5825.4,1660.8,634.0,1365.9,...,14540.0,14238.0,10740.0,1582.0,2606.0,930.0,6174.0,1140274.0,1541012.0,
sub-0010058,5322.1,553.4,14231.3,64201.8,6967.5,4217.1,6774.6,1788.3,716.9,2270.6,...,16420.0,15134.0,13824.0,1740.0,2171.0,1026.0,6894.0,1256986.0,1691810.0,
sub-0010062,5852.3,663.3,19846.8,65733.1,9516.1,5000.9,6827.0,2237.7,833.8,1495.2,...,14379.0,13727.0,11240.0,1218.0,2167.0,1207.0,7221.0,1328274.0,1814578.0,
sub-0010069,5430.5,681.1,16942.4,63117.5,8503.8,4345.0,6176.3,1999.5,838.0,1772.8,...,15461.0,12962.0,12447.0,1590.0,3036.0,1127.0,6309.0,1338835.0,1864284.0,


### Load demographic data

In [6]:
demographic_data_path = "data/combined_participants_with_diagnosis.csv"
demographic_dataframe = pd.read_csv(demographic_data_path)
# Rename column participant_id to subject_id
demographic_dataframe.rename(columns={'participant_id': 'subject_id'}, inplace=True)
# Reset index
demographic_dataframe.index = demographic_dataframe['subject_id']
demographic_dataframe.head()

Unnamed: 0_level_0,subject_id,gender,age,handedness,verbal_iq,source_folder,adhd_index,adhd_measure,dx,performance_iq,...,qc_rest_4,hyper_impulsive,secondary_dx,study_#,secondary_dx,session,original_participant_id,diagnosis_status,age_group,gender_std
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sub-0000213,sub-0000213,2,,80.0,,ds003500-download,,,0,,...,,,,,,,sub-213,Typical Development,,2
sub-0000214,sub-0000214,2,,95.0,,ds003500-download,,,0,,...,,,,,,,sub-214,Typical Development,,2
sub-0000218,sub-0000218,1,,-90.0,,ds003500-download,,,0,,...,,,,,,,sub-218,Typical Development,,1
sub-0000219,sub-0000219,1,,90.0,,ds003500-download,,,1,,...,,,,,,,sub-219,ADHD,,1
sub-0000220,sub-0000220,1,,90.0,,ds003500-download,,,0,,...,,,,,,,sub-220,Typical Development,,1


#### Cleaning Demographic Data

In [7]:
def print_distinct_values(df):
    """
    Prints the distinct values for each column in the DataFrame,
    along with their counts and data types.
    """
    print("\n=== COLUMN VALUE ANALYSIS ===")

    for column in df.columns:
        # Get the data type
        dtype = df[column].dtype

        # Count null values
        null_count = df[column].isna().sum()
        null_percentage = (null_count / len(df)) * 100

        print(f"\nColumn: {column}")
        print(f"Data type: {dtype}")
        print(f"Null values: {null_count} ({null_percentage:.1f}%)")

        # For columns with many distinct values (like age), show statistics instead
        unique_count = df[column].nunique()

        if unique_count > 10 and pd.api.types.is_numeric_dtype(df[column]):
            # For numeric columns with many values, show statistics
            print(f"Unique values: {unique_count}")
            print(f"Min: {df[column].min()}")
            print(f"Max: {df[column].max()}")
            print(f"Mean: {df[column].mean():.2f}")
            print(f"Median: {df[column].median()}")
        else:
            # For columns with fewer unique values or non-numeric types,
            # show the actual values and their counts
            value_counts = df[column].value_counts(dropna=False).head(15)
            print(f"Unique values: {unique_count}")
            print("Value counts (top 15):")
            for value, count in value_counts.items():
                percentage = (count / len(df)) * 100
                value_display = str(value)
                if pd.isna(value):
                    value_display = "NULL/NaN"
                elif value == "":
                    value_display = "[empty string]"
                print(f"  {value_display}: {count} ({percentage:.1f}%)")

            # If there are more than 15 unique values, indicate there are more
            if unique_count > 15:
                print(f"  ... and {unique_count - 15} more values")


In [8]:
# Only select the necessary columns
demographic_dataframe = demographic_dataframe.loc[:, ['subject_id', 'gender', 'age', 'dx', 'Age', 'Gender', 'sex', 'diagnosis_status']]

# Convert all gender values to lowercase for consistency
if demographic_dataframe['gender'].dtype == 'object':
    demographic_dataframe['gender'] = demographic_dataframe['gender'].str.lower()

# Convert all Gender values to lowercase for consistency
if demographic_dataframe['Gender'].dtype == 'object':
    demographic_dataframe['Gender'] = demographic_dataframe['Gender'].str.lower()

# Convert all Gender values to lowercase for consistency
if demographic_dataframe['sex'].dtype == 'object':
    demographic_dataframe['sex'] = demographic_dataframe['sex'].str.lower()

# Map gender values to 1 (Male) and 2 (Female)
gender_mapping = {
    'm': 1, 'male': 1, 'man': 1, '1': 1, 1: 1,
    'f': 2, 'female': 2, 'woman': 2, '2': 2, 2: 2
}

demographic_dataframe['gender_code'] = demographic_dataframe['gender'].map(gender_mapping)
demographic_dataframe['Gender_code'] = demographic_dataframe['Gender'].map(gender_mapping)
demographic_dataframe['sex_code'] = demographic_dataframe['sex'].map(gender_mapping)

# Combined gender information
# Create a new column that combines gender_code, Gender_code, and sex_code
demographic_dataframe['combined_gender_code'] = None

# Fill values from each column in order of priority
# First try gender_code
demographic_dataframe['combined_gender_code'] = demographic_dataframe['gender_code']

# Then fill in missing values from Gender_code
mask = demographic_dataframe['combined_gender_code'].isna() & ~demographic_dataframe['Gender_code'].isna()
demographic_dataframe.loc[mask, 'combined_gender_code'] = demographic_dataframe.loc[mask, 'Gender_code']

# Finally fill in missing values from sex_code
mask = demographic_dataframe['combined_gender_code'].isna() & ~demographic_dataframe['sex_code'].isna()
demographic_dataframe.loc[mask, 'combined_gender_code'] = demographic_dataframe.loc[mask, 'sex_code']

# Drop the individual code columns if you want
demographic_dataframe = demographic_dataframe.drop(columns=['gender_code', 'Gender_code', 'sex_code', 'gender', 'Gender', 'sex'])

# You can rename the combined column to something more concise if needed
demographic_dataframe = demographic_dataframe.rename(columns={'combined_gender_code': 'gender'})

demographic_dataframe.head()

#print_distinct_values(demographic_dataframe)

Unnamed: 0_level_0,subject_id,age,dx,Age,diagnosis_status,gender
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sub-0000213,sub-0000213,,0,,Typical Development,2.0
sub-0000214,sub-0000214,,0,,Typical Development,2.0
sub-0000218,sub-0000218,,0,,Typical Development,1.0
sub-0000219,sub-0000219,,1,,ADHD,1.0
sub-0000220,sub-0000220,,0,,Typical Development,1.0


#### Clean up age

In [9]:
# Create a new column that combines Age and age
demographic_dataframe['combined_age'] = None

# Fill values from each column in order of priority
# First try age
demographic_dataframe['combined_age'] = demographic_dataframe['age']

# Then fill in missing values from Age
mask = demographic_dataframe['combined_age'].isna() & ~demographic_dataframe['Age'].isna()
demographic_dataframe.loc[mask, 'combined_age'] = demographic_dataframe.loc[mask, 'Age']

# Make sure age is numeric
demographic_dataframe['combined_age'] = pd.to_numeric(demographic_dataframe['combined_age'], errors='coerce')

# Drop the individual age columns if you want
demographic_dataframe = demographic_dataframe.drop(columns=['age', 'Age'])

# You can rename the combined column to the standard name
demographic_dataframe = demographic_dataframe.rename(columns={'combined_age': 'age'})

#### Clean up diagnosis

In [10]:

dx_mapping = {
    'Typically Developing Children': 1, 'ADHD-Combined': 1, 'ADHD-Inattentive': 1, 'ADHD-Hyperactive/Impulsive': 1, '1': 1, 1: 1, 'ADHD': 1,
    'Typically Developing Children': 0, '0': 0, 0: 0, 'Typical Development': 0
}

demographic_dataframe['dx_clean'] = demographic_dataframe['dx'].map(dx_mapping)
demographic_dataframe['diagnosis_status_clean'] = demographic_dataframe['diagnosis_status'].map(dx_mapping)

# Combine the two fields, prioritizing 'dx_clean' if available
demographic_dataframe['combined_dx'] = demographic_dataframe['dx_clean'].combine_first(demographic_dataframe['diagnosis_status_clean'])

# Optional: If there are still missing values, fill them with a default value (e.g., -1 or NaN)
demographic_dataframe['combined_dx'].fillna(np.nan, inplace=True)

# Drop intermediate columns if no longer needed
demographic_dataframe.drop(['dx_clean', 'diagnosis_status_clean'], axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  demographic_dataframe['combined_dx'].fillna(np.nan, inplace=True)


In [11]:

demographic_dataframe.drop(['dx', 'diagnosis_status'], axis=1, inplace=True)
demographic_dataframe.rename(columns={'combined_dx': 'label'}, inplace=True)
demographic_dataframe.head()

Unnamed: 0_level_0,subject_id,gender,age,label
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sub-0000213,sub-0000213,2.0,,0.0
sub-0000214,sub-0000214,2.0,,0.0
sub-0000218,sub-0000218,1.0,,0.0
sub-0000219,sub-0000219,1.0,,1.0
sub-0000220,sub-0000220,1.0,,0.0


### Combined extracted feature and demographic data

In [12]:
final_feature = pd.merge(combined_feature_df, demographic_dataframe, left_index=True, right_index=True, how='left')
print(final_feature.columns)
final_feature.head()

# Write the dataframe to a CSV file
output_filename = "data/final_features.csv"
final_feature.to_csv(output_filename, index=True)
print(f"Data successfully written to {output_filename}")


Index(['left_lateral_ventricle', 'left_inf_lat_vent',
       'left_cerebellum_white_matter', 'left_cerebellum_cortex',
       'left_thalamus', 'left_caudate', 'left_putamen', 'left_pallidum',
       '3rd_ventricle', '4th_ventricle',
       ...
       'rh_frontalpole_volume', 'rh_temporalpole_volume',
       'rh_transversetemporal_volume', 'rh_insula_volume',
       'brainsegvolnotvent_rh_vol', 'etiv_rh_vol', 'subject_id', 'gender',
       'age', 'label'],
      dtype='object', length=295)
Data successfully written to data/final_features.csv


### Create Training, Test and Validation Data

In [13]:
from sklearn.model_selection import train_test_split

test_df = final_feature[final_feature['label'].isna()].copy()
train_val_df = final_feature[final_feature['label'].notna()].copy()

# Separate features and labels for train/validation
X = train_val_df.drop(columns=['subject_id', 'label'])
y = train_val_df['label']

# Split into training and validation sets (e.g., 80% train, 20% validation)
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Prepare test set features (without labels)
X_test = test_df.drop(columns=['subject_id', 'label'])
y_test = test_df['label']

# Display the shapes to verify splits
print(f'Train shape: {X_train.shape}, Train labels: {y_train.shape}')
print(f'Validation shape: {X_validation.shape}, Validation labels: {y_validation.shape}')
print(f'Test shape (no labels): {X_test.shape}, expected labels: {y_test.shape}')

Train shape: (58, 293), Train labels: (58,)
Validation shape: (20, 293), Validation labels: (20,)
Test shape (no labels): (21, 293), expected labels: (21,)


### Initial ML Model

In [14]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

# Pipeline with imputation and SVC classifier
svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Mean imputation (can also try median)
    ('svc', SVC(kernel='linear', random_state=42))
])

# Train the pipeline
svm_pipeline.fit(X_train, y_train)

# Predict on validation data
y_pred_validation = svm_pipeline.predict(X_validation)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_validation, y_pred_validation))
print("\nClassification Report:\n", classification_report(y_validation, y_pred_validation))


Validation Accuracy: 0.7

Classification Report:
               precision    recall  f1-score   support

         0.0       0.73      0.85      0.79        13
         1.0       0.60      0.43      0.50         7

    accuracy                           0.70        20
   macro avg       0.67      0.64      0.64        20
weighted avg       0.69      0.70      0.69        20



 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.


### Cross Validation

In [15]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, make_scorer, precision_score, recall_score, f1_score

# Define your pipeline
svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Mean imputation
    ('svc', SVC(kernel='linear', random_state=42))
])

# Define the number of folds (commonly 5 or 10)
n_folds = 5

# Create a stratified k-fold cross-validator
# Stratified ensures that each fold has the same proportion of class labels
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Define multiple scoring metrics to evaluate
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform cross-validation with multiple metrics
cv_results = cross_validate(
    svm_pipeline,
    X_train,  # Use your combined training+validation set here
    y_train,  # Use your combined training+validation labels here
    cv=cv,
    scoring=scoring,
    return_train_score=True
)

# Print the results
print(f"Cross-Validation Results ({n_folds} folds):")
print(f"Accuracy: {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
print(f"Precision: {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
print(f"Recall: {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
print(f"F1 Score: {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")

# If you want to see individual fold scores
for i, acc in enumerate(cv_results['test_accuracy']):
    print(f"Fold {i+1} Accuracy: {acc:.4f}")

 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.


Cross-Validation Results (5 folds):
Accuracy: 0.4818 ± 0.0811
Precision: 0.5196 ± 0.0892
Recall: 0.4818 ± 0.0811
F1 Score: 0.4856 ± 0.0795
Fold 1 Accuracy: 0.5833
Fold 2 Accuracy: 0.5000
Fold 3 Accuracy: 0.4167
Fold 4 Accuracy: 0.3636
Fold 5 Accuracy: 0.5455


 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.


### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier


# Pipeline with imputation and SVC classifier
rf_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Mean imputation (can also try median)
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the pipeline
rf_pipeline.fit(X_train, y_train)

# Predict on validation data
y_pred_validation = rf_pipeline.predict(X_validation)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_validation, y_pred_validation))
print("\nClassification Report:\n", classification_report(y_validation, y_pred_validation))

Validation Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.92      0.83        13
         1.0       0.75      0.43      0.55         7

    accuracy                           0.75        20
   macro avg       0.75      0.68      0.69        20
weighted avg       0.75      0.75      0.73        20



 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.


### Random Forest Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, make_scorer, precision_score, recall_score, f1_score

# Pipeline with imputation and SVC classifier
rf_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Mean imputation (can also try median)
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Define the number of folds (commonly 5 or 10)
n_folds = 10

# Create a stratified k-fold cross-validator
# Stratified ensures that each fold has the same proportion of class labels
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Define multiple scoring metrics to evaluate
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform cross-validation with multiple metrics
cv_results = cross_validate(
    rf_pipeline,
    X_train,  # Use your combined training+validation set here
    y_train,  # Use your combined training+validation labels here
    cv=cv,
    scoring=scoring,
    return_train_score=True
)

# Print the results
print(f"Cross-Validation Results ({n_folds} folds):")
print(f"Accuracy: {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
print(f"Precision: {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
print(f"Recall: {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
print(f"F1 Score: {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")

# If you want to see individual fold scores
for i, acc in enumerate(cv_results['test_accuracy']):
    print(f"Fold {i+1} Accuracy: {acc:.4f}")

 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.

Cross-Validation Results (10 folds):
Accuracy: 0.6900 ± 0.1613
Precision: 0.6011 ± 0.2171
Recall: 0.6900 ± 0.1613
F1 Score: 0.6274 ± 0.1820
Fold 1 Accuracy: 0.6667
Fold 2 Accuracy: 0.6667
Fold 3 Accuracy: 0.6667
Fold 4 Accuracy: 0.8333
Fold 5 Accuracy: 0.6667
Fold 6 Accuracy: 0.6667
Fold 7 Accuracy: 0.5000
Fold 8 Accuracy: 0.8333
Fold 9 Accuracy: 0.4000
Fold 10 Accuracy: 1.0000


 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
 'etiv_rh_vol']. At least one non-missing value is needed for imputation with strategy='mean'.
