# WM and GM distribution in the pediatric spinal cord

This jupyter notebook includes scripts to generate figures related to the white matter and gray matter distribution in the pediatric spinal cord.

In [9]:
import os
import pandas as pd
import json
import yaml
import re
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import webbrowser
import statsmodels.formula.api as smf

### Load config file to get path to dataset 

In [10]:
# Load config file
with open('../../config/config_preprocessing.yaml' , 'r') as file:
    config = yaml.safe_load(file)

# Get data path from config file
path_data = config['path_data']

### Get the `participants.tsv` file from the dataset

In [11]:
# Get path to participants.tsv file
participants_tsv = pd.read_csv(os.path.join(path_data, 'participants.tsv'), sep='\t')
participants_tsv

Unnamed: 0,participant_id,age,sex,group,scan_series,height,weight
0,sub-101,17,M,control,complete,1.778004,68.038864
1,sub-102,15,F,control,complete,1.625603,52.163129
2,sub-103,15,M,control,complete,1.651003,54.431091
3,sub-104,15,F,control,complete,1.625603,52.163129
4,sub-105,13,M,control,complete,1.524000,35.381000
...,...,...,...,...,...,...,...
110,sub-214,6,M,control,complete,,
111,sub-215,16,F,control,complete,,
112,sub-216,15,F,control,complete,,
113,sub-217,15,M,control,complete,,


## Dataframe of subjects included in the pipeline analysis

The following dataframe contains only the subjects that were included in this pipeline analysis.

In [47]:
def get_list_of_subjects_to_include(contrast, path_data, missing_data_subjects):
    """
    This function takes an image contrast (T2w, dwi, etc.), a path to a dataset, and a list of subjects with missing data,
    and returns a list of subjects to include in the analysis.

    The dataset needs to be in BIDS format, and the function will look for the participants.tsv file to get the list of subjects.
    The dataset should also contain an `exclude.yml` file that lists subjects to exclude from the analysis.
    """

    # Get the `participants.tsv` file and read it into a dataframe
    participants_tsv = pd.read_csv(os.path.join(path_data, 'participants.tsv'), sep='\t')

    # Get all subject IDs from the participants.tsv
    all_subjects = participants_tsv['participant_id'].tolist()

    # Get list of subjects to exclude from the analysis from the `exclude.yml` file
    with open(os.path.join(path_data, 'exclude.yml'), 'r') as file:
        exclude_yml = yaml.safe_load(file)

    exclude_t2star_key = exclude_yml.get(contrast, []) # Extract subjects under contrast key
    exclude_subjects = sorted(set(re.match(r"(sub-\d+)", entry).group(1) for entry in exclude_t2star_key if re.match(r"(sub-\d+)", entry))) # Extract the subject ID 

    # Add the list of subjects with missing data to the exclude_subjects list
    exclude_subjects.extend(missing_data_subjects)
    
    # Remove duplicates (if any), sort and print the list of subjects to exclude from the analysis
    exclude_subjects = sorted(set(exclude_subjects))
    print(f'subjects to exclude : {exclude_subjects}')

    # Compute the list of subjects to include in the analysis 
    include_subjects = [sub for sub in all_subjects if sub not in exclude_subjects]

    # Convert the list of included subjects to a dataframe
    include_subjects = participants_tsv[participants_tsv['participant_id'].isin(include_subjects)]

    return include_subjects

In [58]:
# List of subjects with missing dwi data
missing_t2star_subjects = ["sub-125",
                        "sub-136",
                        "sub-152",
                        "sub-159",
                        "sub-174",
                        "sub-198",
                        "sub-200",
                        "sub-205",
                        "sub-211",
                        "sub-213"]

# Get the list of subjects to include in the analysis
include_t2star_subjects = get_list_of_subjects_to_include('t2starw', path_data, missing_t2star_subjects)
include_t2star_subjects.to_csv(os.path.join('../tables/WMGM_distribution/include_t2star_subjects.csv'), sep='\t', index=False)

print(include_t2star_subjects.shape[0], "subjects to include in the analysis")
print(f"\n list of subjects to include : \n {include_t2star_subjects['participant_id'].tolist()}")

include_t2star_subjects

subjects to exclude : ['sub-106', 'sub-107', 'sub-110', 'sub-120', 'sub-124', 'sub-125', 'sub-136', 'sub-139', 'sub-141', 'sub-144', 'sub-150', 'sub-152', 'sub-154', 'sub-159', 'sub-160', 'sub-168', 'sub-169', 'sub-170', 'sub-171', 'sub-172', 'sub-174', 'sub-181', 'sub-188', 'sub-189', 'sub-190', 'sub-191', 'sub-193', 'sub-194', 'sub-196', 'sub-198', 'sub-199', 'sub-200', 'sub-203', 'sub-204', 'sub-205', 'sub-208', 'sub-209', 'sub-211', 'sub-212', 'sub-213', 'sub-214']
74 subjects to include in the analysis

 list of subjects to include : 
 ['sub-101', 'sub-102', 'sub-103', 'sub-104', 'sub-105', 'sub-108', 'sub-109', 'sub-111', 'sub-112', 'sub-113', 'sub-114', 'sub-115', 'sub-116', 'sub-117', 'sub-118', 'sub-119', 'sub-121', 'sub-122', 'sub-123', 'sub-126', 'sub-127', 'sub-128', 'sub-129', 'sub-130', 'sub-131', 'sub-132', 'sub-133', 'sub-134', 'sub-135', 'sub-137', 'sub-138', 'sub-140', 'sub-142', 'sub-143', 'sub-145', 'sub-146', 'sub-147', 'sub-148', 'sub-149', 'sub-151', 'sub-153', '

Unnamed: 0,participant_id,age,sex,group,scan_series,height,weight
0,sub-101,17,M,control,complete,1.778004,68.038864
1,sub-102,15,F,control,complete,1.625603,52.163129
2,sub-103,15,M,control,complete,1.651003,54.431091
3,sub-104,15,F,control,complete,1.625603,52.163129
4,sub-105,13,M,control,complete,1.524000,35.381000
...,...,...,...,...,...,...,...
106,sub-210,6,F,control,complete,,
111,sub-215,16,F,control,complete,,
112,sub-216,15,F,control,complete,,
113,sub-217,15,M,control,complete,,


## Plot demographics

This function plots the age and sex distribution of the subjects included in a pipeline analysis, according to the include list generated above. 

In [49]:
def plot_demographics(df):
    """
    This function plots the demographic information of participants, given a dataframe with the list of subjects to include in the analysis.
    """

    # Sort by sex
    df_M = df[df['sex'] == 'M']
    df_F = df[df['sex'] == 'F']

    # Round down age to nearest month 
    df['age'] = np.floor(df['age']) 

    # Create subplot
    fig = make_subplots(rows=1, cols=1)

    # Add histogram for female subjects
    fig.add_trace(go.Histogram(
        x=df_F['age'], 
        name='F', 
        marker=dict(color= "#D19D88"),
        opacity=1.0,
        legendgroup='F',
        ),
        row=1, col=1
    )

    # Add histogram for male subjects
    fig.add_trace(go.Histogram(
        x=df_M['age'], 
        name='M', 
        
        marker=dict(color="#5C8EA1"),
        opacity=1.0,
        legendgroup='M',
        ), 
        row=1, col=1
    )

    # Define age tick range
    tick_vals = list(range(6, 18)) 

    # Update layout
    fig.update_layout(
        width=900,
        height=500,
        font=dict(family='Arial', size=20, color='black'), 
        legend=dict(
            orientation="h", 
            yanchor="bottom", 
            y=1.0, 
            xanchor="center",  
            x=0.5,
        ),
        xaxis=dict(
            range=[5, 18],  # Set x-axis range from 6 to 17
        ),
        plot_bgcolor='white',
        barmode='stack',
        bargap=0.3,  
        xaxis_title='Age (years)',
        xaxis_title_font=dict(family='Arial', size=20, weight='bold'),
        yaxis_title='Number of Subjects',
        yaxis_title_font=dict(family='Arial', size=20, weight='bold'),
        xaxis_title_standoff=50, 
    )

    fig.update_xaxes(
        tickmode='array',
        tickvals=tick_vals,
        showgrid=False,
        gridwidth=1
    )

    fig.update_yaxes(
        showgrid=True,             # Horizontal grid lines
        gridcolor='lightgrey',
        gridwidth=1
    )

    # Set bin size to 1 year
    fig.update_traces(xbins=dict(size=1))

    fig.show()

In [50]:
# Plot demographics for included subjects in DWI analysis
plot_demographics(include_t2star_subjects)

# Create a dataframe for GM, WM and SC CSA (all subjects combined)

In [59]:
# Path to WM, GM and SC CSA (one file per subject)
CSA_base_folder = "../tables/WMGM_distribution/"
labels = ['WM', 'GM', 'SC']

WMGM_distribution_df = {}

for label in labels:
    label_folder = os.path.join(CSA_base_folder, label)
    label_dfs = []

    if not os.path.exists(label_folder):
        print(f"Folder not found: {label_folder}")
        continue
    
    for filename in os.listdir(label_folder):
        if filename.endswith(".csv"):
            if not any(sub in filename for sub in include_t2star_subjects['participant_id'].tolist()):
                continue  # Skip subjects not in the include list
            subject_path = os.path.join(label_folder, filename)
            df = pd.read_csv(subject_path)
            subject_id = filename.split("_")[0]  # Or use regex for more robust parsing
            df["participant_id"] = subject_id
            label_dfs.append(df)
    
    if label_dfs:
        WMGM_distribution_df[label] = pd.concat(label_dfs, ignore_index=True)
    else:
        print(f"Warning: No CSV files found for label {label}")
        WMGM_distribution_df[label] = pd.DataFrame()

# Add age and sex to DTI metric dataframe
for label in WMGM_distribution_df:
    WMGM_distribution_df[label] = WMGM_distribution_df[label].merge(include_t2star_subjects, on="participant_id", how="left")
                                                                    

In [60]:
# Save combined WM, GM and SC CSA dataframes to csv files

df_WM = WMGM_distribution_df['WM']
WMGM_distribution_df['WM'].to_csv('../tables/WMGM_distribution/WM/WM_CSA.csv', index=False)

df_GM = WMGM_distribution_df['GM']
WMGM_distribution_df['GM'].to_csv('../tables/WMGM_distribution/GM/GM_CSA.csv', index=False)

df_SC = WMGM_distribution_df['SC']
WMGM_distribution_df['SC'].to_csv('../tables/WMGM_distribution/SC/SC_CSA.csv', index=False)

In [61]:
df_GM

Unnamed: 0,Timestamp,SCT Version,Filename,Slice (I->S),VertLevel,DistancePMJ,MEAN(area),STD(area),MEAN(angle_AP),STD(angle_AP),...,MEAN(solidity),STD(solidity),SUM(length),participant_id,age,sex,group,scan_series,height,weight
0,2025-09-24 20:27:11,7.0,/Users/samuellestonge/Documents/datasets/phila...,5:7,5,,15.666670,0.311805,0.0,0.0,...,0.433810,0.025331,15.0,sub-137,10,M,control,incomplete,,
1,2025-09-24 20:27:11,7.0,/Users/samuellestonge/Documents/datasets/phila...,8:9,4,,15.375003,0.625000,0.0,0.0,...,0.426512,0.014747,10.0,sub-137,10,M,control,incomplete,,
2,2025-09-24 20:27:11,7.0,/Users/samuellestonge/Documents/datasets/phila...,10:12,3,,14.333336,0.824958,0.0,0.0,...,0.441475,0.019234,15.0,sub-137,10,M,control,incomplete,,
3,2025-09-24 20:27:11,7.0,/Users/samuellestonge/Documents/datasets/phila...,13:14,2,,12.375002,0.125000,0.0,0.0,...,0.399008,0.011414,10.0,sub-137,10,M,control,incomplete,,
4,2025-09-24 20:27:13,7.0,/Users/samuellestonge/Documents/datasets/phila...,5:14,2:5,,14.550003,1.336039,0.0,0.0,...,0.427690,0.024619,50.0,sub-137,10,M,control,incomplete,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2025-09-24 20:22:24,7.0,/Users/samuellestonge/Documents/datasets/phila...,3:5,5,,16.250000,0.889757,0.0,0.0,...,0.425051,0.026147,15.0,sub-112,16,F,control,complete,1.676,61.236
363,2025-09-24 20:22:24,7.0,/Users/samuellestonge/Documents/datasets/phila...,6:8,4,,16.083333,0.716860,0.0,0.0,...,0.404033,0.002977,15.0,sub-112,16,F,control,complete,1.676,61.236
364,2025-09-24 20:22:24,7.0,/Users/samuellestonge/Documents/datasets/phila...,9:11,3,,14.666667,1.389444,0.0,0.0,...,0.422542,0.010833,15.0,sub-112,16,F,control,complete,1.676,61.236
365,2025-09-24 20:22:24,7.0,/Users/samuellestonge/Documents/datasets/phila...,12:14,2,,11.833333,1.027402,0.0,0.0,...,0.412532,0.011195,15.0,sub-112,16,F,control,complete,1.676,61.236


# Merge WM and GM dataframes and compute WM/GM ratio

In [62]:
df_ratio = df_WM.merge(
    df_GM,
    on=['participant_id', 'VertLevel', 'age', 'sex'],  # adjust keys if needed
    suffixes=('_WM', '_GM')
)

# Compute WM/GM ratio
df_ratio['GM_WM_ratio'] = df_ratio['MEAN(area)_GM'] / df_ratio['MEAN(area)_WM'].values
df_ratio

Unnamed: 0,Timestamp_WM,SCT Version_WM,Filename_WM,Slice (I->S)_WM,VertLevel,DistancePMJ_WM,MEAN(area)_WM,STD(area)_WM,MEAN(angle_AP)_WM,STD(angle_AP)_WM,...,MEAN(orientation)_GM,STD(orientation)_GM,MEAN(solidity)_GM,STD(solidity)_GM,SUM(length)_GM,group_GM,scan_series_GM,height_GM,weight_GM,GM_WM_ratio
0,2025-09-24 20:29:00,7.0,/Users/samuellestonge/Documents/datasets/phila...,3:5,5,,59.500000,0.735980,0.0,0.0,...,5.480735,3.694444,0.410188,0.017214,15.0,control,complete,,,0.274510
1,2025-09-24 20:29:00,7.0,/Users/samuellestonge/Documents/datasets/phila...,6:8,4,,74.166667,24.118055,0.0,0.0,...,1.347337,1.179700,0.417984,0.012845,15.0,control,complete,,,0.212360
2,2025-09-24 20:29:00,7.0,/Users/samuellestonge/Documents/datasets/phila...,9:11,3,,88.666667,23.279178,0.0,0.0,...,2.161853,1.235671,0.397677,0.014541,15.0,control,complete,,,0.145677
3,2025-09-24 20:29:00,7.0,/Users/samuellestonge/Documents/datasets/phila...,12:14,2,,56.000000,2.557668,0.0,0.0,...,5.976770,2.414354,0.425852,0.024160,15.0,control,complete,,,0.215774
4,2025-09-24 20:29:02,7.0,/Users/samuellestonge/Documents/datasets/phila...,3:14,2:5,,69.583333,21.225251,0.0,0.0,...,3.741674,3.108266,0.412925,0.020550,60.0,control,complete,,,0.205090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2025-09-24 20:31:31,7.0,/Users/samuellestonge/Documents/datasets/phila...,0:1,5,,84.812500,26.062500,0.0,0.0,...,2.411908,1.654295,0.409784,0.035337,10.0,control,complete,,,0.172439
363,2025-09-24 20:31:31,7.0,/Users/samuellestonge/Documents/datasets/phila...,2:3,4,,74.125000,29.875000,0.0,0.0,...,0.775330,0.299752,0.454819,0.025542,10.0,control,complete,,,0.195616
364,2025-09-24 20:31:31,7.0,/Users/samuellestonge/Documents/datasets/phila...,4:6,3,,79.625000,22.720448,0.0,0.0,...,2.488560,1.685124,0.402976,0.024777,15.0,control,complete,,,0.151753
365,2025-09-24 20:31:31,7.0,/Users/samuellestonge/Documents/datasets/phila...,7:9,2,,52.000000,0.889757,0.0,0.0,...,1.399409,1.130690,0.395179,0.008149,15.0,control,complete,,,0.237179


In [71]:
df_ratio['sex'].value_counts()

sex
F    217
M    150
Name: count, dtype: int64

## Plot GM and WM CSA 

In [None]:
def plot_GMWM_ratio(df):
    
    df = df.copy()

    print(df['sex'].unique())
    print(df['sex'].dtype)

    # Vertebral levels to plot
    vert_levels = ['2', '3', '4', '5', '2:5']

    # Define color of markers 
    color = "#222020"

    # Color for confidence interval
    ci_color = 'rgba(34, 32, 32, 0.35)' 

    fig = make_subplots(
        rows=5,
        cols=1,
        shared_xaxes=False,
        shared_yaxes=False,
        vertical_spacing=0.05,
        horizontal_spacing=0.1
    )

    for row_idx, vert in enumerate(vert_levels, start=1):

        # Filder data according to the VertLevel
        data = df[df['VertLevel'] == vert].copy()

        x_range = np.linspace(data['age'].min(), data['age'].max(), 100)
        pred_df = pd.DataFrame({'age': x_range})

        ols_model = smf.ols(formula=f'GM_WM_ratio ~ age', data=data)
        ols_results = ols_model.fit()

        # Print OLS results
        print(ols_results.summary())

        pred = ols_results.get_prediction(pred_df)
        pred_summary = pred.summary_frame(alpha=0.05)

        y_fit = pred_summary['mean']
        ci_lower = pred_summary['mean_ci_lower']
        ci_upper = pred_summary['mean_ci_upper']

        # Scatter plot
        fig.add_trace(
            go.Scatter(
                x=data['age'],
                y=data['GM_WM_ratio'],
                mode='markers',
                marker=dict(color=color, size=11, symbol='circle', opacity=0.8),
                showlegend=False
            ),
            row=row_idx, col=1
        )

        # Fit line
        fig.add_trace(
            go.Scatter(
                x=x_range,
                y=y_fit,
                mode='lines',
                line=dict(color=color, width=2, dash='solid'),
                showlegend=False
            ),
            row=row_idx, col=1
        )

        # Confidence interval
        fig.add_trace(
            go.Scatter(
                x=np.concatenate([x_range, x_range[::-1]]),
                y=np.concatenate([ci_upper, ci_lower[::-1]]),
                fill='toself',
                fillcolor=ci_color,
                line=dict(color='rgba(255,255,255,0)'),
                hoverinfo='skip',
                showlegend=False,
            ),
            row=row_idx, col=1
        )

        # Add p-value as annotation
        p_value = ols_results.pvalues['age']
        significance = ''
        if p_value < 0.05:
            significance = '*'
            font_color='red'
        else:
            font_color='black'
        fig.add_annotation(
            x=0.7,  # relative within subplot (0–1)
            y=0.9,  # near the top
            xref="x domain",
            yref="y domain",
            text = f'p-Age = {p_value:.3f} <span style="color:{font_color}">{significance}</span>',
            showarrow=False,
            font=dict(family='Arial', size=16, color='black'),
            bgcolor='rgba(255, 255, 255, 0.7)',
            bordercolor='black',
            borderwidth=1,
            borderpad=4,
            row=row_idx, col=1
        )

        fig.update_xaxes(
                title_text='Age',
                title_font=dict(family='Arial', size=20, color='black', weight='bold'),
                tickfont=dict(family='Arial', size=18),
                tickvals=list(range(6, 18)),
                range=[5, 18],
                row=row_idx, col=1
            )
        
        fig.update_yaxes(
                title_text=f'GM / WM ratio in C{vert}',
                title_font=dict(family='Arial', size=20, color='black', weight='bold'),
                tickfont=dict(family='Arial', size=18),
                range=[0, 0.4],
                row=row_idx, col=1
            )

    # Update layout
    fig.update_layout(
        height=1800,
        width=700,
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff"
    )


    fig.show()


In [105]:
plot_GMWM_ratio(df_ratio)

['F' 'M']
object
                            OLS Regression Results                            
Dep. Variable:            GM_WM_ratio   R-squared:                       0.123
Model:                            OLS   Adj. R-squared:                  0.110
Method:                 Least Squares   F-statistic:                     9.647
Date:                Wed, 01 Oct 2025   Prob (F-statistic):            0.00275
Time:                        11:00:31   Log-Likelihood:                 136.13
No. Observations:                  71   AIC:                            -268.3
Df Residuals:                      69   BIC:                            -263.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2624      0.021   