# WM and GM distribution in the pediatric spinal cord

This jupyter notebook includes scripts to generate figures related to the white matter and gray matter distribution in the pediatric spinal cord.

In [1]:
import os
import pandas as pd
import json
import yaml
import re
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import webbrowser
import statsmodels.formula.api as smf

### Load config file to get path to dataset 

In [2]:
# Load config file
with open('../../config/config_preprocessing.yaml' , 'r') as file:
    config = yaml.safe_load(file)

# Get data path from config file
path_data = config['path_data']

### Get the `participants.tsv` file from the dataset

In [3]:
# Get path to participants.tsv file
participants_tsv = pd.read_csv(os.path.join(path_data, 'participants.tsv'), sep='\t')
participants_tsv

Unnamed: 0,participant_id,age,sex,group,scan_series,height,weight
0,sub-101,17,M,control,complete,1.778004,68.038864
1,sub-102,15,F,control,complete,1.625603,52.163129
2,sub-103,15,M,control,complete,1.651003,54.431091
3,sub-104,15,F,control,complete,1.625603,52.163129
4,sub-105,13,M,control,complete,1.524000,35.381000
...,...,...,...,...,...,...,...
110,sub-214,6,M,control,complete,,
111,sub-215,16,F,control,complete,,
112,sub-216,15,F,control,complete,,
113,sub-217,15,M,control,complete,,


## Dataframe of subjects included in the pipeline analysis

The following dataframe contains only the subjects that were included in this pipeline analysis.

In [4]:
def get_list_of_subjects_to_include(contrast, path_data, missing_data_subjects):
    """
    This function takes an image contrast (T2w, dwi, etc.), a path to a dataset, and a list of subjects with missing data,
    and returns a list of subjects to include in the analysis.

    The dataset needs to be in BIDS format, and the function will look for the participants.tsv file to get the list of subjects.
    The dataset should also contain an `exclude.yml` file that lists subjects to exclude from the analysis.
    """

    # Get the `participants.tsv` file and read it into a dataframe
    participants_tsv = pd.read_csv(os.path.join(path_data, 'participants.tsv'), sep='\t')

    # Get all subject IDs from the participants.tsv
    all_subjects = participants_tsv['participant_id'].tolist()

    # Get list of subjects to exclude from the analysis from the `exclude.yml` file
    with open(os.path.join(path_data, 'exclude.yml'), 'r') as file:
        exclude_yml = yaml.safe_load(file)

    exclude_t2star_key = exclude_yml.get(contrast, []) # Extract subjects under contrast key
    exclude_subjects = sorted(set(re.match(r"(sub-\d+)", entry).group(1) for entry in exclude_t2star_key if re.match(r"(sub-\d+)", entry))) # Extract the subject ID 

    # Add the list of subjects with missing data to the exclude_subjects list
    exclude_subjects.extend(missing_data_subjects)
    
    # Remove duplicates (if any), sort and print the list of subjects to exclude from the analysis
    exclude_subjects = sorted(set(exclude_subjects))
    print(f'subjects to exclude : {exclude_subjects}')

    # Compute the list of subjects to include in the analysis 
    include_subjects = [sub for sub in all_subjects if sub not in exclude_subjects]

    # Convert the list of included subjects to a dataframe
    include_subjects = participants_tsv[participants_tsv['participant_id'].isin(include_subjects)]

    return include_subjects

In [5]:
# List of subjects with missing t2star data or missing T2w rootlet segmentation
missing_t2star_subjects = [
                        "sub-109",
                        "sub-125",
                        "sub-136",
                        "sub-152",
                        "sub-159",
                        "sub-174",
                        "sub-198",
                        "sub-200",
                        "sub-205",
                        "sub-211",
                        "sub-213"]

missing_rootlets_subjects = ["sub-108", "sub-110", "sub-111", "sub-121", "sub-127", "sub-133", "sub-136", "sub-141", "sub-150", 
                             "sub-159", "sub-160", "sub-161", "sub-163", "sub-165", "sub-167", "sub-168", "sub-169", 
                             "sub-170", "sub-177", "sub-179", "sub-186", "sub-190", "sub-191", "sub-193", "sub-198", 
                             "sub-203", "sub-208", "sub-209", "sub-210", "sub-211", "sub-213", "sub-214"]

# Combine both lists of missing subjects
missing_subjects = missing_t2star_subjects + missing_rootlets_subjects
print(missing_subjects)

# Get the list of subjects to include in the analysis
include_t2star_subjects = get_list_of_subjects_to_include('t2starw', path_data, missing_subjects)
include_t2star_subjects.to_csv(os.path.join('../tables/WMGM_distribution/include_t2star_subjects.csv'), sep='\t', index=False)

print(include_t2star_subjects.shape[0], "subjects to include in the analysis")
print(f"\n list of subjects to include : \n {include_t2star_subjects['participant_id'].tolist()}")

include_t2star_subjects

['sub-109', 'sub-125', 'sub-136', 'sub-152', 'sub-159', 'sub-174', 'sub-198', 'sub-200', 'sub-205', 'sub-211', 'sub-213', 'sub-108', 'sub-110', 'sub-111', 'sub-121', 'sub-127', 'sub-133', 'sub-136', 'sub-141', 'sub-150', 'sub-159', 'sub-160', 'sub-161', 'sub-163', 'sub-165', 'sub-167', 'sub-168', 'sub-169', 'sub-170', 'sub-177', 'sub-179', 'sub-186', 'sub-190', 'sub-191', 'sub-193', 'sub-198', 'sub-203', 'sub-208', 'sub-209', 'sub-210', 'sub-211', 'sub-213', 'sub-214']
subjects to exclude : ['sub-106', 'sub-107', 'sub-108', 'sub-109', 'sub-110', 'sub-111', 'sub-120', 'sub-121', 'sub-124', 'sub-125', 'sub-127', 'sub-133', 'sub-136', 'sub-139', 'sub-141', 'sub-144', 'sub-150', 'sub-152', 'sub-154', 'sub-159', 'sub-160', 'sub-161', 'sub-163', 'sub-165', 'sub-167', 'sub-168', 'sub-169', 'sub-170', 'sub-171', 'sub-172', 'sub-174', 'sub-177', 'sub-179', 'sub-181', 'sub-186', 'sub-188', 'sub-189', 'sub-190', 'sub-191', 'sub-193', 'sub-194', 'sub-196', 'sub-198', 'sub-199', 'sub-200', 'sub-203

Unnamed: 0,participant_id,age,sex,group,scan_series,height,weight
0,sub-101,17,M,control,complete,1.778004,68.038864
1,sub-102,15,F,control,complete,1.625603,52.163129
2,sub-103,15,M,control,complete,1.651003,54.431091
3,sub-104,15,F,control,complete,1.625603,52.163129
4,sub-105,13,M,control,complete,1.524,35.381
11,sub-112,16,F,control,complete,1.676,61.236
12,sub-113,14,F,control,complete,1.676403,72.574788
13,sub-114,14,F,control,complete,1.574803,46.266428
14,sub-115,13,F,control,complete,1.753,72.576
15,sub-116,15,F,control,complete,1.676403,61.234978


## Plot demographics

This function plots the age and sex distribution of the subjects included in a pipeline analysis, according to the include list generated above. 

In [6]:
def plot_demographics(df):
    """
    This function plots the demographic information of participants, given a dataframe with the list of subjects to include in the analysis.
    """

    # Sort by sex
    df_M = df[df['sex'] == 'M']
    df_F = df[df['sex'] == 'F']

    # Round down age to nearest month 
    df['age'] = np.floor(df['age']) 

    # Create subplot
    fig = make_subplots(rows=1, cols=1)

    # Add histogram for female subjects
    fig.add_trace(go.Histogram(
        x=df_F['age'], 
        name='F', 
        marker=dict(color= "#D19D88"),
        opacity=1.0,
        legendgroup='F',
        ),
        row=1, col=1
    )

    # Add histogram for male subjects
    fig.add_trace(go.Histogram(
        x=df_M['age'], 
        name='M', 
        
        marker=dict(color="#5C8EA1"),
        opacity=1.0,
        legendgroup='M',
        ), 
        row=1, col=1
    )

    # Define age tick range
    tick_vals = list(range(6, 18)) 

    # Update layout
    fig.update_layout(
        width=900,
        height=500,
        font=dict(family='Arial', size=20, color='black'), 
        legend=dict(
            orientation="h", 
            yanchor="bottom", 
            y=1.0, 
            xanchor="center",  
            x=0.5,
        ),
        xaxis=dict(
            range=[5, 18],  # Set x-axis range from 6 to 17
        ),
        plot_bgcolor='white',
        barmode='stack',
        bargap=0.3,  
        xaxis_title='Age (years)',
        xaxis_title_font=dict(family='Arial', size=20, weight='bold'),
        yaxis_title='Number of Subjects',
        yaxis_title_font=dict(family='Arial', size=20, weight='bold'),
        xaxis_title_standoff=50, 
    )

    fig.update_xaxes(
        tickmode='array',
        tickvals=tick_vals,
        showgrid=False,
        gridwidth=1
    )

    fig.update_yaxes(
        showgrid=True,             # Horizontal grid lines
        gridcolor='lightgrey',
        gridwidth=1
    )

    # Set bin size to 1 year
    fig.update_traces(xbins=dict(size=1))

    fig.show()

In [7]:
# Plot demographics for included subjects in DWI analysis
plot_demographics(include_t2star_subjects)

# Create a dataframe for GM, WM and SC CSA (all subjects combined)

In [8]:
# Path to WM, GM and SC CSA (one file per subject)
CSA_base_folder = "../tables/WMGM_distribution/"
labels = ['WM', 'GM', 'SC']

WMGM_distribution_df = {}

for label in labels:
    label_folder = os.path.join(CSA_base_folder, label)
    label_dfs = []

    if not os.path.exists(label_folder):
        print(f"Folder not found: {label_folder}")
        continue
    
    for filename in os.listdir(label_folder):
        if filename.endswith(".csv"):
            if not any(sub in filename for sub in include_t2star_subjects['participant_id'].tolist()):
                continue  # Skip subjects not in the include list
            subject_path = os.path.join(label_folder, filename)
            df = pd.read_csv(subject_path)
            subject_id = filename.split("_")[0]  # Or use regex for more robust parsing
            df["participant_id"] = subject_id
            label_dfs.append(df)
    
    if label_dfs:
        WMGM_distribution_df[label] = pd.concat(label_dfs, ignore_index=True)
    else:
        print(f"Warning: No CSV files found for label {label}")
        WMGM_distribution_df[label] = pd.DataFrame()

# Add age and sex to DTI metric dataframe
for label in WMGM_distribution_df:
    WMGM_distribution_df[label] = WMGM_distribution_df[label].merge(include_t2star_subjects, on="participant_id", how="left")
                                                                    

In [9]:
# Save combined WM, GM and SC CSA dataframes to csv files

df_WM = WMGM_distribution_df['WM']
#WMGM_distribution_df['WM'].to_csv('../tables/WMGM_distribution/WM/WM_CSA.csv', index=False)

df_GM = WMGM_distribution_df['GM']
#WMGM_distribution_df['GM'].to_csv('../tables/WMGM_distribution/GM/GM_CSA.csv', index=False)

df_SC = WMGM_distribution_df['SC']
#WMGM_distribution_df['SC'].to_csv('../tables/WMGM_distribution/SC/SC_CSA.csv', index=False)

In [10]:
df_SC

Unnamed: 0,Timestamp,SCT Version,Filename,Slice (I->S),VertLevel,DistancePMJ,MEAN(area),STD(area),MEAN(angle_AP),STD(angle_AP),...,MEAN(solidity),STD(solidity),SUM(length),participant_id,age,sex,group,scan_series,height,weight
0,2025-10-21 18:57:50,7.1,/Users/samuellestonge/Documents/datasets/phila...,158:172,7,,69.370090,4.483451,-1.516983,0.175995,...,0.973712,0.014315,12.322181,sub-101,17,M,control,complete,1.778004,68.038864
1,2025-10-21 18:57:50,7.1,/Users/samuellestonge/Documents/datasets/phila...,176:189,6,,78.822287,3.277405,-1.996028,0.057155,...,0.981792,0.006869,11.405873,sub-101,17,M,control,complete,1.778004,68.038864
2,2025-10-21 18:57:50,7.1,/Users/samuellestonge/Documents/datasets/phila...,195:206,5,,77.541091,2.937006,-2.002646,0.047445,...,0.980924,0.009727,9.685968,sub-101,17,M,control,complete,1.778004,68.038864
3,2025-10-21 18:57:50,7.1,/Users/samuellestonge/Documents/datasets/phila...,214:222,4,,72.110930,1.216274,-1.535908,0.104047,...,0.976000,0.003300,7.213688,sub-101,17,M,control,complete,1.778004,68.038864
4,2025-10-21 18:57:50,7.1,/Users/samuellestonge/Documents/datasets/phila...,231:244,3,,72.008648,4.509739,-0.698160,0.129459,...,0.964519,0.004111,5.602962,sub-101,17,M,control,complete,1.778004,68.038864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,2025-10-21 19:05:43,7.1,/Users/samuellestonge/Documents/datasets/phila...,108:117,6,,76.094338,1.757053,-1.980506,0.053378,...,0.973451,0.012393,8.142615,sub-192,17,F,control,complete,,
336,2025-10-21 19:05:43,7.1,/Users/samuellestonge/Documents/datasets/phila...,126:136,5,,76.986191,1.548856,-1.741619,0.023284,...,0.970059,0.006990,8.851396,sub-192,17,F,control,complete,,
337,2025-10-21 19:05:43,7.1,/Users/samuellestonge/Documents/datasets/phila...,146:154,4,,68.083830,0.949117,-1.711499,0.011117,...,0.975600,0.005701,7.204270,sub-192,17,F,control,complete,,
338,2025-10-21 19:05:43,7.1,/Users/samuellestonge/Documents/datasets/phila...,164:181,3,,69.377841,1.676102,-1.967309,0.093355,...,0.968500,0.006807,14.493476,sub-192,17,F,control,complete,,


# Merge WM and GM dataframes and compute WM/GM ratio, GM/SC ratio and WM/SC ratio

In [11]:
# Merge WM and GM dataframes
df_GM_WM = df_WM.merge(
    df_GM,
    on=['participant_id', 'VertLevel', 'age', 'sex'], 
    suffixes=('_WM', '_GM')
)

# Add "_SC" suffix to df_SC columns except for the merge keys
df_SC = df_SC.rename(columns={col: f"{col}_SC" for col in df_SC.columns if col not in ['participant_id', 'VertLevel', 'age', 'sex']})

# Merge SC dataframe to the dataframe with WM and GM
df_GM_WM_SC = df_GM_WM.merge(
    df_SC, 
    on=['participant_id', 'VertLevel', 'age', 'sex']
)

df_GM_WM_SC.columns

Index(['Timestamp_WM', 'SCT Version_WM', 'Filename_WM', 'Slice (I->S)_WM',
       'VertLevel', 'DistancePMJ_WM', 'MEAN(area)_WM', 'STD(area)_WM',
       'MEAN(angle_AP)_WM', 'STD(angle_AP)_WM', 'MEAN(angle_RL)_WM',
       'STD(angle_RL)_WM', 'MEAN(diameter_AP)_WM', 'STD(diameter_AP)_WM',
       'MEAN(diameter_RL)_WM', 'STD(diameter_RL)_WM', 'MEAN(eccentricity)_WM',
       'STD(eccentricity)_WM', 'MEAN(orientation)_WM', 'STD(orientation)_WM',
       'MEAN(solidity)_WM', 'STD(solidity)_WM', 'SUM(length)_WM',
       'participant_id', 'age', 'sex', 'group_WM', 'scan_series_WM',
       'height_WM', 'weight_WM', 'Timestamp_GM', 'SCT Version_GM',
       'Filename_GM', 'Slice (I->S)_GM', 'DistancePMJ_GM', 'MEAN(area)_GM',
       'STD(area)_GM', 'MEAN(angle_AP)_GM', 'STD(angle_AP)_GM',
       'MEAN(angle_RL)_GM', 'STD(angle_RL)_GM', 'MEAN(diameter_AP)_GM',
       'STD(diameter_AP)_GM', 'MEAN(diameter_RL)_GM', 'STD(diameter_RL)_GM',
       'MEAN(eccentricity)_GM', 'STD(eccentricity)_GM', 'MEAN(o

In [12]:
df_GM_WM_SC

Unnamed: 0,Timestamp_WM,SCT Version_WM,Filename_WM,Slice (I->S)_WM,VertLevel,DistancePMJ_WM,MEAN(area)_WM,STD(area)_WM,MEAN(angle_AP)_WM,STD(angle_AP)_WM,...,STD(eccentricity)_SC,MEAN(orientation)_SC,STD(orientation)_SC,MEAN(solidity)_SC,STD(solidity)_SC,SUM(length)_SC,group_SC,scan_series_SC,height_SC,weight_SC
0,2025-10-21 19:01:41,7.1,/Users/samuellestonge/Documents/datasets/phila...,114:124,7,,57.359927,16.835112,-1.554322,0.174290,...,0.013335,2.195348,0.976706,0.968445,0.009077,8.973750,control,complete,,
1,2025-10-21 19:01:41,7.1,/Users/samuellestonge/Documents/datasets/phila...,130:138,6,,54.304006,1.747918,-0.894976,0.084020,...,0.012280,1.992415,1.013353,0.968386,0.006910,7.461898,control,complete,,
2,2025-10-21 19:01:41,7.1,/Users/samuellestonge/Documents/datasets/phila...,147:151,5,,53.670129,0.900087,-0.573547,0.014044,...,0.008633,1.534576,0.476434,0.961524,0.011130,4.094793,control,complete,,
3,2025-10-21 19:01:41,7.1,/Users/samuellestonge/Documents/datasets/phila...,164:168,4,,53.331198,1.623985,-0.623219,0.022287,...,0.017500,2.665582,0.564826,0.969022,0.009649,4.036784,control,complete,,
4,2025-10-21 19:01:41,7.1,/Users/samuellestonge/Documents/datasets/phila...,175:186,3,,52.287060,1.302376,-1.017254,0.129980,...,0.009893,3.820344,1.223823,0.971038,0.009334,9.652838,control,complete,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,2025-10-21 19:03:03,7.1,/Users/samuellestonge/Documents/datasets/phila...,136:145,6,,54.415426,17.285552,0.814531,0.323533,...,0.015010,2.203333,1.491076,0.969916,0.015850,8.529766,control,complete,,
336,2025-10-21 19:03:03,7.1,/Users/samuellestonge/Documents/datasets/phila...,154:161,5,,45.239229,3.115101,2.185994,0.110701,...,0.026795,2.563631,0.738559,0.969426,0.012983,6.550178,control,complete,,
337,2025-10-21 19:03:03,7.1,/Users/samuellestonge/Documents/datasets/phila...,169:175,4,,74.044074,31.072072,2.489493,0.014765,...,0.012494,1.312104,0.873537,0.965048,0.009042,5.638095,control,complete,,
338,2025-10-21 19:03:03,7.1,/Users/samuellestonge/Documents/datasets/phila...,180:195,3,,47.305774,1.655541,1.900354,0.302726,...,0.007425,0.643052,0.501306,0.976514,0.007322,12.813655,control,complete,,


In [13]:
# Compute WM/GM ratio
df_GM_WM_SC['GM_WM_ratio'] = df_GM_WM_SC['MEAN(area)_GM'] / df_GM_WM_SC['MEAN(area)_WM'].values

# Compute GM/SC ratio
df_GM_WM_SC['GM_SC_ratio'] = df_GM_WM_SC['MEAN(area)_GM'] / df_GM_WM_SC['MEAN(area)_SC'].values

# Compute WM/SC ratio
df_GM_WM_SC['WM_SC_ratio'] = df_GM_WM_SC['MEAN(area)_WM'] / df_GM_WM_SC['MEAN(area)_SC'].values

## Plot GM, WM and SC CSA with age using spinal levels

In [14]:
def plot_CSA_all_labels(df):
    """
    Plots CSA (WM, GM, SC) across vertebral levels as a function of age.
    Fits OLS regressions for each label and displays confidence intervals and p-values.

    Args:
        df (pd.DataFrame): DataFrame containing columns:
            'age', 'sex', 'VertLevel', and 'MEAN(area)_WM', 'MEAN(area)_GM', 'MEAN(area)_SC'.
    """

    df = df.copy()

    vert_levels = ['3', '4', '5', '6', '7', '3:7']
    labels = ['WM', 'GM', 'SC']

    colors = {
    'WM': 'rgba(31, 119, 180, 1)',   # blue
    'GM': 'rgba(255, 127, 14, 1)',   # orange
    'SC': 'rgba(44, 160, 44, 1)',    # green
    }
    
    ci_colors = {
        'WM': 'rgba(31, 119, 180, 0.2)',
        'GM': 'rgba(255, 127, 14, 0.2)',
        'SC': 'rgba(44, 160, 44, 0.2)',
    }

    fig = make_subplots(
        rows=len(vert_levels),
        cols=1,
        shared_xaxes=False,
        shared_yaxes=False,
        vertical_spacing=0.03,
        horizontal_spacing=0.1
    )

    for row_idx, vert in enumerate(vert_levels, start=1):
        data = df[df['VertLevel'] == vert].copy()
        if data.empty:
            continue

        x_range = np.linspace(data['age'].min(), data['age'].max(), 100)

        # Get the metric column name for each label
        for label in labels:
            metric_col = f'MEAN(area)_{label}'
            if metric_col not in data.columns:
                continue

            # Fit OLS model
            ols_model = smf.ols(formula=f'Q("{metric_col}") ~ age', data=data).fit()
            pred_df = pd.DataFrame({'age': x_range})
            pred = ols_model.get_prediction(pred_df).summary_frame(alpha=0.05)

            y_fit = pred['mean']
            ci_lower = pred['mean_ci_lower']
            ci_upper = pred['mean_ci_upper']

            # Scatter data
            fig.add_trace(
                go.Scatter(
                    x=data['age'],
                    y=data[metric_col],
                    mode='markers',
                    marker=dict(color=colors[label], size=9, opacity=0.6),
                    name=f'{label} (C{vert})',
                    showlegend=(row_idx == 1)  # show legend only for first subplot
                ),
                row=row_idx, col=1
            )

            # Regression line
            fig.add_trace(
                go.Scatter(
                    x=x_range,
                    y=y_fit,
                    mode='lines',
                    line=dict(color=colors[label], width=2),
                    name=f'{label} fit',
                    showlegend=False
                ),
                row=row_idx, col=1
            )

            # Confidence interval
            fig.add_trace(
                go.Scatter(
                    x=np.concatenate([x_range, x_range[::-1]]),
                    y=np.concatenate([ci_upper, ci_lower[::-1]]),
                    fill='toself',
                    fillcolor=ci_colors[label],
                    line=dict(color='rgba(255,255,255,0)'),
                    hoverinfo='skip',
                    showlegend=False
                ),
                row=row_idx, col=1
            )

            # Store p-values for each label 
            pvals = {}
            for label in labels:
                metric_col = f'MEAN(area)_{label}'
                if metric_col not in data.columns:
                    continue

                ols_model = smf.ols(formula=f'Q("{metric_col}") ~ age', data=data).fit()
                pvals[label] = ols_model.pvalues['age']

            # Build vertical text (stacked)
            text_lines = []
            for label in labels:
                p_value = pvals[label]
                significance = '*' if p_value < 0.05 else ''
                color = 'red' if p_value < 0.05 else 'black'
                text_lines.append(f"<span style='color:{color};'>p-{label} = {p_value:.3f}{significance}</span>")

            # Combine with HTML line breaks for vertical layout
            text_combined = "<br>".join(text_lines)

            # Add annotation box with all p-values
            fig.add_annotation(
            x=0.1,  
            y=1.0,  
            xref="x domain",
            yref="y domain",
            text=text_combined,
            align='left',  # align text within box
            showarrow=False,
            font=dict(size=12, family='Arial', color='black'),
            bgcolor='rgba(255,255,255,0.75)',
            bordercolor='black',
            borderwidth=1.5,
            borderpad=6,
            row=row_idx, col=1
        )


        # Axes
        fig.update_xaxes(
            title_text='Age',
            title_font=dict(size=18, family='Arial', color='black', weight='bold'),
            tickfont=dict(size=16, family='Arial', color='black'),
            tickvals=list(range(6, 18)),
            range=[5, 18],  
            row=row_idx, col=1
        )

        fig.update_yaxes(
            title_text=f'CSA in C{vert} (mm²)',
            title_font=dict(size=20, family='Arial', color='black', weight='bold'),
            tickfont=dict(size=16, family='Arial', color='black'),
            #range=[0, 20],
            row=row_idx, col=1
        )

    # Layout
    fig.update_layout(
        height=2000,
        width=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=14)
        )
    )

    fig.show()


In [15]:
plot_CSA_all_labels(df_GM_WM_SC)

## Plot ratios between GM, WM and SC CSA using spinal levels

In [33]:
def plot_ratio(df, label_1, label_2):

    """
    This function plots the ratio between two metrics (e.g., GM and WM) across different vertebral levels.

    Args:
        df (pd.DataFrame): DataFrame containing the data to plot. Must include columns 'age', 'sex', 'VertLevel', and the ratio column. 
        The ratio column should be named as '{label_1}_{label_2}_ratio'.
        label_1 (str): The first label in the ratio (e.g., 'GM').
        label_2 (str): The second label in the ratio (e.g., 'WM').
    """
    
    df = df.copy()

    # Vertebral levels to plot
    vert_levels = ['3', '4', '5', '6', '7', '3:7']

    # Define color of markers 
    color = "#222020"

    # Color for confidence interval
    ci_color = 'rgba(34, 32, 32, 0.35)' 

    fig = make_subplots(
        rows=6,
        cols=1,
        shared_xaxes=False,
        shared_yaxes=False,
        vertical_spacing=0.05,
        horizontal_spacing=0.1
    )

    for row_idx, vert in enumerate(vert_levels, start=1):

        # Filder data according to the VertLevel
        data = df[df['VertLevel'] == vert].copy()

        print(data)

        x_range = np.linspace(data['age'].min(), data['age'].max(), 100)
        pred_df = pd.DataFrame({'age': x_range})

        ratio_name = f'{label_1}_{label_2}_ratio'
        ols_formula = f'{ratio_name} ~ age'

        ols_model = smf.ols(formula=ols_formula, data=data)
        ols_results = ols_model.fit()

        # Print OLS results
        print(ols_results.summary())

        pred = ols_results.get_prediction(pred_df)
        pred_summary = pred.summary_frame(alpha=0.05)

        y_fit = pred_summary['mean']
        ci_lower = pred_summary['mean_ci_lower']
        ci_upper = pred_summary['mean_ci_upper']

        # Scatter plot
        fig.add_trace(
            go.Scatter(
                x=data['age'],
                y=data[ratio_name],
                mode='markers',
                marker=dict(color=color, size=11, symbol='circle', opacity=0.8),
                showlegend=False
            ),
            row=row_idx, col=1
        )

        # Fit line
        fig.add_trace(
            go.Scatter(
                x=x_range,
                y=y_fit,
                mode='lines',
                line=dict(color=color, width=2, dash='solid'),
                showlegend=False
            ),
            row=row_idx, col=1
        )

        # Confidence interval
        fig.add_trace(
            go.Scatter(
                x=np.concatenate([x_range, x_range[::-1]]),
                y=np.concatenate([ci_upper, ci_lower[::-1]]),
                fill='toself',
                fillcolor=ci_color,
                line=dict(color='rgba(255,255,255,0)'),
                hoverinfo='skip',
                showlegend=False,
            ),
            row=row_idx, col=1
        )

        # Add p-value as annotation
        p_value = ols_results.pvalues['age']
        significance = ''
        if p_value < 0.05:
            significance = '*'
            font_color='red'
        else:
            font_color='black'
            
        fig.add_annotation(
            x=0.85,  # relative within subplot (0–1)
            y=0.95,  # near the top
            xref="x domain",
            yref="y domain",
            text = f'p-Age = {p_value:.3f} <span style="color:{font_color}">{significance}</span>',
            showarrow=False,
            font=dict(family='Arial', size=16, color='black'),
            bgcolor='rgba(255, 255, 255, 0.7)',
            bordercolor='black',
            borderwidth=1,
            borderpad=4,
            row=row_idx, col=1
        )

        fig.update_xaxes(
                title_text='Age',
                title_font=dict(family='Arial', size=20, color='black', weight='bold'),
                tickfont=dict(family='Arial', size=18),
                tickvals=list(range(6, 18)),
                range=[5, 18],
                row=row_idx, col=1
            )
        
        fig.update_yaxes(
                title_text=f'{label_1} / {label_2} ratio in C{vert}',
                title_font=dict(family='Arial', size=20, color='black', weight='bold'),
                tickfont=dict(family='Arial', size=18),
                range=[0.8, 1.2],
                row=row_idx, col=1
            )

    # Update layout
    fig.update_layout(
        height=1800,
        width=600,
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff"
    )


    fig.show()

## Plot GM/WM ratio

In [21]:
plot_ratio(df_GM_WM_SC, 'GM', 'WM')

            Timestamp_WM  SCT Version_WM  \
4    2025-10-21 19:01:41             7.1   
10   2025-10-21 18:58:47             7.1   
16   2025-10-21 18:58:03             7.1   
22   2025-10-21 19:00:55             7.1   
28   2025-10-21 19:02:29             7.1   
34   2025-10-21 19:00:13             7.1   
40   2025-10-21 19:05:00             7.1   
46   2025-10-21 19:00:18             7.1   
52   2025-10-21 19:02:22             7.1   
58   2025-10-21 19:03:05             7.1   
64   2025-10-21 19:04:30             7.1   
70   2025-10-21 19:04:25             7.1   
76   2025-10-21 19:01:36             7.1   
82   2025-10-21 18:59:30             7.1   
88   2025-10-21 19:01:39             7.1   
94   2025-10-21 19:01:47             7.1   
100  2025-10-21 19:07:33             7.1   
106  2025-10-21 19:04:35             7.1   
112  2025-10-21 18:58:03             7.1   
118  2025-10-21 19:05:14             7.1   
124  2025-10-21 18:59:30             7.1   
130  2025-10-21 18:58:04        

## Plot GM/SC ratio

In [32]:
plot_ratio(df_GM_WM_SC, 'GM', 'SC')

            Timestamp_WM  SCT Version_WM  \
4    2025-10-21 19:01:41             7.1   
10   2025-10-21 18:58:47             7.1   
16   2025-10-21 18:58:03             7.1   
22   2025-10-21 19:00:55             7.1   
28   2025-10-21 19:02:29             7.1   
34   2025-10-21 19:00:13             7.1   
40   2025-10-21 19:05:00             7.1   
46   2025-10-21 19:00:18             7.1   
52   2025-10-21 19:02:22             7.1   
58   2025-10-21 19:03:05             7.1   
64   2025-10-21 19:04:30             7.1   
70   2025-10-21 19:04:25             7.1   
76   2025-10-21 19:01:36             7.1   
82   2025-10-21 18:59:30             7.1   
88   2025-10-21 19:01:39             7.1   
94   2025-10-21 19:01:47             7.1   
100  2025-10-21 19:07:33             7.1   
106  2025-10-21 19:04:35             7.1   
112  2025-10-21 18:58:03             7.1   
118  2025-10-21 19:05:14             7.1   
124  2025-10-21 18:59:30             7.1   
130  2025-10-21 18:58:04        

## Plot WM/SC ratio

In [34]:
plot_ratio(df_GM_WM_SC, 'WM', 'SC')

            Timestamp_WM  SCT Version_WM  \
4    2025-10-21 19:01:41             7.1   
10   2025-10-21 18:58:47             7.1   
16   2025-10-21 18:58:03             7.1   
22   2025-10-21 19:00:55             7.1   
28   2025-10-21 19:02:29             7.1   
34   2025-10-21 19:00:13             7.1   
40   2025-10-21 19:05:00             7.1   
46   2025-10-21 19:00:18             7.1   
52   2025-10-21 19:02:22             7.1   
58   2025-10-21 19:03:05             7.1   
64   2025-10-21 19:04:30             7.1   
70   2025-10-21 19:04:25             7.1   
76   2025-10-21 19:01:36             7.1   
82   2025-10-21 18:59:30             7.1   
88   2025-10-21 19:01:39             7.1   
94   2025-10-21 19:01:47             7.1   
100  2025-10-21 19:07:33             7.1   
106  2025-10-21 19:04:35             7.1   
112  2025-10-21 18:58:03             7.1   
118  2025-10-21 19:05:14             7.1   
124  2025-10-21 18:59:30             7.1   
130  2025-10-21 18:58:04        