## Reference vs Moving Site Scatter

Set `REFERENCE_FILE` (reference site, HC only) and `MOVING_FILE` (moving site, HC + patients) in the plotting cell below. Run the setup cells before it, then run the plotting cell to build the comparison plots for the bundle present in those files.


In [None]:
# `os` is imported here because this cell runs before the notebook's main
# import cell; without it the makedirs call below raises NameError on a
# fresh kernel.
import os

# Default MAINFOLDER (override if needed)
MAINFOLDER = 'RESULTS/DISTRIBUTION_ANALYSIS'
# Create the folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(MAINFOLDER, exist_ok=True)


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
from matplotlib import patches as mpatches

# Value of the `disease` column that marks HC rows.
HC_LABEL = 'HC'
# Root folder for outputs: reuse MAINFOLDER when it is already defined as a
# global, otherwise fall back to the default location.
RESULTS_ROOT = globals().get('MAINFOLDER', 'RESULTS/DISTRIBUTION_ANALYSIS')


def _half_marker(fill):
    """Return the path of a circle marker drawn with fillstyle ``fill``.

    The marker's own transform is applied to the path before it is returned,
    so the result can be handed directly to ``scatter(marker=...)``.
    """
    style = MarkerStyle('o', fillstyle=fill)
    outline = style.get_path()
    return outline.transformed(style.get_transform())


# Marker paths built once and reused, so a single point can be drawn as two
# differently coloured halves.
LEFT_HALF_MARKER, RIGHT_HALF_MARKER = map(_half_marker, ('left', 'right'))


def _kde(values, y_grid, bandwidth=None):
    """Evaluate a Gaussian kernel density estimate of ``values`` on ``y_grid``.

    Parameters
    ----------
    values : object exposing ``dropna()`` (e.g. a pandas Series)
        Samples; missing entries are dropped before estimation.
    y_grid : numpy.ndarray
        1-D array of points at which the density is evaluated.
    bandwidth : float, optional
        Kernel width. When omitted it is derived from the data: 10% of the
        sample magnitude (at least 1e-3) for a single sample, otherwise
        ``1.06 * std * n ** (-1/5)``.

    Returns
    -------
    numpy.ndarray
        Density at each grid point; all zeros when no samples remain.
    """
    samples = np.asarray(values.dropna(), dtype=float)
    n_samples = samples.size
    if n_samples == 0:
        return np.zeros_like(y_grid)

    if bandwidth is None:
        if n_samples == 1:
            # A single point has no spread; scale the kernel with its magnitude.
            magnitude = max(abs(samples[0]), 1.0)
            bandwidth = max(magnitude * 0.1, 1e-3)
        else:
            # Sample std first, then population std, then 1.0 as a last resort
            # when the spread is zero or not finite.
            sigma = np.std(samples, ddof=1)
            if not np.isfinite(sigma) or sigma == 0:
                sigma = np.std(samples)
            if not np.isfinite(sigma) or sigma == 0:
                sigma = 1.0
            bandwidth = 1.06 * sigma * n_samples ** (-1 / 5)

    # Guard against unusable widths, including ones supplied by the caller.
    if not (np.isfinite(bandwidth) and bandwidth > 0):
        bandwidth = 1e-3

    # Rows index grid points, columns index samples.
    z = (y_grid[:, None] - samples[None, :]) / bandwidth
    scale = 1 / (np.sqrt(2 * np.pi) * bandwidth * n_samples)
    return scale * np.exp(-0.5 * z ** 2).sum(axis=1)


def _add_density(ax, y, density, *, facecolor, edgecolor, hatch=None, alpha=0.3, linewidth=1.2):
    """Draw a sideways density profile on ``ax``: a filled area plus an outline.

    The area between x=0 and ``density`` is filled along ``y`` and the curve
    itself is traced in ``edgecolor``. Nothing is drawn when ``density`` sums
    to zero.

    Parameters
    ----------
    ax : axes object providing ``fill_betweenx`` and ``plot``
    y : positions along the vertical axis
    density : array whose values give the horizontal extent at each ``y``
    facecolor, edgecolor, hatch, alpha : styling of the filled area
    linewidth : width of the outline curve
    """
    if density.sum() == 0:
        return

    # The filled region's edge width is 0.8 when hatched and 0 otherwise.
    fill_style = dict(
        facecolor=facecolor,
        edgecolor=edgecolor,
        hatch=hatch,
        alpha=alpha,
        linewidth=0.8 if hatch else 0.0,
    )
    ax.fill_betweenx(y, 0, density, **fill_style)
    ax.plot(density, y, color=edgecolor, linewidth=linewidth)



In [None]:
# Input CSVs: replace both placeholders with real paths before running.
REFERENCE_FILE = 'PATH_TO_REFERENCE_FILE'
MOVING_FILE = 'PATH_TO_MOVING_FILE'

# Figures from this cell are written beneath the shared results root.
OUTPUT_DIR = os.path.join(RESULTS_ROOT, 'COMPILATION_SITE_SCATTERS_SINGLE')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Stop early while either path still contains its placeholder text.
if not ('PATH_TO_REFERENCE_FILE' not in REFERENCE_FILE and 'PATH_TO_MOVING_FILE' not in MOVING_FILE):
    raise ValueError('Update REFERENCE_FILE and MOVING_FILE before running this cell.')

# Load the tables for both sites.
reference_df = pd.read_csv(REFERENCE_FILE)
moving_df = pd.read_csv(MOVING_FILE)

# Keep only rows for the first non-missing bundle named in the reference file;
# without a `metric_bundle` column the column name doubles as the label.
if 'metric_bundle' not in reference_df.columns:
    bundle_value = 'metric_bundle'
else:
    bundle_value = reference_df['metric_bundle'].dropna().iloc[0]
    reference_df = reference_df.loc[reference_df['metric_bundle'].eq(bundle_value)]
    moving_df = moving_df.loc[moving_df['metric_bundle'].eq(bundle_value)]

# Display names, taken from the first non-missing entry of each column when present.
if 'metric' in reference_df.columns:
    metric_name = reference_df['metric'].dropna().iloc[0]
else:
    metric_name = 'metric'

if 'old_site' in reference_df.columns:
    ref_site = reference_df['old_site'].dropna().iloc[0]
else:
    ref_site = 'Reference'

if 'old_site' in moving_df.columns:
    moving_site = moving_df['old_site'].dropna().iloc[0]
else:
    moving_site = 'Moving'

# Split rows by the `disease` column: HC rows versus everything else.
moving_patients = moving_df.loc[moving_df['disease'].ne(HC_LABEL)]
moving_hc = moving_df.loc[moving_df['disease'].eq(HC_LABEL)]
ref_hc = reference_df.loc[reference_df['disease'].eq(HC_LABEL)]

# Title label: sorted, comma-joined patient labels, or the HC label when there are none.
patient_labels = moving_patients['disease'].dropna().unique()
disease_label = ','.join(sorted(patient_labels)) if patient_labels.size else HC_LABEL

# (column name, axis label) pairs, in plotting order.
value_specs = tuple({
    'mean': 'Mean',
    'mean_no_cov': 'Mean (no covariates)',
}.items())

# One figure per value column: value-vs-age scatter on the left and the
# matching value densities, sharing the y axis, on the right.
for value_col, ylabel in value_specs:
    # Skip columns that are missing from either input table.
    if value_col not in reference_df.columns or value_col not in moving_df.columns:
        continue

    ref_values = reference_df[value_col].dropna()
    moving_values = moving_df[value_col].dropna()
    if ref_values.empty and moving_values.empty:
        continue

    # Common y-range across both sites for the density grid.
    combined = pd.concat([ref_values, moving_values], axis=0)
    y_min, y_max = combined.min(), combined.max()
    if not np.isfinite(y_min) or not np.isfinite(y_max):
        continue
    if y_min == y_max:
        # All values identical: widen the range so the grid is not a single point.
        span = max(abs(y_min), 1.0)
        y_min -= 0.1 * span
        y_max += 0.1 * span
    y_grid = np.linspace(y_min, y_max, 400)

    # NOTE(review): the reference density uses every reference row, while the
    # scatter below uses HC rows only; these agree when the reference file is
    # HC-only — confirm against the input data.
    kde_ref = _kde(ref_values, y_grid)
    kde_move_all = _kde(moving_values, y_grid)
    kde_move_hc = _kde(moving_hc[value_col], y_grid)
    kde_move_patients = _kde(moving_patients[value_col], y_grid)

    fig = plt.figure(figsize=(13, 6))
    gs = fig.add_gridspec(1, 2, width_ratios=[4, 1.3], wspace=0.08)
    ax = fig.add_subplot(gs[0])
    ax_dist = fig.add_subplot(gs[1], sharey=ax)

    ax.scatter(ref_hc['age'], ref_hc[value_col], color='royalblue', alpha=0.6, label=f'{ref_site} HC')
    ax.scatter(moving_hc['age'], moving_hc[value_col], color='mediumseagreen', alpha=0.6, label=f'{moving_site} HC')
    if not moving_patients.empty:
        # Each patient point is drawn twice, once per half marker, giving a
        # two-coloured dot; only the second call carries the legend label.
        ax.scatter(moving_patients['age'], moving_patients[value_col], marker=LEFT_HALF_MARKER, s=70,
                   color='mediumseagreen', alpha=0.9)
        ax.scatter(moving_patients['age'], moving_patients[value_col], marker=RIGHT_HALF_MARKER, s=70,
                   color='crimson', alpha=0.9, label=f'{moving_site} patients')

    # Two-line title; the line break must be the `\n` escape because a raw
    # newline inside a single-quoted string literal is a SyntaxError.
    ax.set_title(f'{disease_label} - {bundle_value}\n{moving_site} vs {ref_site} ({ylabel})')
    ax.set_xlabel('Age')
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.2)
    ax.legend(loc='upper right')

    _add_density(ax_dist, y_grid, kde_ref, facecolor='royalblue', edgecolor='royalblue', alpha=0.25)
    _add_density(ax_dist, y_grid, kde_move_all, facecolor='mediumseagreen', edgecolor='mediumseagreen', alpha=0.2)
    _add_density(ax_dist, y_grid, kde_move_patients, facecolor='crimson', edgecolor='mediumseagreen', hatch='///', alpha=0.25)
    _add_density(ax_dist, y_grid, kde_move_hc, facecolor='white', edgecolor='mediumseagreen', hatch='++', alpha=0.0)

    # Legend entries are built by hand so they mirror the styles passed to
    # _add_density above.
    density_handles = [
        mpatches.Patch(facecolor='royalblue', edgecolor='royalblue', label=f'{ref_site} HC'),
        mpatches.Patch(facecolor='mediumseagreen', edgecolor='mediumseagreen', label=f'{moving_site} all'),
        mpatches.Patch(facecolor='crimson', edgecolor='mediumseagreen', hatch='///', label=f'{moving_site} patients'),
        mpatches.Patch(facecolor='white', edgecolor='mediumseagreen', hatch='++', label=f'{moving_site} HC'),
    ]
    ax_dist.set_xlabel('Density')
    ax_dist.grid(False)
    # The y axis is shared with the scatter panel, so its tick labels are hidden here.
    ax_dist.tick_params(labelleft=False)
    ax_dist.set_xlim(left=0)
    ax_dist.legend(handles=density_handles, loc='upper right')

    # Save as <bundle>_<column>.png (spaces replaced by underscores), then
    # display and close the figure.
    filename = f'{bundle_value}_{value_col}.png'.replace(' ', '_')
    out_path = os.path.join(OUTPUT_DIR, filename)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.show()
    plt.close(fig)

