# Analysis on evaluation results on entire datasets

This notebook assumes that you have already run the data through the algorithm and that the tabulated results are in the `data/results` folder.

In [30]:
from ppg2rr.evaluate import bland_altman, violin_comparison, scatter_compare, plot_rr_to_hr_Nyquist
import ppg2rr.evaluate as eval
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy import stats
import sys
from datetime import datetime

In [31]:
%load_ext autoreload
%autoreload 2

# only display 2 decimals
pd.options.display.float_format = '{:.2f}'.format

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Data

In [32]:
# update as appropriate
# filenames = [
#     'mimic_all PPG dynamic_20240919-081144_features.csv',
#     'vortal_all PPG dynamic_20240919-083528_features.csv',
#     'capnobase_all PPG dynamic_20240918-181506_features.csv',
#     'kapiolani_all PPG dynamic_20240920-170337_features.csv',
# ]

filenames = [
    '3ps_Tr1_all-sessions_20250815-170018_features.csv',
    # '3ps_Re1_all-sessions_20250815-112213_features.csv',
]

if not filenames:
    raise ValueError("`filenames` is empty: list at least one results file to analyze.")

# Read every file the same way, whether there is one or several.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the "Columns have mixed types"
# DtypeWarning.
dfs = [
    pd.read_csv(f"../data/results/{filename}", low_memory=False)
    for filename in filenames
]

df = pd.concat(dfs).copy()      # The copy() avoids fragmentation warnings later.


Columns (27,31) have mixed types. Specify dtype option on import or set low_memory=False.



# Settings

These affect the entire workbook—almost. In "Bias and LoA vs Quality," this `poor_quality_frames` is not used; `poor_quality_frames_special` is used instead.

In [33]:
# Notebook-wide switch: cells that pass `save_as=... if save_figs else ""`
# only write their figure to disk when this is True.
save_figs = False

# The RR reference
rr_reference = 'RR ref (mean)'

# The focused-on RR estimate
rr_estimated = 'mean of fused candidates'

# For convenience later, write this error into a column
# (signed: estimate minus reference)
rr_error = "RR error"
df[rr_error] = df[rr_estimated] - df[rr_reference]

# The rules for which frames are omitted.
# The result is a boolean mask over the rows of `df`; a frame is rejected when
# ANY active rule below is true. Commented-out rules are optional restrictions.
poor_quality_frames = ( 
        ( df['no rr ref'])
        # | df['aliased']
        # | (df['co2 mean iqr'] < 0.15)
        # | ( df['quality - template matching-pct diagnostic quality pulses'] < 0.5)

        # Problem sessions
        # | (df['dataset-id'] == 'kapiolani 013')      # bad HR ref sustained
        # | (df['dataset-id'] == 'kapiolani 151') )    # bad HR ref at first
        # | (df['dataset-id'] == 'vortal 033') )       # frequently above Nyquist
        # | ( df['subject id'] == '0115_8min' )        # has bad reference data

        # Panel restrictions: these frames lack a good reference RR
        | (df['RR uncertainty panel (mean)'] > 0)      # exclude all uncertain frames
        | (df['RR ref disagreement panel (bpm)'] > 3)

        # Algorithm restrictions: we found that these checks avoid most erroneous results
        | ( df['HR disagreement of means'] > 3 )
        | ( df['HR est reliable'] == False )
        | (df['quality - fusion candidate quality-std'] > 1.4)
)

# Split into retained and rejected frames; copies so later column writes do
# not touch `df`.
df_wo_poor = df[~poor_quality_frames].copy()
df_w_poor = df[poor_quality_frames].copy()

print(f"Frames rejected as poor: {len(df_w_poor):4d} of {len(df):4d} ({len(df_w_poor)/len(df):5.1%})")

# Among the retained frames, also drop those with no value for the focused-on estimate.
df_wo_poor = df_wo_poor.dropna(subset=[rr_estimated])

# This count is cumulative: rejected frames plus retained frames lacking an estimate.
print(f" ...or without a result: {len(df) - len(df_wo_poor):4d} of {len(df):4d} ({(len(df) - len(df_wo_poor))/len(df):5.1%})")

print()
# print(f"         Aliased frames: {len(df_wo_poor[df_wo_poor['aliased']]) / len(df_wo_poor.dropna(subset=[rr_estimated])):7.2%}")

Frames rejected as poor: 13378 of 15779 (84.8%)
 ...or without a result: 13378 of 15779 (84.8%)



In [34]:
# Show the frames that remain after the exclusions applied in the Settings cell.
df_wo_poor

Unnamed: 0,trial-frame,index,frame index,time start,time end,RR uncertainty panel (mean),RR uncertainty panelist D (mean),RR uncertainty panelist J (mean),RR uncertainty panelist O (mean),avg rr ref panelist,...,quality - psd n valid peaks-notch area ratio,quality - psd n valid peaks-notch rel amp,quality - psd n valid peaks-ppg,quality - psd n valid peaks-product of all PSD,quality - template matching-mean loc nonconforming pulses,quality - template matching-pct amplitude outliers,quality - template matching-pct diagnostic quality pulses,quality - template matching-pct good pulse shapes,quality - template matching-pct poor pulse shapes,RR error
31,1-0,0,0,0,30,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,12.00,9.00,1,1,0.5571428571428572,0.00,0.88,0.88,0.12,0.11
32,1-1,1,1,5,35,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,12.00,10.00,2,1,0.3904761904761905,0.00,0.88,0.88,0.12,0.21
33,1-2,2,2,10,40,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,5.00,21.00,3,1,0.6546610169491525,0.07,0.86,0.90,0.10,0.10
34,1-3,3,3,15,45,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,7.00,7.00,5,1,0.7288135593220338,0.03,0.78,0.78,0.22,-0.53
36,1-5,5,5,25,55,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,8.00,7.00,6,2,0.6,0.00,0.62,0.62,0.38,-2.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15490,499-21,21,21,105,135,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,6.00,6.00,2,0,0.5416666666666667,0.12,0.82,0.89,0.11,-0.66
15550,501-19,19,19,95,125,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,22.00,18.00,16,0,0.46734693877551026,0.09,0.70,0.71,0.29,-32.34
15641,504-17,17,17,85,115,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,8.00,9.00,7,0,0.6767676767676768,0.09,0.82,0.86,0.14,0.18
15677,505-22,22,22,110,140,0.00,0.00,0.00,0.00,defaultdict(<function <lambda> at 0x15593d000>...,...,12.00,12.00,4,0,0.5963800904977375,0.09,0.74,0.78,0.22,1.40


# Descriptive statistics

In [35]:
# Number of distinct sessions among the retained frames
session_count = df_wo_poor['dataset-id'].nunique()
print(f"{session_count} unique dataset-ids")
print()

# Mean and quartiles of the reference RR; NaNs are ignored in both
print("RR reference (bpm):")
rr_ref_series = df_wo_poor[rr_reference]
print(f"{np.nanmean(rr_ref_series):.1f} mean")
rr_ref_present = rr_ref_series.dropna()
for quartile in (25, 50, 75):
    print(f"{np.percentile(rr_ref_present, quartile):.1f} {quartile}%ile")

230 unique dataset-ids

RR reference (bpm):
31.6 mean
24.5 25%ile
29.3 50%ile
37.0 75%ile


In [36]:
# Reference RR vs. age
# Scatter of the reference RR (y) against subject age (x) for the retained
# frames, drawn by the project helper eval.scatter_with_marginal.
# The figure is written to disk only when the notebook-wide save_figs is True.

fig_rr_vs_age = eval.scatter_with_marginal(
    df_wo_poor,
    y_key=rr_reference,
    x_key="subject age",
    y_label="RR reference (bpm)",
    x_label="Age (years)",
    title="Reference RR vs. age",
    sub_title=f"among {len(df_wo_poor)} frames from {df_wo_poor['dataset-id'].nunique()} sessions",
    show_linear=False,
    xbins_size=0.5,
    ybins_size=3,
    x_tick=1,
    y_tick=20,
    y_range=[0,100],
    x_range=[-0.1,5.1],
    pt_color="darkgreen",
    height=400,
    width=500,
    show=True,
    save_as="../data/results/fig_1_ref_RR_vs_age" if save_figs else ""
)

In [37]:
# Nyquist check on the retained frames, using the reference HR, the reference
# RR/HR ratio and the reference RR columns.
# NOTE(review): the 0.5 threshold presumably marks RR = HR/2, the limit above
# which RR cannot be recovered from pulse-rate sampling — confirm against
# plot_rr_to_hr_Nyquist in ppg2rr.evaluate.
plot_rr_to_hr_Nyquist(
    df_wo_poor, 
    'HR ref (mean)', 
    'Ref RR/HR (mean)', 
    'RR ref (mean)', 
    minRRtoHRpertrial = 0, 
    thresholds = [0.5], 
    opacity = 0.3, 
    showCountour = False, 
    bins = 25, 
    width = 500, 
    height = 400,
    save_as="../data/results/fig_2_Nyquist_check" if save_figs else ""
)

In [38]:
# List every subject id that has at least one retained frame whose reference RR
# exceeds the threshold.
RR_threshold = 40
print(f"Sessions with reference RR > {RR_threshold} bpm:")
# The ids are joined into one plain string before printing, so pandas display
# options (e.g. display.max_rows) have no effect and no option_context is needed.
frames_above_threshold = df_wo_poor[df_wo_poor[rr_reference] > RR_threshold]
print('\n'.join(map(str, frames_above_threshold['subject id'].unique())))

Sessions with reference RR > 40 bpm:
N07-025
N07-068
N07-075
N07-076
N07-078
N07-081
N07-082
N07-083
N07-084
N07-086
N07-087
N07-091
N07-095
N07-101
N07-102
N07-103
N07-105
N07-107
N09-063
N09-067
N09-069
N09-084
N09-091
N11-018
N11-022
N11-023
N11-024
N11-026
N11-034
N11-039
N11-041
N11-043
N11-045
N11-049
N12-033
N12-034
N12-037
N12-040
N12-047
N12-056
N12-057
N12-059
N12-063
N12-087
N12-096
N12-098
N12-104
N12-106
N12-112
N12-115
N12-117
N12-118
N12-123
N14-018
N14-038
N14-039
N14-040
N14-043
N14-049
N14-063
N14-066
N14-072
N14-082
N14-098
N15-003
N15-006
N15-010
N15-011
N15-015
N16-007
N16-010
N16-011
N16-012
N16-019
N16-023
N19-011
N19-013
N19-018
N19-020
N19-035
N19-046
N19-061
N19-072
N19-074
N19-078
N19-079


# Alternate Fusion Methods
Experiment with different fusion methods

In [39]:
from ppg2rr.util import mode_within_tolerance

def mode_of_each_row(row):
    """Return mode_within_tolerance of one row of candidate values, with tol=5."""
    return mode_within_tolerance(row, tol=5)

# Each new column is the row-wise mode over the columns of `df` whose name
# contains the given fragment.
for mode_column, column_fragment in (
    ('psd mode', "candidate - harmonic analysis"),
    ('simple mode', "candidate - "),
    ('counting mode', "candidate - riv peak median delta"),
):
    df[mode_column] = df.filter(like=column_fragment).apply(mode_of_each_row, axis=1)

Error vs Quality vs Fusion methods

In [40]:
# Fusion candidate columns of interest. Not used by the plot below; kept for
# interactive inspection, e.g. df[fusion_candidates].
fusion_candidates = [
    "PSD, closest to prev RR",
    "PSD median",
    # "Counting, median # peaks",
    # "Counting, median pk delta rqi cutoff",
    "Counting, median pk delta std cutoff",
    "kalman",
    "simple median",
    ]

quality_std_label = 'quality - fusion candidate quality-std'

# Linear regression requires no NaN
df_this_cell = df_wo_poor[[rr_reference, 'median of fused candidates', 'mean of fused candidates', quality_std_label]].dropna()

def fig_add_linear_regression(x, y, fig=None):
    """Fit y ~ x by ordinary least squares and add the fitted line to a figure.

    Args:
        x: 1-D sequence of predictor values (horizontal axis).
        y: 1-D sequence of response values, same length as x.
        fig: plotly figure to draw on. When omitted, the notebook-level
            variable `fig` is used, which keeps existing two-argument calls
            working.

    The trace is named with the fit's r² (coefficient of determination).
    """
    if fig is None:
        fig = globals()['fig']

    x_reg = np.asarray(x).reshape((-1, 1))      # sklearn expects a 2-D feature matrix
    y_reg = np.asarray(y).ravel()               # same 1-D target for both fit and score
    model = LinearRegression().fit(x_reg, y_reg)
    y_fit = model.predict(x_reg)
    r_sq = model.score(x_reg, y_reg)

    fig.add_scatter(
        x = x,
        y = y_fit,
        mode='lines',
        name = f'linear, r² = {r_sq:.2f}'
        )

# One marker trace plus one regression line per fusion method, median first.
fig = go.Figure()
for trace_name, observed_label in (
    ('median', 'median of fused candidates'),
    ('mean', 'mean of fused candidates'),
):
    # Absolute relative error of the fused estimate against the reference
    err = abs(((df_this_cell[observed_label]-df_this_cell[rr_reference])/df_this_cell[rr_reference]))
    quality_std = df_this_cell[quality_std_label]
    fig.add_scatter(
        x = quality_std,
        y = err,
        mode='markers',
        opacity=0.7,
        name=trace_name,
    )
    fig_add_linear_regression(quality_std, err, fig=fig)


fig.update_layout(
    xaxis_title = 'Standard deviation across the RR fusion candidates',
    yaxis_title = 'error',
    width = 800,
    height = 500
)
fig.update_yaxes(tickformat = ',.0%')

## For NIMR and other datasets with multiple raters of breaths

In [41]:
# Lone exclusions: Any frames that only one panelist marked uncertain, in case that panelist wants to take a second look

metric_base = 'RR uncertainty'
metrics = ['panelist D', 'panelist J', 'panelist O', 'panel']
metric_suffix = '(mean)'
metric_panel = f'{metric_base} {metrics[-1]} {metric_suffix}'
if metric_panel in df_wo_poor.columns:
    print("Sessions in which a frame was excluded by only one panelist:")
    print()
    # Frames for which the panel as a whole reports low uncertainty
    panel_is_low = df_wo_poor[metric_panel] < 0.35
    for metric in metrics[:3]:    # panelists
        metric_full = f'{metric_base} {metric} {metric_suffix}'
        if metric_full not in df_wo_poor.columns:
            continue
        # ...while this individual panelist reports high uncertainty
        panelist_is_high = df_wo_poor[metric_full] > 0.8
        hits = df_wo_poor.loc[panelist_is_high & panel_is_low, 'subject id'].unique()
        print(f'{metric}: {len(hits)} sessions')
        if len(hits):
            print('\n'.join(map(str, hits)))
        print()

Sessions in which a frame was excluded by only one panelist:

panelist D: 0 sessions

panelist J: 0 sessions

panelist O: 0 sessions



In [42]:
# Cumulative distribution of RR ref disagreement for each panelist and for the panel

metric_base = 'RR ref disagreement'
metrics = [
    # 'panelist D',
    # 'panelist J',
    # 'panelist O',
    'panel', 
]
metric_suffix = '(bpm)'
if f'{metric_base} {metrics[-1]} {metric_suffix}' in df_wo_poor.columns:
    line_opacity = 0.5
    panelist_colors = [             # Using brightened Adobe Premiere label colors.
        # "rgb( 70, 178, 200)",       # iris
        # "rgb( 91,  31, 255)",       # violet
        # "rgb( 68, 152, 136)",       # teal
        "red",       # for all-panel
    ]
    fig = go.Figure()
    for color_index, metric in enumerate(metrics):
        disagreement = df_wo_poor[f'{metric_base} {metric} {metric_suffix}'].dropna()
        frame_count = len(disagreement)
        # Empirical CDF: sorted values against their rank fraction
        ecdf_trace = go.Scatter(
            x=np.sort(disagreement),
            y=np.arange(1, frame_count+1) / frame_count,
            mode='lines',
            opacity=line_opacity,
            line=dict(color=panelist_colors[color_index]),
            name=f'within {metric} ({frame_count} frames)',
        )
        fig.add_trace(ecdf_trace)
    fig.update_layout(
        width=500,
        height=400,
        title=f'Cumulative distribution of {metric_base}',
        xaxis_title=f'{metric_base} {metric_suffix}',
        yaxis_title='Fraction of frames below<BR>this level',
    )
    fig.update_yaxes(range=[0,1], dtick=0.1)
    fig.update_xaxes(range=[0,30], dtick=5)
    # Dotted marker at a disagreement of 3
    fig.add_vline(
        x=3,
        line_color='gray',
        line_width=0.8,
        line_dash='dot',
        annotation_text='3',
        annotation_position='bottom',
    )
    legend_placement = dict(xanchor='right', x=0.97, yanchor='bottom', y=0.04)
    fig.update_layout(showlegend=False, legend=legend_placement)
    # fig.update_layout(showlegend=True, legend=dict(xanchor='right', x=0.97, yanchor='bottom', y=0.04))
    fig.update_layout(hoverlabel_namelength=-1)
    fig.show()
    if save_figs:
        current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
        fig.write_image(f"../data/results/fig_ECDF_RR_ref_{current_time}.png", scale=4)


In [43]:
# Only meaningful for datasets that carry the panel-uncertainty column.
# With the notebook-wide exclusion of frames whose panel uncertainty is > 0,
# a threshold of 0.0 leaves the "uncertainty > thresh" group empty, which is
# why that column of the printed table shows nan.
if 'RR uncertainty panel (mean)' in df_wo_poor.columns:
    eval.disagreement_vs_uncertainty(df_wo_poor, threshold=0.0, height=700, width=500, title='Is panel disagreement related to uncertainty?')

Using an uncertainty threshold of 0.0:

                                  [4mOverall[0m        [4mUncertainty ≤ thresh[0m   [4mUncertainty > thresh[0m
Fraction of all frames:            100.0%               100.0%                   0.0%
Mean disagreement:            0.9 bpm or  2.8%     0.9 bpm or  2.8%       nan bpm or  nan%
Median disagreement:          0.4 bpm or  1.3%     0.4 bpm or  1.3%       nan bpm or  nan%
Fraction with dis. > 3 bpm:          0.0%                 0.0%                   0.0%
Fraction with dis. > 6 bpm:          0.0%                 0.0%                   0.0%



Mean of empty slice.


invalid value encountered in scalar divide



# Performance Summary

Tabulated Bias and LoA for various candidates. See the [wiki](https://github.com/new-horizons/pulseox_pubalgs/wiki) for more information about these metrics.

In [44]:
# performance for different ensembling strategies

# The metrics are computed on df[~poor_quality_frames] rather than df_wo_poor:
# the "... mode" columns were added to `df` after df_wo_poor was copied from it.
per_method_results = []
for col in [
    'kalman',
    'simple median',
    'mean of fused candidates',
    'median of fused candidates',
    'buffered_display',
    'PSD median',
    "PSD, closest to prev RR",
    "psd mode",
    "simple mode",
    "counting mode",
    ]:
    eval_results_df, _, _ = eval.get_eval_metrics(
        df = df[~poor_quality_frames],
        reference_label=rr_reference,
        observed_label = col, 
        aggregate=False,
        as_df=True)
    eval_results_df['method'] = col
    per_method_results.append(eval_results_df)

# Concatenate once at the end instead of growing a frame inside the loop
# (which also avoids concatenating with an empty seed DataFrame).
eval_results_dfs = pd.concat(per_method_results, axis=0)

display(eval_results_dfs.set_index('method'))

Unnamed: 0_level_0,bias,loa,r2,pct in tolerance,n,median MAE,median RMSE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
kalman,0.05,4.83,0.93,97.13,2401,,
simple median,-0.15,5.04,0.91,97.04,2401,,
mean of fused candidates,0.07,4.73,0.93,97.46,2401,,
median of fused candidates,0.01,4.79,0.93,97.13,2401,,
buffered_display,0.01,4.73,0.93,97.48,2401,,
PSD median,-0.07,4.88,0.92,97.04,2401,,
"PSD, closest to prev RR",-0.1,5.18,0.91,96.5,2401,,
psd mode,-0.06,6.38,0.88,94.5,2401,,
simple mode,0.09,5.53,0.9,96.04,2401,,
counting mode,0.34,6.81,0.86,94.67,2401,,


Bland-Altman and scatterplot

In [45]:
# Which estimate to compare against the reference; swap by (un)commenting.
observed_label = "mean of fused candidates"
# observed_label = "PSD median"
# observed_label = "buffered_display"
# observed_label = "RR ref disagreement (bpm)"

clipped = df_wo_poor['clipping']
# Automatic choice: skip the clipped/unclipped overlay when there are no frames...
skipclipped = (len(clipped) == 0)
# ...which is then overridden unconditionally, so the overlay is always skipped.
skipclipped = True

# hovertext_columns = [df_wo_poor['quality - template matching-pct diagnostic quality pulses'], df_wo_poor['trial-frame']]
hovertext_columns = None

# Scatter of estimate vs. reference over all retained frames; shown
# immediately only when no overlay traces will follow.
fig, metrics = scatter_compare(
    df=df_wo_poor,
    reference_label=rr_reference,
    observed_label=observed_label,
    scatter_label='RR Estimates',
    hovertext_columns=hovertext_columns,
    tolerance=0,
    plot_max=100,
    show=skipclipped,
)
if not skipclipped:
    good_frames = ~poor_quality_frames
    # Overlay the clipped frames on the same figure...
    fig, metrics = scatter_compare(
        df=df[good_frames & clipped],
        reference_label=rr_reference,
        observed_label=observed_label,
        scatter_label='Clipped PPG',
        hovertext_columns=hovertext_columns,
        show=False,
        fig=fig,
    )
    # ...then the unclipped frames, and show the combined figure.
    fig, metrics = scatter_compare(
        df=df[good_frames & ~clipped],
        reference_label=rr_reference,
        observed_label=observed_label,
        scatter_label="Unclipped PPG",
        hovertext_columns=hovertext_columns,
        fig=fig,
        show=True,
    )

# Bland-Altman plot of the same comparison
ba, _ = bland_altman(
    df=df_wo_poor,
    reference=df_wo_poor[rr_reference],
    observed=df_wo_poor[observed_label],
    reference_label=rr_reference,
    observed_label=observed_label,
    x_range=[0, 80],
    y_range=[-35,15],
    width=500*.9,
    height=400*.9,
    save_as="../data/results/fig_3_Bland-Altman" if save_figs else "",
)
ba.show()

In [46]:
# Compare accuracy of algorithm to that of live annotation
# Both plots share one geometry so they can be compared side by side.
# `observed_label` is the estimate selected in the preceding cell.

if True:
    width = 500 * .9
    height = 480
    x_range = [0,80]
    y_range = [-50,30]

    # Figure saving follows the notebook-wide `save_figs` from the Settings
    # cell. It is deliberately not reassigned here: doing so would also switch
    # on saving in every cell that runs after this one.
    ba_algo, _ = bland_altman(df=df_wo_poor, 
                        reference=df_wo_poor[rr_reference],
                        observed=df_wo_poor[observed_label],
                        reference_label = rr_reference, 
                        observed_label = observed_label,
                        x_range = x_range,
                        y_range = y_range,
                        width = width,
                        height = height,
                        save_as="../data/results/fig_8_BA_big_algo" if save_figs else ""
                        )
    ba_algo.show()

    ba_live, _ = bland_altman(df=df_wo_poor, 
                        reference=df_wo_poor[rr_reference],
                        observed=df_wo_poor["rr annot live"],
                        reference_label = rr_reference, 
                        observed_label = "rr annot live",
                        x_range = x_range,
                        y_range = y_range,
                        width = width,
                        height = height,
                        save_as="../data/results/fig_9_BA_big_live" if save_figs else ""
                        )
    ba_live.show()

In [47]:
# Evaluate each panelist against the panel average
# One Bland-Altman plot per panelist, with the panel's reference RR as the
# reference and that panelist's own average RR as the observation.
# Note: this loop reassigns the notebook-level `observed_label`.

if True:
    x_range = [0, 150]
    y_range = [-6, 6]
    width = 400*.9
    height = 400*.9
    
    for panelist in ['D', 'J', 'O']:
        observed_label = f"avg rr ref panelist {panelist}"
        ba, _ = bland_altman(df=df_wo_poor, 
                            reference=df_wo_poor[rr_reference],
                            observed=df_wo_poor[observed_label],
                            reference_label = rr_reference, 
                            observed_label = observed_label,
                            x_range = x_range,
                            y_range = y_range,
                            width = width,
                            height = height,
                            save_as=f"../data/results/fig_BA_panelist_{panelist}" if save_figs else ""
                            )
        ba.show()

In [48]:
# Evaluate each panelist against the average of the other two panelists

if True:
    x_range = [0, 150]
    y_range = [-6, 6]
    width = 400*.9
    height = 400*.9

    panelists = ["D", "J", "O"]
    for panelist in panelists:
        observed_label = f"avg rr ref panelist {panelist}"
        # The reference is the mean of the two remaining panelists' averages.
        ref1_label, ref2_label = [
            f"avg rr ref panelist {other}" for other in panelists if other != panelist
        ]
        ba, _ = bland_altman(df=df_wo_poor, 
                            reference=(df_wo_poor[ref1_label] + df_wo_poor[ref2_label])/2,
                            observed=df_wo_poor[observed_label],
                            reference_label = "other panelists", 
                            observed_label = panelist,
                            x_range = x_range,
                            y_range = y_range,
                            width = width,
                            height = height,
                            # "_vs_others" keeps these files distinct from the
                            # panelist-vs-panel figures, which are saved as
                            # fig_BA_panelist_{panelist}.
                            save_as=f"../data/results/fig_BA_panelist_{panelist}_vs_others" if save_figs else ""
                            )
        ba.show()

In [49]:
# Similarly compare the live annotation to the panel's average
if True:
    # Same plot geometry as the panelist comparisons. It is set explicitly so
    # that this cell does not depend on values left behind by an earlier cell.
    x_range = [0, 150]
    width = 400*.9
    height = 400*.9
    # Wider error range than for the panelists
    y_range = [-40, 40]
    
    ba_live2, _ = bland_altman(df=df_wo_poor, 
                        reference=df_wo_poor[rr_reference],
                        observed=df_wo_poor["rr annot live"],
                        reference_label = rr_reference, 
                        observed_label = "rr annot live",
                        x_range = x_range,
                        y_range = y_range,
                        width = width,
                        height = height,
                        save_as="../data/results/fig_BA_live" if save_figs else ""
                        )
    ba_live2.show()

In [50]:
# Plot error vs. age
# y is the signed "RR error" column written in the Settings cell
# (estimate minus reference); x is the subject's age.
fig_error_v_age = eval.scatter_with_marginal(
    df_wo_poor, 
    y_key=rr_error, 
    x_key='subject age', 
    y_label="RR error (bpm)", 
    x_label="Age (years)", 
    pt_color="darkorange",
    title='RR error vs. age', 
    sub_title=f"among {len(df_wo_poor)} frames from {df_wo_poor['dataset-id'].nunique()} sessions",
    xbins_size=0.5, 
    x_tick=1, 
    y_tick=10, 
    x_range=[-0.1,5.1], 
    y_range=[-35,15], 
    show_linear=True, 
    width=500, 
    height=400,
    save_as="../data/results/fig_4_RR_error_vs_age" if save_figs else ""
)

In [51]:
# Skin tone

if "ITA mean" in df_wo_poor.columns:
    ITA_range = [-90, 30]    # limits of plot range
    fig_width = 600

    # One row per distinct (dataset-id, ITA mean, ITA SD) combination, complete rows only
    participants_df = df_wo_poor[["dataset-id", "ITA mean", "ITA SD"]].drop_duplicates().dropna()

    # Warn when measured skin tones fall outside the plotted range
    ita_lowest = np.min(participants_df["ITA mean"])
    ita_highest = np.max(participants_df["ITA mean"])
    if (ita_lowest < ITA_range[0]) or (ita_highest > ITA_range[1]):
        print(f"Warning: the range of mean ITA was {ita_lowest:0.0f} - {ita_highest:0.0f}, exceeding the set range in the following plots.")

    # Distribution of the ITA standard deviation
    fig2 = go.Figure(data=go.Histogram(x=participants_df["ITA SD"], xbins=dict(size=0.1)))
    fig2.update_layout(
        title=f"Distribution of ITA SD among <BR>{len(participants_df)} participants",
        xaxis_title="Standard deviation of ITA (°)",
        yaxis_title="count",
        bargap=0.01,
        width=350,
        height=250,
    )
    fig2.update_xaxes(rangemode='tozero')
    fig2.show()

    # RR error vs. ITA
    frames_with_ita = len(df_wo_poor["ITA mean"].dropna())
    fig_error_v_ita = eval.scatter_with_marginal(
        df_wo_poor,
        rr_error,
        'ITA mean',
        x_label="Mean ITA (°)",
        y_label=f'{rr_error} (bpm)',
        xbins_size=10,
        # ybins_size=1,
        title='RR error vs. skin tone',
        sub_title=f'Among {frames_with_ita} frames from {len(participants_df)} sessions',
        pt_color='brown',
        show_linear=True,
        x_range=ITA_range,
        y_range=[-35,15],
        x_tick=10,
        y_tick=10,
        width=500,
        height=400,
        save_as="../data/results/fig_5_RR_error_vs_skin_tone" if save_figs else "",
    )

In [52]:
# Reference RR vs. skin tone
# Note: the sub-title reads `participants_df`, which is created by the
# preceding skin-tone cell (under the same "ITA mean" column check), so that
# cell must run first.

if "ITA mean" in df_wo_poor.columns:
    fig_rr_vs_skin_tone = eval.scatter_with_marginal(
        df_wo_poor,
        y_key=rr_reference,
        x_key="ITA mean",
        y_label="RR reference (bpm)",
        x_label="Mean ITA (°)", 
        title="Reference RR vs. skin tone",
        sub_title=f'among {len(df_wo_poor["ITA mean"].dropna())} frames from {len(participants_df)} sessions', 
        show_linear=False,
        xbins_size=10,
        ybins_size=3,
        x_tick=10,
        y_tick=20,
        y_range=[0,100],
        x_range=[-90, 30],
        pt_color='brown', 
        height=400,
        width=500,
        show=True,
        save_as="../data/results/fig_1_ref_RR_vs_ITA" if save_figs else ""
)

# Bias and LoA vs Quality
Compute a plot to quantify algorithm performance vs PPG quality. On the x-axis, datapoints with quality smaller than x are excluded, and the remaining datapoints are used to calculate performance. The purpose is to evaluate how well quality correlates with confidence in algorithm performance.

In [53]:
# Experimental fusion: the row-wise mean of the listed candidate columns,
# written to `df` as 'exp fusion'. (pandas' mean skips NaN by default, so a
# frame missing some candidates is averaged over the ones it has.)
avg_these_columns = [
    "PSD, closest to prev RR",
    "PSD median",
    "Counting, median # peaks",
    # "Counting, median pk delta rqi cutoff",
    "Counting, median pk delta std cutoff",
    "kalman",
    ]
df['exp fusion'] = df[avg_these_columns].mean(axis=1)

In [54]:
# Sweep a minimum-quality threshold and recompute the performance metrics on
# the frames that survive each threshold. Disabled by default (`if False`).
if False:
    # observed_label = 'buffered_display'
    observed_label = 'mean of fused candidates'
    # observed_label = 'exp fusion'

    total_frames = df.shape[0]
    pct_frames = []
    retained = []
    eval_results_lst = []
    xaxis =[]

    # test_threshold_range = list(np.arange(50,85))
    # test_threshold_range.extend(np.arange(86,98,0.2))
    # test_threshold_range.extend(np.arange(98.1,100,0.1))

    # Thresholds in percent: steps of 1 up to 80, then steps of 0.5 up to 100.
    test_threshold_range = list(np.arange(0,80,1))
    test_threshold_range.extend(np.arange(80,100,0.5))
    # test_threshold_range.extend(np.arange(98.1,100,0.1))
    for i in test_threshold_range:
        # As noted at the top of this notebook, the notebook-wide poor_quality_frames is not used here.
        poor_quality_frames_special = (
            df['aliased']
            | ( df['quality - template matching-pct diagnostic quality pulses'] < i/100)
            )
        try:
            eval_results, _, _ = eval.get_eval_metrics(
                df[~poor_quality_frames_special], 
                reference_label = rr_reference, 
                observed_label = observed_label
                )
            eval_results_lst.append(eval_results)
            
            xaxis.append(i)
            
            pct_poor_quality = sum(poor_quality_frames_special)/total_frames
            pct_frames.append(pct_poor_quality)
            number_datapoints_retained = sum(~poor_quality_frames_special)
            retained.append(number_datapoints_retained)
        except Exception as exc:
            # Best effort: skip this threshold and carry on with the rest.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # still stop the sweep, and the reason is reported so unexpected
            # failures are not mistaken for the "no frames left" case.
            print("test_threshold_range {:>3.1f}: failed. This will happen if all frames were judged poor quality.".format(i))
            print(f"    reason: {type(exc).__name__}: {exc}")

    eval_results_df = pd.DataFrame(eval_results_lst)

    eval_results_df['threshold'] = xaxis
    eval_results_df['pct excluded'] = pct_frames

    fig = make_subplots(
                rows=4,
                cols=1,
                shared_xaxes=True,
                vertical_spacing=0.02,
                row_heights=[0.3,0.3,0.3,0.3],  # relative heights of each row
            )

    fig.add_scatter(x=xaxis, y=eval_results_df['bias'], name='bias', mode='lines', row=1, col=1)
    fig.add_scatter(x=xaxis, y=eval_results_df['loa'], name='LoA',mode='lines', row=2, col=1)
    fig.add_scatter(x=xaxis, y=eval_results_df['r2'], name='r2',mode='lines', row=3, col=1)
    fig.add_scatter(x=xaxis, y=retained, name='datapoints retained',mode='lines', row=4, col=1)

    fig.add_hline(y=2, line_dash="dash", row=1, annotation={'text':'bias threshold','xanchor':'left','x':'0'})
    fig.add_hline(y=7.5, line_dash="dash", row=2, annotation={'text':'LoA threshold','xanchor':'left','x':'0'})
    fig.add_vline(x=50, line_dash="dash", annotation={'text':'50%'})
    fig.add_vline(x=84, line_dash="dash", annotation={'text':'84%'})

    # fig.add_vline(x=77, line_dash="dash", annotation={'text':'77%'})
    # fig.add_vline(x=99.8, line_dash="dash", annotation={'text':'99.8%'})
    fig.update_xaxes(title_text="Minimum percentage of diagnostic quality pulses", row=4, col=1)
    fig.update_layout(height=600)
    fig.show()


# Error vs Quality

In [55]:
observed_label = 'exp fusion'

# Absolute relative error (a fraction; the y axis formats it as a percentage)
err = abs(((df[observed_label]-df[rr_reference])/df[rr_reference]))

# Build the hover text as a NEW Series. Using `+=` on df['trial-frame'] can
# modify that column of `df` in place (in pandas without copy-on-write), which
# would make the text grow every time this cell is re-run.
# The error is scaled by 100 so the value matches its "% err" label.
hover_text = df['trial-frame'] + '<br>' + '% err:' + (err * 100).round(0).astype(str)

# Quality metrics and error for the retained frames only
build_df = pd.DataFrame()
build_df['pulse shapes'] = df['quality - template matching-pct poor pulse shapes'][~poor_quality_frames].values
build_df['amplitude outliers'] = df['quality - template matching-pct amplitude outliers'][~poor_quality_frames].values
build_df['error'] = err[~poor_quality_frames].values

fig = go.Figure()
fig.add_trace(go.Scatter(
        x=df['quality - template matching-pct diagnostic quality pulses'][~poor_quality_frames],
        y=build_df['error'],
        # z=build_df['amplitude outliers'],
        mode='markers',
        marker=dict(
                color=build_df['amplitude outliers'],   # marker color encodes the amplitude-outlier fraction
                showscale=True,
                ),
        text=hover_text[~poor_quality_frames],
        name='has outliers in frame center'
        ))
fig.update_layout(
        xaxis_title = 'pct diagnostic quality pulses',
        yaxis_title = 'error, mean of merged candidates',
)
fig.update_yaxes(tickformat = ',.0%')

# Subject-Wise metrics
Inspect the performance per specific trials under Subject-Wise metrics. This helps us identify trials with poor performance, so we can inspect these troublesome PPG examples and maybe further refine the algorithm using these examples.

In [56]:
# Which estimate to inspect per trial; the following cell reuses this choice.
observed_label = "mean of fused candidates"
# observed_label = "PSD median"

# Signed error relative to the reference, in percent, written into `df`
df['error (%)'] = (df[observed_label] - df[rr_reference]) / df[rr_reference] * 100
# One box per trial, over the frames that pass the notebook-wide exclusions
px.box(data_frame=df[~poor_quality_frames], x='trial', y='error (%)', width=1200)

In [57]:
# Signed error in bpm for the `observed_label` chosen in the preceding cell,
# written into `df`; one box per trial, with the y axis limited to [-20, 15].
df['error (bpm)'] = (df[observed_label] - df[rr_reference])
px.box(data_frame=df[~poor_quality_frames], x='trial', y='error (bpm)', range_y=[-20,15], width=1200)

In [58]:
# subject-wise metrics for each RR candidate

# specify RR candidates to inspect
candidate_list = [
    'PSD, closest to prev RR',
    'PSD median',
    # 'Counting, median pk delta rqi cutoff',
    # 'Counting, median # peaks',
    # 'Counting, median pk delta std cutoff',
    'kalman',
]

def format_error_df(candidates, df):
    """Return per-frame signed errors of each candidate in long ("melted") form.

    Args:
        candidates: names of the columns of `df` holding RR estimates.
        df: frame with a 'trial' column, the candidate columns, and the
            reference column named by the notebook-level `rr_reference`.

    Returns:
        DataFrame with columns 'trial', 'RR method' and 'err (bpm)', where the
        error is candidate minus reference. Rows flagged by the notebook-level
        `poor_quality_frames` mask are left out.
    """
    err_df = df[['trial']].copy()
    reference_values = df[rr_reference]
    for candidate in candidates:
        err_df[candidate] = df[candidate] - reference_values

    retained_errors = err_df[~poor_quality_frames]
    return retained_errors.melt(
        id_vars='trial',
        value_vars=candidates,
        var_name='RR method',
        value_name='err (bpm)',
    )

melted = format_error_df(candidates=candidate_list, df=df)
fig = px.box(melted, x='trial', y='err (bpm)', color='RR method')
fig.update_layout(legend=dict(orientation="h"))
fig.show()
