# BDSky Serial's Output Figures & Stats

**A note on Birth Death Skyline Models**
The results in this notebook are from a phylodynamics pipeline using Birth Death Skyline Models. Reading material on Birth Death Skyline Models can be found at:
* [Taming the BEAST Tutorial: Skyline-plots](https://taming-the-beast.org/tutorials/Skyline-plots/)
* [Stadler et al. 2012 PNAS](https://www.pnas.org/doi/full/10.1073/pnas.1207965110)


In [None]:
# Directory scanned for '*_merged.log' files; when left as None a later cell
# substitutes the current working directory.
save_dir = None
# Iterated with `.items()` further down as {xml_set: path handed to BEAST2XML},
# so it has to be replaced by a mapping before the notebook is run.
beast_xml_path = None
# Passed as `xml_set_label` to the plotting and HDI helpers.
xml_set_label = 'COVID Strain'

In [None]:
import warnings
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )
from beast_pype.outputs import (read_xml_set_logs_for_plotting, plot_comparative_box_violin,
                            plot_skyline, plot_comparative_origin_or_tmrca, hdi_pivot)
from beast_pype.date_utilities import date_to_decimal, decimal_to_date
from beast2xml import BEAST2XML
import pandas as pd
import os
import yaml

In [None]:
# Use the current working directory when no output directory was supplied.
save_dir = os.getcwd() if save_dir is None else save_dir

### Date the pipeline that produced this report was launched:

In [None]:
# The displayed value is the part of the working directory's name that precedes
# the first underscore (presumably the launch date -- confirm against the
# pipeline's directory naming scheme).
cwd = os.getcwd()
# os.path.basename copes with the platform's path separator, unlike split('/').
launch_date = os.path.basename(cwd).split('_')[0]
launch_date

## Lineage definitions

In [None]:
# Parse the run metadata stored next to the notebook and show the sub-lineage
# mapping recorded under the parent workflow's parameters.
with open(os.path.join(cwd, "pipeline_run_info.yml"), "r") as file:
    # The context manager closes the handle on exit, so no explicit close() is
    # needed; safe_load accepts the open stream directly.
    pipeline_run_info = yaml.safe_load(file)
display(pipeline_run_info['parameters']['parent_workflow_parameters']['sub_lineages_mapping'])

In [None]:
# This cell retrieves all the log files for the samples you selected, reads the
# youngest tip date and rate-change times from each BEAST XML, and loads the
# logs into the data frames used by every plot below.
if beast_xml_path is None:
    raise ValueError("`beast_xml_path` must be a mapping of XML set name to BEAST XML path, not None.")

# Key each merged log by its file name with the '_merged.log' suffix stripped
# (only the suffix, not other occurrences of that text in the name).
merged_log_suffix = '_merged.log'
merged_log_paths = {
    file[:-len(merged_log_suffix)]: os.path.join(save_dir, file)
    for file in os.listdir(save_dir)
    if file.endswith(merged_log_suffix)
}


def _partition_year_decimals(beast_xml, youngest_tip_year_decimal, parameter):
    """Return the change times of `parameter` as year decimals, or None.

    The reverse times extracted from `beast_xml` are subtracted from
    `youngest_tip_year_decimal`. None is returned when the extraction or the
    subtraction raises (presumably because the XML defines no change times for
    `parameter` -- confirm against beast2xml).
    """
    try:
        return youngest_tip_year_decimal - beast_xml.extract_rate_change_reverse_times(parameter)
    except Exception:
        # Best-effort lookup. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None


youngest_tip_year_decimals = {}
sampling_prop_partitions = {}
rt_partitions = {}
for xml_set, path in beast_xml_path.items():
    beast_xml = BEAST2XML(path)
    youngest_tip_year_decimal = beast_xml.extract_youngest_year_decimal()
    youngest_tip_year_decimals[xml_set] = youngest_tip_year_decimal
    sampling_prop_partitions[xml_set] = _partition_year_decimals(
        beast_xml, youngest_tip_year_decimal, 'samplingRateChangeTimes')
    rt_partitions[xml_set] = _partition_year_decimals(
        beast_xml, youngest_tip_year_decimal, 'birthRateChangeTimes')

df, df_melted_for_seaborn = read_xml_set_logs_for_plotting(
    file_path_dict=merged_log_paths,
    xml_set_label=xml_set_label,
    convert_become_uninfectious_rate=True,
    youngest_tip_dates_dict=youngest_tip_year_decimals)

## Infection Period 

BD Skyline models estimate the rate of becoming uninfectious (whose inverse is the average infection period).

In [None]:
# Plot the infection-period column with plot_comparative_box_violin, then show
# the table hdi_pivot returns for the same column.
infection_period_column = 'Infection period (per day)'
ax = plot_comparative_box_violin(
    df_melted_for_seaborn,
    infection_period_column,
    xml_set_label=xml_set_label,
)
infection_period_hdi_df = hdi_pivot(
    df,
    infection_period_column,
    xml_set_label=xml_set_label,
)
display(infection_period_hdi_df)

# Sampling Proportion

In [None]:
# Columns of `df` whose names start with 'samplingProportion'.
sampling_prop_cols = [column for column in df.columns if column.startswith('samplingProportion')]
if not sampling_prop_cols:
    # Fail with a clear message instead of an IndexError from `sampling_prop_cols[0]`.
    raise ValueError("No column starting with 'samplingProportion' was found in `df`.")

if len(sampling_prop_cols) > 1:
    # More than one column: plot them as a skyline.
    sampling_fig, sampling_ax, sampling_hdi_df = plot_skyline(
        df,
        youngest_tip_year_decimals,
        parameter_start='samplingProportion',
        # Raw string so the LaTeX `\psi` is not treated as a string escape.
        y_label=r'Sampling Proportion ($\psi$)',
        grid_size=100,
        include_grid=True,
        partition_year_decimals=sampling_prop_partitions,
        xml_set_label=xml_set_label,
    )
else:
    # A single column: plot it with the comparative box/violin helper instead.
    ax = plot_comparative_box_violin(df_melted_for_seaborn, sampling_prop_cols[0], xml_set_label=xml_set_label)
    sampling_hdi_df = hdi_pivot(df, sampling_prop_cols[0], xml_set_label=xml_set_label)
display(sampling_hdi_df)

# $R_t$


## True Skyline



**Note** Lower values are 0.05 Highest Posterior Density (HPD), higher values are 0.95 HPD.

In [None]:
# Keyword options for the skyline of the 'reproductiveNumber' columns.
rt_skyline_options = dict(
    parameter_start='reproductiveNumber',
    y_label='$R_t$',
    grid_size=100,
    include_grid=True,
    partition_year_decimals=rt_partitions,
    xml_set_label=xml_set_label,
)
rt_fig, rt_ax, rt_hdi_table = plot_skyline(df, youngest_tip_year_decimals, **rt_skyline_options)
display(rt_hdi_table)

### Ratio of median $R_t$s for last time period.

In [None]:
xml_sets = list(beast_xml_path.keys())
# 'median' value in the last row of each XML set's entries in `rt_hdi_table`.
last_median_rt = {
    name: rt_hdi_table[rt_hdi_table[xml_set_label] == name]['median'].iloc[-1]
    for name in xml_sets
}
# One record per denominator set (rows); each key is a numerator set (columns).
ratio_records = [
    {numerator: last_median_rt[numerator] / last_median_rt[denominator] for numerator in xml_sets}
    for denominator in xml_sets
]
ratio_of_rts = pd.DataFrame.from_records(ratio_records)
ratio_of_rts.index = xml_sets
ratio_of_rts

# Origin

The origin is the time at which the index case (the first Canadian case) became infected, which is slightly earlier than the time-to-the-most-recent-common-ancestor (TMRCA). This parameter is used to investigate the detection delay from emergence to first detection in Canada.

In [None]:
# 'Origin' comparison requested with one_figure=True.
fig = plot_comparative_origin_or_tmrca(
    df_melted_for_seaborn,
    'Origin',
    one_figure=True,
    xml_set_label=xml_set_label,
)

In [None]:
# 'Origin' comparison with the helper's default layout (one_figure not passed).
fig = plot_comparative_origin_or_tmrca(
    df_melted_for_seaborn,
    'Origin',
    xml_set_label=xml_set_label,
)

In [None]:
# Build the HDI table for 'Origin' and add a calendar-date column next to each
# year-decimal column.
orign_hdi_df = hdi_pivot(df, 'Origin', xml_set_label=xml_set_label)
for hdi_column in ('Lower 0.95 HDI', 'Median', 'Upper 0.95 HDI'):
    # Plain ISO date (YYYY-MM-DD); "%d" must not be followed by stray literal text.
    orign_hdi_df[f'{hdi_column} Date'] = (
        orign_hdi_df[hdi_column].map(decimal_to_date).dt.strftime("%Y-%m-%d")
    )
orign_hdi_df

# TMRCA

In [None]:
# 'TMRCA' comparison requested with one_figure=True.
fig = plot_comparative_origin_or_tmrca(
    df_melted_for_seaborn,
    'TMRCA',
    one_figure=True,
    xml_set_label=xml_set_label,
)

In [None]:
# 'TMRCA' comparison with the helper's default layout (one_figure not passed).
fig = plot_comparative_origin_or_tmrca(
    df_melted_for_seaborn,
    'TMRCA',
    xml_set_label=xml_set_label,
)

In [None]:
# Build the HDI table for 'TMRCA' and add a calendar-date column next to each
# year-decimal column.
tmrca_hdi_df = hdi_pivot(df, 'TMRCA', xml_set_label=xml_set_label)
for hdi_column in ('Lower 0.95 HDI', 'Median', 'Upper 0.95 HDI'):
    # Plain ISO date (YYYY-MM-DD); "%d" must not be followed by stray literal text.
    tmrca_hdi_df[f'{hdi_column} Date'] = (
        tmrca_hdi_df[hdi_column].map(decimal_to_date).dt.strftime("%Y-%m-%d")
    )
tmrca_hdi_df