# Phase 3: Setting up the BEAST xmls

<details>
    <summary>Click To See A Description of Parameters</summary>
        <pre>
            <code>
save_dir: str  
    Path to directory for saving outputs in.
template_xml_path: str
    Path to template BEAST xml.
voi_strains: list of strs
    Names of Variant Of Interest (VOI) lineages.
origin_upper_addition: int or float
    Value to add to tree height for upper limit of origin prior. Origin prior is uniformly distributed.
origin_start_addition: float
    Suggested infection period of pathogen. **Should be in years.** This + initial MLE tree height is used as starting value of origin.
chain_length: int
    Length of the MCMC chain (number of steps) to use for BEAST runs.
trace_log_every: int
    How often to save a log file during BEAST runs.
tree_log_every: int
    How often to save a tree file during BEAST runs.
screen_log_every: int
    How often to output to screen during BEAST runs.
store_state_every: int 
    How often to store MCMC state during BEAST runs.
            </code>
        </pre>
</details>


In [None]:
# Notebook parameters. Every value below defaults to None (or a column name)
# and is presumably overridden by whatever executes this notebook
# (e.g. a papermill-style parameter injection) -- TODO confirm.

# Directory holding pipeline_run_info.yml; that file is read from and written back here.
save_dir = None
# Passed to gen_bdsky_serial_xml as template_path.
template_xml_path = None
# xml set names treated as VOIs; membership is tested with `in`.
voi_strains= None
# Passed to gen_bdsky_serial_xml as origin_upper_height_addition.
origin_upper_addition = None
# Passed to gen_bdsky_serial_xml as origin_start_addition.
origin_start_addition = None
# The remaining settings are passed straight through to gen_bdsky_serial_xml.
chain_length = None
trace_log_every = None
tree_log_every = None
screen_log_every = None
store_state_every = None
# Metadata column parsed as dates and used to find the youngest/oldest tips.
collection_date_field='date'
# Metadata column name passed to gen_bdsky_serial_xml as sample_id_field.
sample_id_field='strain'

Import packages. 

In [None]:
import yaml
from pandas.tseries.offsets import DateOffset
from beast_pype.beast_xml_gen import gen_bdsky_serial_xml
from copy import deepcopy
import os
import pandas as pd


We need several things from the metadata:

In [None]:
# Load the run information for this pipeline run. The `with` block closes the
# file on exit, so no explicit close() is needed, and yaml.safe_load can parse
# straight from the open stream.
with open(save_dir + "/pipeline_run_info.yml", "r") as file:
    pipeline_run_info = yaml.safe_load(file)
xml_set_directories = pipeline_run_info['xml set directories']

# For each VOI xml set, record the latest (youngest) and earliest (oldest)
# collection date found in its metadata.
voi_youngest_tips_dates = {}
voi_oldest_tips_dates = {}
for xml_set, selection in xml_set_directories.items():
    if xml_set not in voi_strains:
        continue

    # Prefer the down-sampled metadata; fall back to the full metadata.
    metadata_file = f'{selection}/down_sampled_metadata.csv'
    if not os.path.isfile(metadata_file):
        metadata_file = f'{selection}/metadata.csv'

    xml_set_metadata = pd.read_csv(metadata_file, parse_dates=[collection_date_field])
    collection_dates = xml_set_metadata[collection_date_field]

    voi_youngest_tips_dates[xml_set] = collection_dates.max()
    voi_oldest_tips_dates[xml_set] = collection_dates.min()



## Generate the $R_t$ change dates.  

Step back in 4-week intervals from the youngest tip. For VOIs, stop once the youngest of the oldest tips across the VOI samples has been reached. For DR, go back an extra 4 weeks. 

In [None]:
# Bounds of the change-date grid: the latest tip over all VOI sets, and the
# latest of the per-set earliest tips.
voi_youngest_oldest_tip = max(voi_oldest_tips_dates.values())
voi_youngest_tip = max(voi_youngest_tips_dates.values())

# Walk backwards from the youngest tip in 4-week steps, recording each step,
# for as long as the current date is still later than the lower bound.
date_to_append = deepcopy(voi_youngest_tip)
voi_re_date_changes = []
while date_to_append > voi_youngest_oldest_tip:
    date_to_append -= DateOffset(weeks=4)
    voi_re_date_changes += [date_to_append]

# The non-VOI sets use the same change dates plus one further 4-week step back.
res_re_date_changes = deepcopy(voi_re_date_changes) + [date_to_append - DateOffset(weeks=4)]

## Actually Generating the BEAST2 xmls.

In [None]:
# Generate one BEAST xml per xml set, written to <selection>/beast.xml.
for xml_set, selection in xml_set_directories.items():
    # VOI sets use the VOI change dates; all other sets use the list with the
    # extra, earlier change date.
    # NOTE(review): strain_prefix is assigned but not used anywhere in this notebook.
    if xml_set in voi_strains:
        strain_prefix = "VOI_"
        changes = voi_re_date_changes
    else:
        strain_prefix = "DR_"
        changes = res_re_date_changes

    # For each input prefer the down-sampled file and fall back to the full one.
    fasta_file = f'{selection}/down_sampled_sequences.fasta'
    if not os.path.isfile(fasta_file):
        fasta_file = f'{selection}/sequences.fasta'

    metadata_file = f'{selection}/down_sampled_metadata.csv'
    if not os.path.isfile(metadata_file):
        metadata_file = f'{selection}/metadata.csv'

    initial_tree_path = f'{selection}/down_sampled_time.nwk'
    if not os.path.exists(initial_tree_path):
        initial_tree_path = f'{selection}/full_time.nwk'

    gen_bdsky_serial_xml(
        template_path=template_xml_path,
        sequences_path=fasta_file,
        # Use the metadata resolved above so it matches the sequences and tree
        # (previously the full metadata.csv was always passed and the
        # down-sampled file was ignored).
        metadata_path=metadata_file,
        sample_id_field=sample_id_field,
        collection_date_field=collection_date_field,
        initial_tree_path=initial_tree_path,
        origin_upper_height_addition=origin_upper_addition,
        origin_start_addition=origin_start_addition,
        output_path=f"{selection}/beast.xml",
        rt_change_dates=changes,
        log_file_basename=xml_set,
        chain_length=chain_length,
        trace_log_every=trace_log_every,
        tree_log_every=tree_log_every,
        screen_log_every=screen_log_every,
        store_state_every=store_state_every,
    )


### Add $R_{t}$ Change dates to pipeline_run_info

In [None]:
# Record the Rt change dates (as date strings) in the run information and
# write it back to disk. The `with` block closes the file on exit, so no
# explicit close() is needed.
pipeline_run_info['Re change dates'] = {
    'DR': [str(value.date()) for value in res_re_date_changes],
    'VOI': [str(value.date()) for value in voi_re_date_changes],
}
with open(save_dir + '/pipeline_run_info.yml', 'w') as fp:
    yaml.dump(pipeline_run_info, fp, sort_keys=True)