# Phase 1: Metadata and Sequence Separation.ipynb

<details>
    <summary>Click To See A Description of Parameters</summary>
        <pre>
            <code>
save_dir: str  
    Path to directory for saving outputs in.

metadata_path: str
       Path to csv or tsv containing metadata.

fasta_path: str
    Path to fasta file containing sequences.

xml_set_definitions : dict {str: str}
        The definitions for the xml_sets you wish to use.
        Keys:   The name used for the xml_set. Will be used to name directories so certain characters should be
                   avoided see https://www.mtu.edu/umc/services/websites/writing/characters-avoid/.
        Values: Will be used with pandas DataFrame.query to separate out your data see:
                        * https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
                        * https://sparkbyexamples.com/pandas/pandas-dataframe-query-examples/
                        * https://www.slingacademy.com/article/pandas-working-with-the-dataframe-query-method-5-examples/

xml_set_directories : dict {str: str}
        The directories where all the data for each xml set will be saved.

root_strain_names: list of strs
    IDs of sequences to be used as root.

sample_id_field: str
    Name of field in metadata_db containing sequence IDs.

collection_date_field: str
    Name of field in metadata_db containing collection dates of sequences. Should be format YYYY-MM-DD.

data_filter: str
    Optional; can be an empty string, None or 'None'. Additional filter applied to metadata_db when selecting 
    sequences and metadata to be used in the pipeline. Must conform to [pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html), see further [example](https://www.slingacademy.com/article/pandas-working-with-the-dataframe-query-method-5-examples/). 
  </code>
</pre>
</details>


In [None]:
# Notebook parameters. The None defaults are placeholders that must be
# overridden before the cells below are run.

# Inputs
metadata_path = None  # Path to the csv or tsv file containing the metadata.
fasta_path = None  # Path to the fasta file containing the sequences.

# Outputs
save_dir = None  # Path to the directory outputs are saved in.
xml_set_directories = None  # dict {xml_set name: directory its data is saved in}.

# Selection of records
xml_set_definitions = None  # dict {xml_set name: pandas DataFrame.query string}.
data_filter = None  # Optional extra DataFrame.query string applied to the metadata.

# Metadata column names
sample_id_field = 'strain'  # Column holding the sequence IDs.
collection_date_field = 'date'  # Column holding collection dates (YYYY-MM-DD).

Import packages

In [None]:
import pandas as pd
from Bio import SeqIO

## Load metadata and filter.

In [None]:
# Choose the column delimiter from the metadata file's extension; only
# .tsv and .csv files are supported.
if metadata_path.endswith('.tsv'):
    delimiter = '\t'
elif metadata_path.endswith('.csv'):
    delimiter = ','
else:
    raise TypeError(f"metadata_db must be a csv or tsv file, ending with the appropriate file extension. Value given is {metadata_path}")

# Load the full metadata table, parsing the collection date column as dates.
metadata_all_df = pd.read_csv(metadata_path,
                              sep=delimiter,
                              parse_dates=[collection_date_field]
                              )

# The parameter description allows data_filter to be None, an empty string
# or the string 'None' to mean "no filter". Normalise the latter two to None
# so they are never handed to DataFrame.query (where they would raise), and
# so later `data_filter is not None` checks behave as documented.
if data_filter is not None and str(data_filter).strip() in ('', 'None'):
    data_filter = None

# Apply the optional global filter to the metadata.
if data_filter is not None:
    metadata_all_df = metadata_all_df.query(data_filter)

## Separate metadata & Sequences

In [None]:
# For every xml set: select its metadata rows, write them to
# <xml_set_directory>/metadata.csv, then write the matching sequences to
# <xml_set_directory>/sequences.fasta.

# The parameter description allows data_filter to be None, an empty string
# or the string 'None' to mean "no extra filter"; only a real query string
# is combined with the xml set definitions.
if data_filter is None or str(data_filter).strip() in ('', 'None'):
    extra_filter = None
else:
    extra_filter = data_filter

# Metadata selected for each xml set, keyed by xml set name.
xml_set_metadata = {}
for xml_set, pd_query in xml_set_definitions.items():
    xml_set_directory = xml_set_directories[xml_set]
    if extra_filter is not None:
        # Parenthesise both sides so an `|` inside either expression cannot
        # escape the `&` that joins them.
        pd_query = f"({extra_filter}) & ({pd_query})"
    set_metadata = metadata_all_df.query(pd_query)
    # Store per set instead of overwriting the dict with the latest DataFrame.
    xml_set_metadata[xml_set] = set_metadata
    set_metadata.to_csv(f'{xml_set_directory}/metadata.csv', index=False)
    # A set gives constant-time membership tests. IDs are cast to str because
    # FASTA record IDs are strings, whereas read_csv may have parsed purely
    # numeric IDs as numbers; missing IDs are dropped rather than matched.
    ids = set(set_metadata[sample_id_field].dropna().astype(str))
    selected_seqs = [seq_record for seq_record in SeqIO.parse(fasta_path, 'fasta') if seq_record.id in ids]
    with open(f'{xml_set_directory}/sequences.fasta', 'w') as handle:
        SeqIO.write(selected_seqs, handle, 'fasta')
