In [None]:
import os
import subprocess
import pandas as pd

from jacksonii_analyses import vcf_parser, clustering

from importlib import reload

# Re-import vcf_parser so edits made to the module while the kernel is running
# are picked up without a restart.
reload(vcf_parser)

# Output folders for the phylogeny files and the rendered figures.
for out_dir in ("../data/phylo", "../data/figs"):
    os.makedirs(out_dir, exist_ok=True)

# Alignment written from the VCF, and the cleaned PHYLIP alignment used for
# the second IQ-TREE run.
base_alignment = "../data/phylo/snp_concat.fasta"
cleaned_alignment = "../data/phylo/snp_concat.fasta.varsites.phy"
def iqtree_cmd(fasta_path, model="GTR+G+ASC", bootstrap=1000, threads="AUTO", redo=True):
    """Build the argument list for an ``iqtree`` run on one alignment.

    The defaults reproduce the command this notebook has always used, so
    ``iqtree_cmd(path)`` is unchanged; the keyword arguments only exist so a
    different model / replicate count / thread setting can be tried without
    editing the function.

    Parameters
    ----------
    fasta_path : str
        Alignment file passed to ``-s``.
    model : str
        Model string passed to ``-m``.
    bootstrap : int or str
        Replicate count passed to ``-bb``.
    threads : int or str
        Thread setting passed to ``-nt`` (``"AUTO"`` or a number).
    redo : bool
        When true, append ``--redo`` to the command.

    Returns
    -------
    list
        Argument list suitable for ``subprocess.run`` (no shell involved).
    """
    cmd = [
        "iqtree",
        "-s", fasta_path,
        "-m", model,
        # subprocess needs string arguments, so numeric settings are stringified
        "-bb", str(bootstrap),
        "-nt", str(threads),
    ]
    if redo:
        cmd.append("--redo")
    return cmd

# Population label -> hex colour used for that population in the figures.
# The insertion order of this dict defines the order of `groups` and `palette`.
map_colors = {
    "A. jacksonii": "#1f77b4",  # blue
    "A. sp. T31": "#ff7f0e",    # orange
    "A. sp. jack6": "#2ca02c",  # green
    "A. sp. jack5": "#d62728",  # red
    "A. sp. jack3": "#9467bd",  # purple
    "A. sp. jack2": "#8c564b",  # brown
    "A. sp. jack1": "#e377c2",  # pink
    "A. sp. F11": "#7f7f7f",    # gray
}

# Parallel lists kept for code that wants the labels or the colours alone.
groups = list(map_colors.keys())
palette = list(map_colors.values())

In [None]:
# Samples to exclude: every sample listed in the admixture table is dropped
# from the population table below.
admixed_sample_set = pd.read_csv("../data/var/admixture/admixture_individuals.csv")

# short location names
sample_loc = pd.read_csv("../data/samples/samples_short_loc.csv")

# Population table: add a per-sample colour, expose `sample` as a column,
# remove the excluded samples and renumber the rows.
pops = (
    clustering.read_populations("../data/samples/populations.txt")
    .assign(colormap=lambda df: df["populations_clean"].map(map_colors))
    .reset_index("sample")
    .loc[lambda df: ~df["sample"].isin(admixed_sample_set["sample"])]
    .reset_index(drop=True)
)

In [None]:
# Attach the short location labels to each sample.
# Any location columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not produce `_x` / `_y` suffixed
# duplicates (which would break later lookups of `sample_loc`).
loc_cols = ["abbreviated_loc", "sample_loc"]
pops = pops.drop(columns=loc_cols, errors="ignore").merge(
    sample_loc[["sample", *loc_cols]],
    on="sample",
    how="left",  # keep every sample in `pops`, even without a location entry
)
pops.head()

In [None]:
# Convert the filtered VCF into the SNP alignment, leaving out the admixed
# samples. The output goes to `base_alignment` so this cell and the IQ-TREE
# cell that reads the alignment cannot drift apart.
vcf_parser.vcf_to_snp_fasta(
    vcf_path="../data/var/filtered_variants.vcf.gz",
    output=base_alignment,
    drop_samples=admixed_sample_set["sample"].tolist(),
)

The IQ-TREE run in the next cell 👇 is expected to fail. Even so, it generates a cleaned PHYLIP file (`snp_concat.fasta.varsites.phy`) that we can then use in a separate IQ-TREE run.

In [None]:
# First IQ-TREE pass on the raw SNP alignment. The exit status is deliberately
# not checked: this run is expected to fail, and only its log is inspected.
base_run = subprocess.run(
    iqtree_cmd(base_alignment),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

for stream_name, stream_text in (("STDOUT", base_run.stdout), ("STDERR", base_run.stderr)):
    print(f"{stream_name}:\n", stream_text)


In [None]:
# Second IQ-TREE pass, on the cleaned alignment.
result = subprocess.run(iqtree_cmd(cleaned_alignment), capture_output=True, text=True)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)

# Unlike the first pass on the raw alignment, this run has to succeed: the
# tree-drawing cell reads its output. Fail here (after the logs are printed)
# rather than later on a missing or stale tree file.
result.check_returncode()

In [None]:
import warnings
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
from IPython.display import Image, display

# Silence UserWarnings raised from within ete3.
warnings.filterwarnings("ignore", category=UserWarning, module="ete3")

# Tree file sitting next to the cleaned alignment
# (same path as before: "<cleaned_alignment>.treefile").
treefile = f"{cleaned_alignment}.treefile"
t = Tree(treefile, format=1)
# Root on the midpoint outgroup, then ladderize for a tidy drawing.
t.set_outgroup(t.get_midpoint_outgroup())
t.ladderize(direction=1)

# Index the population table by sample so leaves can be looked up by name.
# Guarded so that re-running this cell does not raise a KeyError once
# `sample` has already been moved into the index.
if pops.index.name != "sample":
    pops = pops.set_index("sample")

# Copy taken before any faces/styles are attached to `t`.
tb = t.copy()

# Style overrides applied to every node of the tree.
branch_style = {
    "size": 0,           # Hide node circles
    "hz_line_width": 3,  # Increase horizontal line width
    "vt_line_width": 3,  # Increase vertical line width
}

for node in t.traverse():
    # A fresh NodeStyle per node, with the shared overrides applied.
    node_style = NodeStyle()
    for style_key, style_value in branch_style.items():
        node_style[style_key] = style_value

    if node.is_leaf():
        # Leaves are labelled with the sample's location text, drawn in the
        # colour of the sample's population.
        leaf_face = TextFace(
            pops.loc[node.name, "sample_loc"],
            fsize=8,
            fgcolor=pops.loc[node.name, "colormap"],
            ftype="Sans",
            fstyle="italic",
            bold=False,
        )
        node.add_face(leaf_face, column=0)

    node.set_style(node_style)

# Put each population's name above the branch leading to its clade.
for pop, color in map_colors.items():
    samples = pops.index[pops["populations_clean"] == pop].tolist()
    if not samples:
        # No sample of this population is left in the table, so there is no
        # clade to label.
        continue
    if len(samples) == 1:
        # With a single target, get_common_ancestor() compares it with the
        # node it is called on (here the root), so the label would land on
        # the root. Label the leaf itself instead.
        clade_node = t.get_leaves_by_name(samples[0])[0]
    else:
        clade_node = t.get_common_ancestor(samples)
    clade_node.add_face(
        TextFace(pop, fsize=8, fgcolor=color, ftype="Sans", bold=False),
        column=0,
        position="branch-top",
    )

# Shared rendering options for the tree figures.
ts = TreeStyle()
ts.scale = 4000  # horizontal scale setting for the drawing
ts.show_scale = True
# Built-in leaf names are turned off; the coloured leaf labels are added as
# TextFace objects instead.
ts.show_leaf_name = False

In [None]:
# Render the plain copy of the tree as SVG and the labelled tree as PNG,
# then show the PNG inline.
display_png = "../data/figs/concat-tree-display.png"

tb.render("../data/figs/concat-tree-black.svg", tree_style=ts)
t.render(display_png, tree_style=ts)
display(Image(filename=display_png))
