In [None]:
import re
import numpy as np
import warnings
from importlib import reload
warnings.filterwarnings("ignore")
import os
import subprocess
from glob import glob
import pandas as pd
from ete3 import Tree

from jacksonii_analyses import bpp

In [None]:
# Re-import the project-local `bpp` helper module so that edits made to it on disk
# are picked up without restarting the kernel.
reload(bpp)

In [None]:
# --- Input / output locations (all relative to the notebook's working directory) ---
# Directory holding the per-locus FASTA files.
fasta_loci_path = "../data/phylo/loci"
# Directory that receives every BPP input/output file written by this notebook.
bpp_output_path = "../data/phylo/bpp"
# Concatenated sequence file written for BPP.
bpp_fasta_output = f"{bpp_output_path}/loci_data.txt"
# Per-partition model table written for BPP.
bpp_model_output = f"{bpp_output_path}/models.txt"
# Tab-separated sample -> population map; it is read (not written) by this notebook,
# so it must already exist inside bpp_output_path.
imap_file = f"{bpp_output_path}/imap.txt"
# Per-locus FASTA files matching g*_0.fas; glob() returns them in arbitrary order.
fasta_files = glob(f"{fasta_loci_path}/g*_0.fas")
# Newick file with the guide species tree.
input_guide_tree = "../data/phylo/guide_species_tree_rerooted.nwk"

In [None]:
# Table of substitution models; the "model" column is read here and the "gene"
# column is used by later cells.
models_df = pd.read_csv("../data/phylo/best_substitution_models.csv")
models_list = list(models_df["model"].unique())
# Drop everything from the first "+" onward so only the base model name remains.
# The pattern is a raw string: "\+" in a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
models_set = {re.sub(r"\+.*", "", model_name) for model_name in models_list}
models_set

In [None]:
# Rules translating base model names into the names stored later in the
# `model_bpp` column. Each rule is (substrings to look for, replacement name);
# rules are tried in order and only the first matching rule is applied per model.
_BPP_EQUIVALENTS = (
    (("K2P", "K3P"), "K80"),
    (("TIM", "TPM"), "GTR"),
    (("TN",), "TN93"),
    (("HKY",), "HKY"),
)

model_mapping = {}
for model in models_set:
    for prefixes, bpp_name in _BPP_EQUIVALENTS:
        if any(prefix in model for prefix in prefixes):
            # A hit on any substring of a rule registers every substring of that rule.
            model_mapping.update(dict.fromkeys(prefixes, bpp_name))
            break
    # Models matching no rule are simply left out of the mapping.
model_mapping

In [None]:
def _locus_span(group):
    """Format the smallest and largest 1-based locus index of a group as "first-last"."""
    return f"{min(group['index'])}-{max(group['index'])}"


# Tag every locus with its mapped model name (substring match on the "model" column).
# NOTE(review): rows whose model matches no key keep None; pandas groupby drops
# missing keys by default, so such loci would not appear in the table below — confirm
# that every model in the input is covered by model_mapping.
models_df["model_bpp"] = None
for model in model_mapping.keys():
    has_key = [model in model_name for model_name in models_df["model"]]
    models_df.loc[has_key, "model_bpp"] = model_mapping[model]

# Sort so that loci sharing a model are contiguous, then number them from 1.
models_df = models_df.sort_values(by=["model_bpp", "gene"]).reset_index(drop=True)
models_df["index"] = models_df.index.to_numpy() + 1

# One row per model: locus index range, data type, model name.
groupped_models = (
    models_df.groupby("model_bpp")
    .apply(_locus_span)
    .reset_index()
    .rename(columns={0: "loci"})
    .assign(data_type="DNA")
    .loc[:, ["loci", "data_type", "model_bpp"]]
)
groupped_models


In [None]:
# Parse the guide species tree; ete3 newick format 1 is the flexible format that
# accepts internal node names.
sp_tree = Tree(input_guide_tree, format=1)
# Serialise it again with format 9, which keeps leaf names only (no branch lengths,
# supports or internal node names).
sptree_str = sp_tree.write(format=9)

In [None]:
os.makedirs(bpp_output_path, exist_ok=True)

# Sample -> population map, minus the individuals listed as admixed.
pops = pd.read_csv(imap_file, sep="\t", header=None, names=["sample", "population"])
excluded = pd.read_csv("../data/var/admixture/admixed_individuals.csv")
is_excluded = pops["sample"].isin(excluded["sample"])
pops = pops.loc[~is_excluded]

# Number of retained samples per population, in alphabetical population order.
sample_counts = pops["population"].value_counts().sort_index()
number_of_species = len(sample_counts)

# Right-align names and counts to the width of the longest population name so the
# two rows line up column by column.
max_text_length = pops["population"].str.len().max()
species_pad_list = [name.rjust(max_text_length) for name in sample_counts.index]
counts_pad_list = [str(count).rjust(max_text_length) for count in sample_counts]

# Species row starts with the species count; the other rows are indented by the
# width of that count plus two spaces so they start in the same column.
_indent = " " * (len(str(number_of_species)) + 2)
species_pad_str = f"{number_of_species}  " + " ".join(species_pad_list)
counts_pad_str = _indent + " ".join(counts_pad_list)
sptree_pad_str = _indent + sptree_str

# One "0" flag per species.
# NOTE(review): the original comment here read "for diploid organisms" — confirm
# against the meaning of BPP's phase flags.
phase_data = " ".join(["0"] * number_of_species)

print(species_pad_str, counts_pad_str, sptree_pad_str, sep="\n")

Concatenate all FASTA files into a [single Phylip file in BPP format](https://bpp.github.io/bpp-manual/bpp-4-manual/#sequence-file) 

In [None]:
# The loci written to the sequence file are exactly the genes listed in models_df,
# in models_df order (which is the order the locus index ranges in groupped_models
# refer to). The locus count is therefore taken from that list rather than from the
# number of FASTA files found on disk, so it always matches the file content.
loci_genes = models_df["gene"].tolist()
number_of_loci = len(loci_genes)
if number_of_loci != len(fasta_files):
    print(
        f"Note: {len(fasta_files)} FASTA files found in {fasta_loci_path}, "
        f"but {number_of_loci} loci are listed in models_df."
    )

with open(bpp_fasta_output, "w") as data_out:
    for genename in loci_genes:
        # Build the path from the configured directory instead of fasta_files[0],
        # which would raise IndexError when the glob matched nothing.
        fasta_file = os.path.join(fasta_loci_path, f"{genename}_0.fas")
        phylip = bpp.to_bpp_phylip(bpp.fasta_reader(fasta_file))
        data_out.write(phylip)

# Model table: one headerless row per model (loci range, data type, model name).
groupped_models.to_csv(bpp_model_output, header=False, index=False)

Assumes that data is unphased and runs delimitation without species tree co-estimation

In [None]:
# Both control files take the same inputs, so the keyword arguments are built once.
# Only base names are passed for the sequence and imap files.
_ctl_kwargs = dict(
    seqfile_path=os.path.basename(bpp_fasta_output),
    imapfile_path=os.path.basename(f"{bpp_output_path}/imap.txt"),
    initial_species=species_pad_str,
    individuals_in_species=counts_pad_str,
    guide_tree=sptree_pad_str,
    phase_data=phase_data,
    number_of_loci=number_of_loci,
)

a10_ctl_file = bpp.get_a10_ctl(**_ctl_kwargs)
a11_ctl_file = bpp.get_a11_ctl(**_ctl_kwargs)

In [None]:
# Write each generated control-file text next to the other BPP inputs.
for ctl_name, ctl_text in (("A10.ctl", a10_ctl_file), ("A11.ctl", a11_ctl_file)):
    with open(f"{bpp_output_path}/{ctl_name}", "w") as out:
        out.write(ctl_text)

Run BPP below...

In [None]:
# Run BPP with the A10 control file.
# The process is started with cwd=bpp_output_path instead of os.chdir(): this keeps
# the kernel's working directory untouched (even when the run fails) and removes the
# hard-coded absolute notebook path. bpp_output_path is resolved relative to the
# kernel's current directory, as in every other cell.
output_log_file = "A10_output.log"
output_log_path = os.path.join(bpp_output_path, output_log_file)

with open(output_log_path, "w") as f_out:
    try:
        subprocess.run(
            ["bpp", "--cfile", "A10.ctl"],
            cwd=bpp_output_path,       # control file and its inputs live here
            check=True,                # non-zero exit -> CalledProcessError
            stdout=f_out,              # redirect stdout to the log file
            stderr=subprocess.STDOUT,  # interleave stderr into the same log
        )
        print(f"Output saved to {output_log_file}")

    except subprocess.CalledProcessError as e:
        # NOTE(review): unlike the A11 cell, a failure is only reported here and not
        # re-raised, so later cells still run — confirm this is intended.
        print(f"Command failed with exit code {e.returncode}")
        print("See log files for details.")

In [None]:
# Run BPP with the A11 control file.
# The process is started with cwd=bpp_output_path instead of os.chdir(): previously
# the re-raise below skipped the final os.chdir() and left the kernel inside the BPP
# directory after a failed run, breaking every relative path afterwards. It also
# removes the hard-coded absolute notebook path. bpp_output_path is resolved relative
# to the kernel's current directory, as in every other cell.
output_log_file = "A11_output.log"
output_log_path = os.path.join(bpp_output_path, output_log_file)

with open(output_log_path, "w") as f_out:
    try:
        subprocess.run(
            ["bpp", "--cfile", "A11.ctl"],
            cwd=bpp_output_path,       # control file and its inputs live here
            check=True,                # non-zero exit -> CalledProcessError
            stdout=f_out,              # redirect stdout to the log file
            stderr=subprocess.STDOUT,  # interleave stderr into the same log
        )
        print(f"Output saved to {output_log_file}")

    except subprocess.CalledProcessError as e:
        print(f"Command failed with exit code {e.returncode}")
        print("See log files for details.")
        # Propagate the failure so a "Run All" stops here.
        raise