In [None]:
import os
import subprocess
import pandas as pd
from ete3 import Tree
from glob import glob
from concurrent.futures import ThreadPoolExecutor

In [None]:
def iqtree_cmd(fasta_path):
    """Assemble the IQ-TREE argument vector for one alignment.

    Parameters
    ----------
    fasta_path
        Path of the alignment file, handed to IQ-TREE through ``-s``.

    Returns
    -------
    list
        The executable name followed by its arguments, in the order they are
        passed on the command line (no shell quoting is applied).
    """
    executable = ["iqtree"]
    alignment = ["-s", fasta_path]
    # Model choice and the criterion it is scored with.
    model_selection = ["-m", "TEST", "--merit", "BIC"]
    bootstrap = ["-B", "1000"]
    switches = ["--polytomy", "--redo"]
    return executable + alignment + model_selection + bootstrap + switches


def run_iqtree(fasta_file):
    """Run IQ-TREE on one alignment and print how it went.

    The command line is built by ``iqtree_cmd``. A non-zero exit status is
    reported together with the captured stderr instead of being raised, so the
    function returns ``None`` in both the success and the failure case.
    """
    argv = iqtree_cmd(fasta_file)
    print(f"Running IQ-TREE for {fasta_file} with command: {' '.join(argv)}")
    completed = subprocess.run(argv, capture_output=True, text=True)

    if completed.returncode == 0:
        print(f"Successfully processed {fasta_file}")
        return
    print(f"Error processing {fasta_file}: {completed.stderr}")


def reroot_gene_tree(tree_path: str, outgroup: list) -> str:
    """Reroot one tree file with ``nw_reroot`` and return the tool's output.

    Parameters
    ----------
    tree_path
        Path of the tree file passed as the first argument to ``nw_reroot``.
    outgroup
        Labels appended after the tree path on the command line.

    Returns
    -------
    str
        Whatever ``nw_reroot`` wrote to stdout, decoded to text.

    Raises
    ------
    subprocess.CalledProcessError
        If ``nw_reroot`` exits with a non-zero status.
    """
    base_args = ["nw_reroot", tree_path]
    argv = base_args + outgroup
    completed = subprocess.run(argv, capture_output=True, check=True)
    rerooted_newick = completed.stdout.decode()
    return rerooted_newick

In [None]:
# Directory that is scanned for *.fas alignment files.
base_dir = "../data/phylo/loci/"
# os.listdir returns entries in arbitrary order; sorting keeps the run order
# and every table derived from fasta_files identical across re-runs and machines.
fasta_files = sorted(
    os.path.join(base_dir, f)
    for f in os.listdir(base_dir)
    if f.endswith(".fas")
)
# Preview only the first few paths instead of dumping the whole list.
fasta_files[:5]


In [None]:
# Leave five cores free for other work, but never ask for fewer than one
# worker: os.cpu_count() may return None, and ThreadPoolExecutor raises
# ValueError for max_workers <= 0 (which happens on machines with <= 5 cores).
threads = max(1, (os.cpu_count() or 1) - 5)
with ThreadPoolExecutor(max_workers=threads) as executor:
    futures = [executor.submit(run_iqtree, fasta_file) for fasta_file in fasta_files]

# The executor has already waited for all jobs when the with-block exits.
# Exceptions raised inside a worker (e.g. the iqtree executable is missing)
# are stored on the future and would otherwise vanish silently, so report them.
for fasta_file, future in zip(fasta_files, futures):
    worker_error = future.exception()
    if worker_error is not None:
        print(f"IQ-TREE job for {fasta_file} raised {worker_error!r}")

In [None]:
# Gather the first line of every *.treefile into a single multi-tree file,
# one tree per line.
with open("../data/phylo/genetrees.nwk", "w") as concat:
    for genetree_path in glob("../data/phylo/loci/*.treefile"):
        with open(genetree_path, "r") as genetree:
            newick = genetree.readline()
        # A tree file that lacks a trailing newline would otherwise be fused
        # with the next tree on the same output line.
        if newick and not newick.endswith("\n"):
            newick += "\n"
        concat.write(newick)

In [None]:
# Header-less table read with the column names "sample" and "population".
pops = pd.read_table(
    "../data/samples/populations.txt",
    header=None,
    names=["sample", "population"],
)
# Flag rows whose population label ends in "T31" or "F11"; their samples are
# the outgroup used for rerooting.
ends_with_t31 = pops["population"].str.endswith("T31")
ends_with_f11 = pops["population"].str.endswith("F11")
outgroup_idx = ends_with_t31 | ends_with_f11
outgroup_samples = pops.loc[outgroup_idx, "sample"].tolist()

Reroot gene trees

In [None]:
# Reroot each gene tree on the outgroup samples and append the result to one
# combined file, in the order glob returns the tree files.
treefile_paths = glob("../data/phylo/loci/*.treefile")
with open("../data/phylo/genetrees_rerooted.nwk", "w") as concat:
    for genetree_path in treefile_paths:
        rerooted_newick = reroot_gene_tree(genetree_path, outgroup_samples)
        concat.write(rerooted_newick)

Export best substitution models to table

In [None]:
# Collect the best-fit model reported in each locus' IQ-TREE log. Loci without
# a log file are skipped.
genes = []
best_models = []
for fasta_file in fasta_files:
    log_file = f"{fasta_file}.log"
    if not os.path.exists(log_file):
        continue
    with open(log_file, "r") as f:
        # Stop at the first matching line instead of loading the whole log.
        best_fit_line = next(
            (line for line in f if line.startswith("Best-fit model:")), None
        )
    # The model name is the third whitespace-separated token of that line
    # ("Best-fit", "model:", <model>, ...).
    fields = best_fit_line.split() if best_fit_line else []
    genes.append(os.path.basename(fasta_file).replace("_0.fas", ""))
    # Always record an entry (None when the log names no model) so that genes
    # and best_models stay the same length and remain row-aligned; previously
    # a log without a best-fit line left the two lists out of step.
    best_models.append(fields[2] if len(fields) > 2 else None)

models_df = pd.DataFrame({"gene": genes, "model": best_models})
# Matches the "--merit BIC" option the IQ-TREE runs were started with.
models_df["criterion"] = "BIC"
models_df.to_csv("../data/phylo/best_substitution_models.csv", index=False)
models_df