In [6]:
from ete3 import Tree
import os
import glob
import pandas as pd

# Directory that is joined with each dataset name to locate its folder.
BASE_DIR = "."

# Dataset folder names in reporting order: the three "dna" sets first,
# then the three "aa" sets, each going from easy to hard.
datasets = [
    f"{alphabet}_{difficulty}"
    for alphabet in ("dna", "aa")
    for difficulty in ("easy", "medium", "hard")
]

def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, else False.

    Only the errors ``float()`` raises for unconvertible input are treated
    as "not a float": ``TypeError`` (wrong type, e.g. ``None``),
    ``ValueError`` (unparsable string) and ``OverflowError`` (integer too
    large). Anything else, such as ``KeyboardInterrupt``, propagates instead
    of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def _read_final_loglik(run_path):
    """Return the final log-likelihood recorded in ``run_path``, or None.

    The value is parsed from the last non-blank line of the first
    ``*_logl.out`` file (in sorted order) inside ``run_path``. None is
    returned when no such file exists, when it cannot be read, or when
    that line is not a usable number.
    """
    logl_files = sorted(glob.glob(os.path.join(run_path, "*_logl.out")))
    if not logl_files:
        return None
    try:
        with open(logl_files[0], "r") as f:
            # Ignore blank lines so a trailing newline at the end of the
            # file does not hide the final value.
            values = [line.strip() for line in f if line.strip()]
        ll = float(values[-1])
    except (OSError, ValueError, IndexError):
        return None
    # NaN cannot be ordered against other likelihoods; treat it as missing.
    if ll != ll:
        return None
    return ll


def _pick_best_tree(param_path):
    """Return the tree file of the best run below ``param_path``, or None.

    A run is a sub-directory that contains a ``*_tree.newick`` file. Runs
    are ranked by their final log-likelihood, highest first; runs without a
    usable likelihood rank last. None is returned when no run has a tree.
    """
    candidates = []
    # Sorted listings make the choice reproducible when several runs tie or
    # several files match a pattern.
    for runfolder in sorted(os.listdir(param_path)):
        run_path = os.path.join(param_path, runfolder)
        if not os.path.isdir(run_path):
            continue

        tree_files = sorted(glob.glob(os.path.join(run_path, "*_tree.newick")))
        if not tree_files:
            continue

        candidates.append((_read_final_loglik(run_path), tree_files[0]))

    if not candidates:
        return None

    # Explicit None test: a likelihood of exactly 0.0 is falsy and must not
    # be confused with a missing value.
    _, best_tree_file = max(
        candidates,
        key=lambda c: (c[0] is not None,
                       c[0] if c[0] is not None else float("-inf")),
    )
    return best_tree_file


# One dict per (dataset, parameter) pair; turned into a table afterwards.
rows = []

for dataset in datasets:
    dataset_path = os.path.join(BASE_DIR, dataset)
    if not os.path.isdir(dataset_path):
        print(f"‚ö†Ô∏è Dataset folder missing: {dataset}")
        continue

    print(f"\nüìÇ DATASET: {dataset}")

    # Step 1: find the best tree for each parameter.
    # Parameter folders are the sub-directories whose name parses as a
    # float; they are visited in ascending numeric order.
    params = sorted(
        (p for p in os.listdir(dataset_path)
         if os.path.isdir(os.path.join(dataset_path, p)) and is_float(p)),
        key=float,
    )

    best_trees = {}
    for param in params:
        best_tree_file = _pick_best_tree(os.path.join(dataset_path, param))
        if best_tree_file is not None:
            best_trees[param] = best_tree_file

    if not best_trees:
        continue

    # Step 2: the reference tree is the best tree of the numerically
    # smallest parameter.
    precise_param = min(best_trees, key=float)
    ref_tree_file = best_trees[precise_param]
    ref_tree = Tree(ref_tree_file)

    print(f"üèÜ Reference Parameter (Epsilon): {precise_param}")

    # Step 3: Robinson-Foulds distance of every best tree to the reference.
    for param, tree_file in best_trees.items():
        t = Tree(tree_file)
        # NOTE(review): robinson_foulds() is called with its defaults, which
        # presumably compare the trees as rooted; if the tree files are
        # unrooted, unrooted_trees=True may be what is intended -- confirm.
        rf, max_rf, *_ = ref_tree.robinson_foulds(t)
        # Guard against division by zero when no split can differ.
        norm_rf = rf / max_rf if max_rf > 0 else 0

        comment = "Reference (Highest Precision)" if param == precise_param else "Comparison"

        rows.append({
            "Dataset": dataset,
            "Epsilon": param,
            "Tree_File_Path": tree_file,
            "RF_Distance_Absolute": rf,
            "RF_Distance_Max_Possible": max_rf,
            "RF_Distance_Normalized": norm_rf,
            "Note": comment
        })

# Final table. The columns are named explicitly so the frame has the
# expected layout even when no dataset produced any rows (an empty list
# would otherwise give a frame without a "Dataset" column).
output_columns = [
    "Dataset",
    "Epsilon",
    "Tree_File_Path",
    "RF_Distance_Absolute",
    "RF_Distance_Max_Possible",
    "RF_Distance_Normalized",
    "Note",
]
df = pd.DataFrame(rows, columns=output_columns)

# Keep the datasets in the order given by `datasets`.
df["Dataset"] = pd.Categorical(df["Dataset"], categories=datasets, ordered=True)

# Order the parameters by numeric value. "Epsilon" holds folder-name
# strings, and a plain string sort would put e.g. "0.001" before "1e-05".
# The helper column is only used for sorting and is dropped again.
df = (
    df.assign(_epsilon_value=df["Epsilon"].astype(float))
    .sort_values(["Dataset", "_epsilon_value"])
    .drop(columns="_epsilon_value")
    .reset_index(drop=True)
)

# --- Save the table as a CSV file ---
# Name of the output file, written relative to the working directory.
output_filename = "Best_Tree_RF_Distances.csv"

# Write the table without the row index.
df.to_csv(output_filename, index=False)

print(f"\n‚úÖ Erfolgreich gespeichert als: {output_filename}")
print("---")

# Last expression of the cell: shows the table in the notebook.
df


üìÇ DATASET: dna_easy
üèÜ Reference Parameter (Epsilon): 0.0001

üìÇ DATASET: dna_medium
üèÜ Reference Parameter (Epsilon): 0.0001

üìÇ DATASET: dna_hard
üèÜ Reference Parameter (Epsilon): 0.0001

üìÇ DATASET: aa_easy
üèÜ Reference Parameter (Epsilon): 0.0001

üìÇ DATASET: aa_medium
üèÜ Reference Parameter (Epsilon): 0.0001

üìÇ DATASET: aa_hard
üèÜ Reference Parameter (Epsilon): 0.0001

‚úÖ Erfolgreich gespeichert als: Best_Tree_RF_Distances.csv
---


Unnamed: 0,Dataset,Epsilon,Tree_File_Path,RF_Distance_Absolute,RF_Distance_Max_Possible,RF_Distance_Normalized,Note
0,dna_easy,0.0001,.\dna_easy\0.0001\1761852001013978_out\1761852...,0,12,0.0,Reference (Highest Precision)
1,dna_easy,0.0005,.\dna_easy\0.0005\1761841117152047_out\1761841...,0,12,0.0,Comparison
2,dna_easy,0.001,.\dna_easy\0.001\1761768389588478_out\17617683...,4,12,0.333333,Comparison
3,dna_easy,0.005,.\dna_easy\0.005\1761831853859383_out\17618318...,2,12,0.166667,Comparison
4,dna_easy,0.01,.\dna_easy\0.01\1761809190249939_out\176180919...,0,12,0.0,Comparison
5,dna_easy,0.1,.\dna_easy\0.1\1761860499615071_out\1761860499...,2,12,0.166667,Comparison
6,dna_easy,0.5,.\dna_easy\0.5\1761891836236543_out\1761891836...,0,12,0.0,Comparison
7,dna_medium,0.0001,.\dna_medium\0.0001\1761911024216573_out\17619...,0,82,0.0,Reference (Highest Precision)
8,dna_medium,0.0005,.\dna_medium\0.0005\1761993693324494_out\17619...,2,82,0.02439,Comparison
9,dna_medium,0.001,.\dna_medium\0.001\1761904159307516_out\176190...,2,82,0.02439,Comparison
