In [1]:
import os
import csv
from os.path import join

# Relative directory names for the reference profiles and for each tool's output.
gold_standards_dir = "truths"
lyrebird_dir = "output_lyrebird/lyrebird"
metaphlan_dir = "output_metaphlan4"
phanta_dir = "output_phanta"

# Sample identifiers "marine0" through "marine9", in ascending order.
sample_ids = ["marine" + str(n) for n in range(10)]

In [2]:
# Per-sample truth count: the number of lines in "<sample>.condensed" after the
# first line (presumably a header row -- confirm against the file format).
truths = {}
for sample in sample_ids:
    condensed_path = join(gold_standards_dir, f"{sample}.condensed")
    with open(condensed_path, "r") as handle:
        # Drop the first line; the default keeps an empty file at a count of 0.
        next(handle, None)
        truths[sample] = sum(1 for _ in handle)
truths

{'marine0': 389,
 'marine1': 450,
 'marine2': 429,
 'marine3': 605,
 'marine4': 428,
 'marine5': 504,
 'marine6': 510,
 'marine7': 433,
 'marine8': 532,
 'marine9': 414}

In [3]:
# Per-sample Lyrebird count: how many lines of "<sample>.profile" contain the
# substring "s__" anywhere on the line.
lyrebird_counts = {}
for sample in sample_ids:
    profile_path = join(lyrebird_dir, f"{sample}.profile")
    with open(profile_path, "r") as handle:
        lyrebird_counts[sample] = sum(1 for row in handle if "s__" in row)
lyrebird_counts

{'marine0': 400,
 'marine1': 469,
 'marine2': 445,
 'marine3': 582,
 'marine4': 424,
 'marine5': 502,
 'marine6': 504,
 'marine7': 416,
 'marine8': 499,
 'marine9': 437}

In [4]:
# Per-sample MetaPhlAn count: the number of lines in "<sample>.profile" after
# the first four (presumably header/comment lines -- confirm against the
# profile format produced by the MetaPhlAn run).
metaphlan_counts = {}
for sample in sample_ids:
    profile_path = join(metaphlan_dir, f"{sample}.profile")
    with open(profile_path, "r") as handle:
        rows = list(handle)
    # A file with four or fewer lines yields 0, never a negative count.
    metaphlan_counts[sample] = max(len(rows) - 4, 0)
metaphlan_counts

{'marine0': 27,
 'marine1': 30,
 'marine2': 33,
 'marine3': 48,
 'marine4': 28,
 'marine5': 31,
 'marine6': 32,
 'marine7': 20,
 'marine8': 25,
 'marine9': 26}

In [5]:
# Per-sample Phanta count: the number of lines in the merged relative-abundance
# table after the first line (presumably a header row -- confirm).
phanta_counts = {}
for sample in sample_ids:
    abundance_path = join(phanta_dir, f"{sample}_output/final_merged_outputs/relative_taxonomic_abundance.txt")
    with open(abundance_path, "r") as table:
        # Count every line except the one at index 0.
        phanta_counts[sample] = sum(1 for line_no, _ in enumerate(table) if line_no > 0)
phanta_counts

{'marine0': 214,
 'marine1': 258,
 'marine2': 251,
 'marine3': 306,
 'marine4': 238,
 'marine5': 284,
 'marine6': 273,
 'marine7': 218,
 'marine8': 300,
 'marine9': 245}

In [6]:
# Write "species_count_to_truths.csv": a header row, then one row per sample
# (in sample_ids order) holding each tool's count divided by that sample's
# truth count. Rows carry no sample identifier; they are positional.
#
# newline="" is required by the csv module for files passed to csv.writer:
# the writer emits its own "\r\n" terminators, and without newline="" the
# text layer would translate them again on platforms whose line separator is
# not "\n", producing "\r\r\n". Plain "w" is enough because the file is never
# read back through this handle.
with open("species_count_to_truths.csv", "w", newline="") as w:
    writer = csv.writer(w)
    writer.writerow(["lyrebird", "metaphlan4", "phanta"])
    for sample in sample_ids:
        # Raises ZeroDivisionError if a sample has a truth count of 0.
        truth_count = truths[sample]
        writer.writerow([
            lyrebird_counts[sample] / truth_count,
            metaphlan_counts[sample] / truth_count,
            phanta_counts[sample] / truth_count,
        ])