In [1]:
import os

import pandas as pd

In [2]:
# Project working directory; metadata and derived tables are read from and written under `{wd}/data`.
# NOTE(review): both paths are absolute filesystem locations — adjust them when running elsewhere.
wd = "/data/gpfs/projects/punim1989/global_ocean_phylogenomics"
# Directory holding the `sandpiper0.3.0.condensed.csv` table read by `process_metadata`.
data_dir = "/data/gpfs/projects/punim2504/data"

In [3]:
def process_metadata():
    """Restrict the metadata and the Sandpiper table to the runs they share.

    Reads ``{wd}/data/metadata_1446.csv`` and the tab-separated
    ``{data_dir}/sandpiper0.3.0.condensed.csv`` (first column used as the
    index in both). Metadata rows are kept when their ``sra_run`` value
    appears in the Sandpiper index, and the Sandpiper rows for those runs are
    selected by label, following the order of the kept metadata rows.

    Side effects: the filtered metadata is written back to the same
    ``metadata_1446.csv`` path it was read from (the file is overwritten in
    place), and the selected Sandpiper rows are written to
    ``{wd}/data/sandpiper_1446_long.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(metadata_subset, sandpiper_subset)``.
    """
    # Originally this had 1454, but excluded that file to avoid confusion
    metadata_path = f"{wd}/data/metadata_1446.csv"
    all_metadata = pd.read_csv(metadata_path, index_col=0)
    profiles = pd.read_csv(
        f"{data_dir}/sandpiper0.3.0.condensed.csv",
        sep="\t",
        index_col=0,
    )

    # Keep only the metadata rows whose run accession has a Sandpiper entry.
    profiled_runs = profiles.index.unique()
    has_profile = all_metadata["sra_run"].isin(profiled_runs)
    metadata_subset = all_metadata[has_profile]

    # Label-based lookup: returns every Sandpiper row of each retained run.
    sandpiper_subset = profiles.loc[metadata_subset["sra_run"]]

    metadata_subset.to_csv(metadata_path)
    sandpiper_subset.to_csv(f"{wd}/data/sandpiper_1446_long.csv")

    return metadata_subset, sandpiper_subset

# Run the filtering step; `sandpiper_subset` is the input to `split_lineages` further down.
metadata_subset, sandpiper_subset = process_metadata()

In [4]:
def split_lineages(counts, outdir):
    """Write one sample-by-taxon coverage table per taxonomic rank.

    Each ``taxonomy`` string is split on ``"; "``. The first field is
    discarded and the remaining fields are labelled, in order, domain,
    phylum, class, order, family, genus and species. For every rank the
    ``coverage`` values are summed per (sample, taxon) pair, missing
    combinations are filled with 0.0, and the wide table is written to
    ``{outdir}/rank_{rank}.csv`` (tab-separated, despite the ``.csv``
    extension).

    Parameters
    ----------
    counts : pandas.DataFrame
        Long-format table with ``taxonomy`` and ``coverage`` columns. Its
        index must be named ``sample``, because the pivot groups on a column
        of that name after ``reset_index``.
    outdir : str
        Destination directory. It is created if it does not exist yet.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a taxonomy string has more fields than the leading field plus
        the seven ranks.

    Notes
    -----
    Rows whose taxonomy stops above a given rank have no label at that rank
    and therefore do not contribute to that rank's table. A rank for which
    no row has a label yields a table with no samples and no taxa.
    """
    ranks = ["domain", "phylum", "class", "order", "family", "genus", "species"]

    fields = counts["taxonomy"].str.split("; ", expand=True)
    if fields.shape[1] > len(ranks) + 1:
        raise ValueError(
            f"Expected at most {len(ranks) + 1} '; '-separated taxonomy fields, "
            f"found {fields.shape[1]}."
        )

    # Drop the leading field, then pad with empty columns up to the seven
    # ranks: when no lineage in `counts` is resolved down to species, the split
    # yields fewer columns and naming them would otherwise fail.
    lineage = fields.iloc[:, 1:].reindex(columns=pd.RangeIndex(1, len(ranks) + 1))
    lineage.columns = ranks
    # Positional assignment: the sample index repeats (one row per taxon), so
    # the values are attached row-for-row rather than aligned by label.
    lineage["coverage"] = counts["coverage"].to_numpy()

    # The long table is identical for every rank, so build it once.
    long_table = lineage.reset_index()

    os.makedirs(outdir, exist_ok=True)

    # Build and write each table in turn so only one wide table is held in
    # memory at a time.
    for rank in ranks:
        if long_table[rank].notna().any():
            table = long_table.pivot_table(
                index="sample",
                columns=rank,
                values="coverage",
                aggfunc="sum",
            ).fillna(0.0).astype(float)
        else:
            # No row carries a label at this rank: emit an empty table
            # instead of pivoting on an all-missing column.
            table = pd.DataFrame(index=pd.Index([], name="sample"), dtype=float)

        outfile = f"{outdir}/rank_{rank}.csv"
        print(f"Writing '{rank}' table to '{outfile}'.")
        table.to_csv(outfile, sep="\t", index=True, header=True)


In [5]:
# Write the per-rank coverage tables for the filtered samples under `{wd}/data/rank_tables`.
split_lineages(sandpiper_subset, f"{wd}/data/rank_tables")

Writing 'domain' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_domain.csv'.
Writing 'phylum' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_phylum.csv'.
Writing 'class' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_class.csv'.
Writing 'order' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_order.csv'.
Writing 'family' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_family.csv'.
Writing 'genus' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_genus.csv'.
Writing 'species' table to '/data/gpfs/projects/punim1989/global_ocean_phylogenomics/data/rank_tables/rank_species.csv'.
