In [1]:
import pandas as pd
import taxoniq
from Bio import SeqIO

In [24]:
# Counts table and covered-bases table, each indexed by the Sample_Name column.
d = pd.read_csv('./mt93_counts.tsv', sep="\t").set_index("Sample_Name")
b = pd.read_csv('./mt93_coveredbases.tsv', sep="\t").set_index("Sample_Name")

# Metadata table (SRA run info); it is merged onto the outputs further down.
m = pd.read_csv('../METADATA/SRA_run_info_acc_Apr16.tsv', sep="\t")

# Zero out every cell whose covered-base value is below 430, in both tables.
# The mask is taken from `b` once, before `b` itself is overwritten.
low_coverage = b < 430
d[low_coverage] = 0
b[low_coverage] = 0

In [25]:
# Reshape the covered-bases table to long form: one row per (sample, contig).
# reset_index() returns a new frame, so `b` is left untouched; aliasing it
# (`d_melt = b`) and then assigning a column would add 'Sample_Name' to `b`.
d_melt = b.reset_index().melt(id_vars='Sample_Name', var_name='Contig', value_name='Bases_covered')
# Same 430-base threshold that is applied when the tables are loaded.
# .copy() makes the result independent, so the column assignments below do not
# raise SettingWithCopyWarning.
d_melt = d_melt.query("Bases_covered >= 430").copy()

# Map each FASTA record id to the 2nd and 3rd whitespace-separated tokens of
# its description (after removing the " UNVERIFIED: " marker).
id2name = {
    record.id: ' '.join(record.description.replace(" UNVERIFIED: ", " ").split()[1:3])
    for record in SeqIO.parse('./mt_derep98.fasta', 'fasta')
}

d_melt['Species'] = d_melt['Contig'].map(id2name)

# Look up the taxonomic class of every species that belongs to kingdom Metazoa.
# `species2group` holds species -> class name; `not_found` collects species whose
# lookup failed or whose lineage has no 'class' rank.
species2group = {}
not_found = set()
# Contigs absent from the FASTA map to NaN; there is no name to look up for them.
for genome in d_melt.Species.dropna().unique():
    try:
        taxon = taxoniq.Taxon(scientific_name=genome)
        ranks = [(node.rank.name, node.scientific_name) for node in taxon.ranked_lineage]
    except Exception:
        # Lookup failed for this name. `Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        not_found.add(genome)
        continue

    if ('kingdom', 'Metazoa') not in ranks:
        continue

    # Resolve the class per species; never reuse a value left over from a
    # previous loop iteration.
    class_name = next((name for rank, name in ranks if rank == 'class'), None)
    if class_name is None:
        not_found.add(genome)
        continue
    species2group[genome] = class_name

d_melt['Class'] = d_melt.Species.map(species2group)
# Drop rows without a class (non-Metazoa, failed lookups, missing species name).
d_melt = d_melt[~d_melt.Class.isna()].copy()

# Coarse display group per class; any class not listed here becomes "Other".
groupings = {
    "Mammalia": "Mammal",
    "Actinopteri": "Fish",
    "Aves": "Bird",
    "Bivalvia": "Bivalves",
    "Lepidosauria": "Reptile",
    "Malacostraca": "Crustacean",
}
# fillna("Other") leaves no missing groups, so no further NaN filter is needed.
d_melt['Group'] = d_melt.Class.map(groupings).fillna("Other")

# Total covered bases per contig (summed over samples), largest first.
group_keys = ['Contig', 'Species', 'Class', 'Group']
species = (
    d_melt[group_keys + ['Bases_covered']]
    .groupby(group_keys)
    .sum()
    .reset_index()
    .sort_values("Bases_covered", ascending=False)
)

# Column label for each contig: "<contig id> <species name>".
species2name = dict(zip(species['Contig'], species['Contig'] + " " + species['Species']))

In [26]:
def _select_and_label(table):
    """Keep the contig columns listed in `species` (in that order), relabel
    them through `species2name`, and move the index back into a column."""
    subset = table[species.Contig]
    subset.columns = subset.columns.map(species2name)
    return subset.reset_index()


b = _select_and_label(b)
d = _select_and_label(d)

In [28]:
# Save the per-contig summary table as TSV, without the row index.
species.to_csv("species_descriptions.tsv", sep="\t", index=False)

In [29]:
# Attach the selected metadata columns to each per-sample table and write the
# result as TSV. The right join keeps every row of the per-sample table, even
# when its Sample_Name has no matching Run in the metadata.
run_metadata = m[['Run','Lab code', 'Sample_category', 'Stall_corrected']]
for table, out_path in (
    (d, "mitochondrial_metazoa_counts_93.tsv"),
    (b, "mitochondrial_metazoa_coveredbases_93.tsv"),
):
    merged = pd.merge(run_metadata, table, left_on='Run', right_on='Sample_Name', how='right')
    merged.to_csv(out_path, sep="\t", index=False)