In [1]:
# --- Imports ---------------------------------------------------------------
import altair as alt
import numpy as np
import pandas as pd
from ete3 import NCBITaxa

# --- Configuration ---------------------------------------------------------
# Allow up to 500 rows to be shown when a frame is displayed.
pd.set_option('display.max_rows', 500)

# ete3 taxonomy handle used by the lineage look-ups further down.
# Uncomment the second line to refresh the taxonomy database.
ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

In [2]:
# Load the sheet "ST2 - Assembly full list" from the supplementary-tables workbook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
supplement_path = "/Users/anton/Downloads/Supplementary Tables (VGP-Galaxy).xlsx"
df = pd.read_excel(supplement_path, sheet_name="ST2 - Assembly full list")

# DANGER HERE
The slicing below is arbitrary: it hard-codes how many rows to keep and therefore depends on the current content of the spreadsheet. Check it every time the spreadsheet changes!

In [3]:
# The sheet is not harmonized yet, so some trimming is required:
# keep the first 73 rows and discard everything below them.
df = df.head(73)

In [4]:
# A species with a single assembly is labelled hap1: every value that is not
# already 'hap1' or 'hap2' (missing values included) is overwritten with 'hap1'.
not_haplotype_label = ~df['Assembly version'].isin(['hap1', 'hap2'])
df.loc[not_haplotype_label, 'Assembly version'] = 'hap1'

In [5]:
# Annotate every row with the phylum / class / order of its species, taken
# from the species' NCBI lineage.
# Maps the NCBI rank name to the df column that receives the taxon name.
rank_to_column = {'phylum': 'phylum', 'class': 'class_', 'order': 'order'}

species_names = df['NCBI Species'].unique()
name_to_taxids = ncbi.get_name_translator(species_names)

for sp, taxids in name_to_taxids.items():
    # Only the first taxid returned for a name is used.
    lineage = ncbi.get_lineage(taxids[0])
    # One batched rank query per lineage instead of one query per taxon.
    ranks = ncbi.get_rank(lineage)
    # Row mask is the same for every rank of this species, so build it once.
    is_species = df['NCBI Species'] == sp
    for taxon in lineage:
        column = rank_to_column.get(ranks.get(taxon))
        if column is not None:
            df.loc[is_species, column] = ncbi.translate_to_names([taxon])[0]

# Species the name lookup did not return get no annotation at all; report
# them so they are not silently mistaken for taxa that merely lack a rank.
unresolved = [name for name in species_names if name not in name_to_taxids]
if unresolved:
    print("WARNING: no NCBI taxid found for:", ", ".join(map(str, unresolved)))

# DANGER HERE! 
The NCBI lineage of some reptiles contains no taxon with rank `class`, so their `class_` value is still missing after the lookup above and has to be set to Reptilia manually:

In [7]:
# Rows still lacking a class after the lineage lookup are labelled Reptilia.
# NOTE(review): this also relabels any row the lookup loop left unannotated
# for other reasons — verify that only reptiles are affected.
df['class_'] = df['class_'].fillna("Reptilia")

In [8]:
# List every column name of the table, one per line.
for column_name in df.columns:
    print(column_name)

ToLID
Species
NCBI Species
Common name
Assembly version
Pipeline version
Curated
Total bp
Genome size¹
Sequencing depth
Heterozygosity¹
Repeat content¹
Sex (metadata)
Heterogametic
Mitogenome?
Assembler
Bionano
Draft assembly folder
Unnamed: 18
Curated assembly folder
Notes
# scaffolds
Total scaffold length
Average scaffold length
Scaffold N50
Scaffold auN
Scaffold L50
Scaffold NG50
Scaffold auNG
Scaffold LG50
Largest scaffold
Smallest scaffold
# contigs
Total contig length
Average contig length
Contig N50
Contig auN
Contig L50
Contig NG50
Contig auNG
Contig LG50
Largest contig
Smallest contig
# gaps in scaffolds
Total gap length in scaffolds
Average gap length in scaffolds
Gap N50 in scaffolds
Gap auN in scaffolds
Gap L50 in scaffolds
Largest gap in scaffolds
Smallest gap in scaffolds
Base composition (A:C:G:T)
GC content %
# soft-masked bases
# segments
Total segment length
Average segment length
# gaps
# paths
Merqury completeness (hifi kmers) of contigs in ONE HAP (if pri/alt, then

In [9]:
# List the distinct classes present in the table, one per line.
for class_name in df['class_'].unique():
    print(class_name)

Amphibia
Aves
Actinopteri
Mammalia
Lepidosauria
Reptilia
Chondrichthyes


In [10]:
# Phylogenetic rank of each class, used to put the classes in proper order
# (lower rank sorts first).
class_rank = dict(
    Actinopteri=2,
    Amphibia=3,
    Aves=6,
    Chondrichthyes=1,
    Lepidosauria=5,
    Mammalia=7,
    Reptilia=4,
)

In [11]:
# Write the numeric class rank into `c_rank` for every row of that class.
for class_name, rank in class_rank.items():
    df.loc[df['class_'] == class_name, 'c_rank'] = rank

In [12]:
# Sort the rows taxonomically, then let reset_index() store each row's
# position in an `index` column.
# Any `index` column left by a previous run is dropped first so the cell can
# be re-run: otherwise a second run keeps the stale `index` values (the new
# positions go to `level_0`) and a third run raises.
df = (
    df.drop(columns=['index'], errors='ignore')
    .sort_values(by=['c_rank','order','NCBI Species','Assembly version'], ignore_index=True)
    .reset_index()
)

# DANGER HERE

The following three `groupby` calls should be converted into a single function. They do slightly different things, hence the repetitive code.

In [13]:
# Row span of every taxonomic order in the sorted table:
#   mn     - smallest row position (`index`) within the order
#   mx     - largest row position + 1
#   mid    - midpoint between the smallest and largest row position
#   class_ - class value carried along for the order (max over the group)
# The aggregation is named by string ('max') rather than by passing the Python
# builtin, which recent pandas versions deprecate inside groupby.agg.
df_for_order_chart = df.groupby(
        'order'
    ).agg(
        mn = pd.NamedAgg(column='index',aggfunc ='min'),
        mx = pd.NamedAgg(column='index',aggfunc = lambda x: x.max()+1),
        mid = pd.NamedAgg(column='index',aggfunc = lambda x: (x.min()+((x.max()-x.min())/2))),
        class_ = pd.NamedAgg(column='class_',aggfunc = 'max')
    ).reset_index()

In [15]:
# Preview the first rows of the per-order span table.
df_for_order_chart.head()

Unnamed: 0,order,mn,mx,mid,class_
0,Anura,16,22,18.5,Amphibia
1,Artiodactyla,53,60,56.0,Mammalia
2,Atheriniformes,4,5,4.0,Actinopteri
3,Caprimulgiformes,33,34,33.0,Aves
4,Carnivora,60,61,60.0,Mammalia


In [16]:
# Row span of every class in the sorted table:
#   mn  - smallest row position (`index`) within the class
#   mx  - largest row position + 1
#   mid - midpoint between the smallest and largest row position
class_groups = df.groupby('class_')
df_for_class_chart = class_groups.agg(
    mn=('index', 'min'),
    mx=('index', lambda rows: rows.max() + 1),
    mid=('index', lambda rows: (rows.min() + ((rows.max() - rows.min()) / 2))),
).reset_index()

In [17]:
# Preview the first rows of the per-class span table.
df_for_class_chart.head()

Unnamed: 0,class_,mn,mx,mid
0,Actinopteri,4,16,9.5
1,Amphibia,16,22,18.5
2,Aves,33,53,42.5
3,Chondrichthyes,0,4,1.5
4,Lepidosauria,27,33,29.5


In [18]:
# One record per (species, assembly version):
#   mn / mx / mid - smallest row position, largest row position + 1, midpoint
#   species, class_, order - labels carried along (max over the group)
#   size, s_ng50, c_ng50, sGap - group maximum divided by 1,000,000
#   het, rep - group maximum of heterozygosity / repeat content
# Aggregations are named by string ('max') rather than by passing the Python
# builtin, which recent pandas versions deprecate inside groupby.agg.
df_for_species_chart = df.groupby(
        ['NCBI Species','Assembly version']
    ).agg(
        mn = pd.NamedAgg(column='index',aggfunc ='min'),
        mx = pd.NamedAgg(column='index',aggfunc = lambda x: x.max()+1),
        mid = pd.NamedAgg(column='index',aggfunc = lambda x: (x.min()+((x.max()-x.min())/2))),
        species = pd.NamedAgg(column='NCBI Species',aggfunc = 'max'),
        class_ = pd.NamedAgg(column='class_',aggfunc = 'max'),
        order = pd.NamedAgg(column='order',aggfunc = 'max'),
        size = pd.NamedAgg(column='Genome size¹',aggfunc = lambda x: (x.max()/1000000)),
        het = pd.NamedAgg(column='Heterozygosity¹',aggfunc = 'max'),
        rep = pd.NamedAgg(column='Repeat content¹',aggfunc = 'max'),
        s_ng50 = pd.NamedAgg(column='Scaffold NG50',aggfunc = lambda x: (x.max()/1000000)),
        c_ng50 = pd.NamedAgg(column='Contig NG50',aggfunc = lambda x: (x.max()/1000000)),
        sGap = pd.NamedAgg(column='Total gap length in scaffolds',aggfunc = lambda x: (x.max()/1000000)),
    ).reset_index()

In [19]:
# Preview the first rows of the per-species/assembly table.
df_for_species_chart.head()

Unnamed: 0,NCBI Species,Assembly version,mn,mx,mid,species,class_,order,size,het,rep,s_ng50,c_ng50,sGap
0,Acridotheres tristis,hap1,39,40,39.0,Acridotheres tristis,Aves,Passeriformes,1354.313347,0.558,30.0,29.697385,6.53036,3.114772
1,Ammodramus caudacutus,hap1,40,41,40.0,Ammodramus caudacutus,Aves,Passeriformes,1225.461744,0.236,21.4,39.973966,8.43839,4.089548
2,Ammodramus nelsoni,hap1,41,42,41.0,Ammodramus nelsoni,Aves,Passeriformes,1190.976425,0.571,21.3,40.15902,12.036358,5.142283
3,Ara ararauna,hap1,49,50,49.0,Ara ararauna,Aves,Psittaciformes,,,,,,
4,Ara ararauna,hap2,50,51,50.0,Ara ararauna,Aves,Psittaciformes,,,,,,


In [20]:
# Build the three label columns of the figure (class, order, species).
# Each column is a stack of rectangles spanning rows mn..mx, overlaid with a
# text label at `mid` that links to a Google search for the taxon name.

# Classes in order of first appearance in df, paired positionally with range_c.
domain_c = df['class_'].unique()
range_c = ['#d73027','#fc8d59','#fee090','#ffffbf','#e0f3f8','#91bfdb','#4575b4']


def _band_chart(data, title, width, shade_by_order=False):
    """Return the rectangle layer of one label column.

    data           -- frame with `mn`, `mx` and `class_` columns (plus `order`
                      when shade_by_order is set)
    title          -- column title
    width          -- column width in pixels
    shade_by_order -- additionally encode `order` as opacity
    """
    # NOTE(review): the domain upper bound 72 is hard-coded here and in `hmt`;
    # with 73 rows kept by the earlier `iloc[:73]` slice the last row spans
    # 72-73 — confirm that this is intended.
    encodings = {
        'y': alt.Y('mn:Q', scale=alt.Scale(domain=[0, 72], nice=False), axis=None),
        'y2': 'mx:Q',
        'color': alt.Color('class_:N', scale=alt.Scale(domain=domain_c, range=range_c), legend=None),
    }
    if shade_by_order:
        encodings['opacity'] = alt.Opacity('order:N', legend=None)
    return alt.Chart(data).mark_rect(
        opacity=1, stroke='black', strokeWidth=.5
    ).encode(
        **encodings
    ).properties(
        height=900,
        width=width,
        title=title,
    )


def _band_labels(bands, field, font_size, shade_by_order=False, **text_style):
    """Return the text layer for a column produced by `_band_chart`.

    bands          -- the rectangle chart the labels are derived from
    field          -- data field shown as label and used in the search link
    font_size      -- label font size
    shade_by_order -- set when `bands` encodes opacity; labels then get a
                      constant opacity of 1 instead of inheriting it
    text_style     -- extra keyword arguments passed to mark_text
    """
    encodings = {
        'y': 'mid:Q',
        'text': field + ':N',
        # Labels are black for every class.
        'color': alt.value('black'),
        'href': 'link:N',
    }
    if shade_by_order:
        encodings['opacity'] = alt.value(1)
    return bands.transform_calculate(
        link='https://www.google.com/search?q=' + getattr(alt.datum, field)
    ).mark_text(
        color='black', align='center', baseline="middle",
        fontSize=font_size, dy=-5, **text_style
    ).encode(
        **encodings
    )


class_chart = _band_chart(df_for_class_chart, title='Class', width=100)
class_text = _band_labels(class_chart, 'class_', font_size=12, fontWeight="bold")

order_chart = _band_chart(df_for_order_chart, title='Order', width=100, shade_by_order=True)
order_text = _band_labels(order_chart, 'order', font_size=10, shade_by_order=True)

species_chart = _band_chart(df_for_species_chart, title='Species', width=350, shade_by_order=True)
species_text = _band_labels(species_chart, 'species', font_size=8, shade_by_order=True, fontStyle="italic")

In [21]:
def hmt(df,col,y,y2,op,scheme,m_value,format_,wid,title,stitle,y_domain=(0, 72),height=900):
    """Build one heat-map column: coloured rectangles with the value printed on top.

    Parameters
    ----------
    df : data passed to alt.Chart.
    col : field encoded as rectangle colour and shown as text.
    y, y2 : encodings for the start and end of each rectangle.
    op : unused; kept so existing positional calls keep working.
    scheme : name of the colour scheme for `col`.
    m_value : labels are white where `col` exceeds this value, black otherwise.
    format_ : number format string for the text labels.
    wid : chart width.
    title, stitle : chart title and subtitle.
    y_domain : (min, max) of the y scale; defaults to the previously
        hard-coded (0, 72).
    height : chart height; defaults to the previously hard-coded 900.

    Returns
    -------
    The layered chart `rectangles + text`.
    """
    chart = alt.Chart(df,width=wid).mark_rect().encode(
        y = alt.Y(y,scale=alt.Scale(domain=list(y_domain),nice=False),axis=None),
        y2 = y2,
        color=alt.Color(col,scale=alt.Scale(scheme=scheme),legend=None),
    ).properties(
        height=height,
        title={ "text": title,"subtitle":stitle }
    )

    # The text layer is derived from the rectangle chart, so it shares its
    # data, size and title; only the mark and some encodings are replaced.
    text = chart.mark_text(color='black',align='center',baseline="middle",fontSize=8,dy=-5).encode(
        y = y,
        text=alt.Text(col,format=format_),
        opacity=alt.value(1),
        color=alt.condition(
            alt.datum[col] > m_value,
            alt.value("white"),
            alt.value("black")
        )
    )
    return chart + text

In [22]:
# Heat-map column for genome size.
size = hmt(
    df_for_species_chart, col='size', y='mn:Q', y2='mx:Q', op='order',
    scheme='goldred', m_value=4000, format_=",.0f", wid=60,
    title='Size', stitle='(Mb)',
)

In [23]:
# Heat-map column for heterozygosity.
het = hmt(
    df_for_species_chart, col='het', y='mn:Q', y2='mx:Q', op='order',
    scheme='yellowgreen', m_value=.5, format_=",.2f", wid=60,
    title='Het', stitle='(%)',
)

In [24]:
# Heat-map column for repeat content.
rep = hmt(
    df_for_species_chart, col='rep', y='mn:Q', y2='mx:Q', op='order',
    scheme='yellowgreen', m_value=50, format_=",.1f", wid=60,
    title='Repeat', stitle='(%)',
)

In [25]:
# Heat-map column for scaffold NG50.
s_ng50 = hmt(
    df_for_species_chart, col='s_ng50', y='mn:Q', y2='mx:Q', op='order',
    scheme='yellowgreen', m_value=4000, format_=",.0f", wid=60,
    title='Scaffold NG50', stitle='(Mb)',
)

In [26]:
# Heat-map column for contig NG50.
c_ng50 = hmt(
    df_for_species_chart, col='c_ng50', y='mn:Q', y2='mx:Q', op='order',
    scheme='yellowgreen', m_value=4000, format_=",.0f", wid=60,
    title='Contig NG50', stitle='(Mb)',
)

In [27]:
# Heat-map column for total gap length in scaffolds.
sGap = hmt(
    df_for_species_chart, col='sGap', y='mn:Q', y2='mx:Q', op='order',
    scheme='yellowgreen', m_value=4000, format_=",.0f", wid=60,
    title='Gaps', stitle='(Mb)',
)

In [28]:
# Left part of the figure: class, order and species label columns, each a
# rectangle layer with its text layer on top.
class_column = class_chart + class_text
order_column = order_chart + order_text
species_column = species_chart + species_text
names = alt.hconcat(class_column, order_column, species_column)

# Right part of the figure: the per-assembly statistic columns, left to right.
stat_columns = [size, het, rep, c_ng50, s_ng50, sGap]
stats = alt.hconcat(*stat_columns)

In [29]:
# Put the label columns and the statistic columns side by side with no gap
# between them, shrink the titles, and display the result as the cell output.
figure = names | stats
figure.configure_concat(spacing=0).configure_title(fontSize=9, subtitleFontSize=8)