# Phylogenetically balanced database of prokaryotes

Constructed as a subset from [GTDB release 214](https://gtdb.ecogenomic.org/).

Nickname: `DB_Proka`.

In [1]:
import os
from pathlib import Path

import pandas as pd
import requests

# When the kernel is started from a folder whose path ends with 'notebook',
# move one level up so that relative paths are resolved from the parent folder.
if os.getcwd().endswith('notebook'):
    os.chdir('..')

cwd = os.getcwd()

In [2]:
# Folder holding the input files, relative to the current working directory.
data_folder = Path('./data')
# Fail early if the folder is missing (e.g. the kernel was started from an unexpected directory).
assert data_folder.is_dir()

## Load GTDB metadata

In [3]:
# Subset of the GTDB metadata columns that is loaded, in the order they are kept.
columns = [
    # Identification
    'accession',
    'ncbi_organism_name',
    # CheckM quality estimates
    'checkm_completeness',
    'checkm_contamination',
    # Genome statistics
    'coding_bases',
    'coding_density',
    'contig_count',
    'gc_count',
    'gc_percentage',
    'genome_size',
    'protein_count',
    # GTDB flags
    'gtdb_representative',
    'gtdb_type_species_of_genus',
    # Taxonomy strings
    'gtdb_taxonomy',
    'ncbi_taxonomy',
]

def extract_taxonomy(tax, tax_one_letter_code):
    """
    Return the first entry of a semicolon-separated taxonomy string that
    starts with the given rank prefix (e.g. 'g' matches entries starting with 'g__').

    The entry is returned with its prefix, exactly as it appears in `tax`.
    Returns None when no entry carries the requested prefix.
    """
    prefix = f'{tax_one_letter_code}__'
    return next(
        (entry for entry in tax.split(';') if entry.startswith(prefix)),
        None,
    )
        
    
# Read the archaeal (ar53) and bacterial (bac120) metadata tables, keep only the
# columns of interest in a fixed order, and stack them into a single table.
gtdb_metadata = pd.concat(
    [
        pd.read_csv(
            data_folder / 'gtdb_r214.1' / metadata_file,
            sep='\t',
            usecols=columns,
        )[columns].reset_index(drop=True)
        for metadata_file in (
            'ar53_metadata_r214.tsv.gz',
            'bac120_metadata_r214.tsv.gz',
        )
    ],
    ignore_index=True,
)

# Split the GTDB lineage string into one column per rank of interest.
# Each value keeps its rank prefix (e.g. 'g__Halorubrum').
for rank_column, rank_code in (
    ('domain', 'd'),
    ('phylum', 'p'),
    ('genus', 'g'),
    ('species', 's'),
):
    gtdb_metadata[rank_column] = gtdb_metadata['gtdb_taxonomy'].apply(
        extract_taxonomy, args=(rank_code,)
    )

# The flag columns are compared against the string 't'; derive proper boolean
# columns with a vectorized comparison instead of a per-element lambda.
gtdb_metadata['gtdb_representative_b'] = gtdb_metadata['gtdb_representative'] == 't'
gtdb_metadata['gtdb_type_species_of_genus_b'] = gtdb_metadata['gtdb_type_species_of_genus'] == 't'

# Count the genomes assigned to each species, then attach that count to every genome.
species_stats = gtdb_metadata[['species', 'accession']].groupby('species').count()

# Build the species -> count lookup in one call rather than one `.loc` row lookup per species.
n_genomes_in_species = species_stats['accession'].to_dict()
gtdb_metadata['n_genomes_in_species'] = gtdb_metadata['species'].map(n_genomes_in_species)

# `map` yields NaN for a species missing from the lookup; make that failure explicit.
assert gtdb_metadata['n_genomes_in_species'].notna().all(), 'Some genomes have no species-level genome count'

print(f'Number of genomes: {len(gtdb_metadata):,}')
gtdb_metadata.head()

Number of genomes: 402,709


Unnamed: 0,accession,ncbi_organism_name,checkm_completeness,checkm_contamination,coding_bases,coding_density,contig_count,gc_count,gc_percentage,genome_size,...,gtdb_type_species_of_genus,gtdb_taxonomy,ncbi_taxonomy,domain,phylum,genus,species,gtdb_representative_b,gtdb_type_species_of_genus_b,n_genomes_in_species
0,RS_GCF_000246985.2,Thermococcus litoralis DSM 5473,99.5,0.5,2014456,90.939033,1,954455,43.088029,2215172,...,f,d__Archaea;p__Methanobacteriota_B;c__Thermococ...,d__Archaea;p__Euryarchaeota;c__Thermococci;o__...,d__Archaea,p__Methanobacteriota_B,g__Thermococcus_A,s__Thermococcus_A alcaliphilus,False,False,3
1,RS_GCF_000980135.1,Methanosarcina mazei,100.0,0.65,3081035,75.560622,193,1694808,41.570885,4077567,...,f,d__Archaea;p__Halobacteriota;c__Methanosarcini...,d__Archaea;p__Euryarchaeota;c__Methanomicrobia...,d__Archaea,p__Halobacteriota,g__Methanosarcina,s__Methanosarcina mazei,False,False,78
2,RS_GCF_000337075.1,Halorubrum hochstenium ATCC 700873,99.38,0.19,2629910,86.580487,64,2099820,69.130537,3037532,...,f,d__Archaea;p__Halobacteriota;c__Halobacteria;o...,d__Archaea;p__Euryarchaeota;c__Halobacteria;o_...,d__Archaea,p__Halobacteriota,g__Halorubrum,s__Halorubrum hochstenium,True,False,2
3,RS_GCF_000979515.1,Methanosarcina mazei,100.0,1.31,3083899,75.65795,215,1696947,41.632859,4076107,...,f,d__Archaea;p__Halobacteriota;c__Methanosarcini...,d__Archaea;p__Euryarchaeota;c__Methanomicrobia...,d__Archaea,p__Halobacteriota,g__Methanosarcina,s__Methanosarcina mazei,False,False,78
4,RS_GCF_000762265.1,Methanobacterium formicicum,100.0,0.0,2045404,83.486321,1,1012813,41.339525,2449987,...,f,d__Archaea;p__Methanobacteriota;c__Methanobact...,d__Archaea;p__Euryarchaeota;c__Methanobacteria...,d__Archaea,p__Methanobacteriota,g__Methanobacterium,s__Methanobacterium formicicum,False,False,10


## Select genomes

- All genera
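- Up to 10 species per genus, one genome per selected species
    - "type" species of the genus are selected first
    - otherwise, GTDB representative genomes are preferred, then species with more genomes (strains)
    - tiebreaks, in order: higher genome completeness, lower contamination, higher protein count, accession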

In [4]:
def make_selection(gtdb_metadata, n_species_per_genus=10):
    """
    Select one genome per species, for up to `n_species_per_genus` species in each genus.

    Genomes are ranked within each genus: type species of the genus first, then GTDB
    representatives, then species with more genomes, higher completeness, lower
    contamination, higher protein count, and finally accession as a deterministic
    tiebreak. The best-ranked genome of each species is kept and the first
    `n_species_per_genus` species of each genus are selected.

    One manual override is applied: for genus 'g__Halogranum', the genome selected
    for 's__Halogranum rubrum' is replaced by accession 'RS_GCF_000283335.1'.

    Parameters
    ----------
    gtdb_metadata : pd.DataFrame
        Must contain the columns 'genus', 'species', 'accession',
        'gtdb_type_species_of_genus_b', 'gtdb_representative_b',
        'n_genomes_in_species', 'checkm_completeness', 'checkm_contamination'
        and 'protein_count'. Rows without a genus are not selected.
    n_species_per_genus : int
        Maximum number of species kept per genus.

    Returns
    -------
    list
        Selected accessions, genus by genus in ascending genus order.

    Raises
    ------
    AssertionError
        If the Halogranum override does not leave exactly three accessions for
        that genus, or if the override accession is absent from the result.
    """
    accession_list = []

    gtdb_metadata_sorted = gtdb_metadata.sort_values(
        [
            'genus',
            'gtdb_type_species_of_genus_b',
            'gtdb_representative_b',
            'n_genomes_in_species',
            'checkm_completeness',
            'checkm_contamination',
            'protein_count',
            'accession',
        ],
        ascending=[
            True,  # genus
            False, # gtdb_type_species_of_genus_b: true first
            False, # gtdb_representative_b: true first
            False, # n_genomes_in_species: high number of genomes first
            False, # checkm_completeness: high completeness first
            True,  # checkm_contamination: small contaminations first
            False, # protein_count: large genomes first
            True,  # accession: if all else fails sort on accession
        ]
    )

    # Keep the best-ranked genome of every species. Because the genus is part of
    # the key, doing this once on the whole table is equivalent to doing it
    # genus by genus.
    best_genome_per_species = gtdb_metadata_sorted.drop_duplicates(['genus', 'species'])

    # We make one exception for Halogranum salarium B-1 (Halogranum rubrum in GTDB).
    # The selected strain for this species is not the one we studied prior to building this DB.
    # We manually select the strain that we want included instead.
    hgr_rubrum_accession = 'RS_GCF_000283335.1'

    # A single groupby pass replaces one full-table boolean mask per genus.
    # sort=False keeps the groups in their order of appearance, i.e. the
    # ascending genus order produced by the sort above.
    per_genus = best_genome_per_species.groupby('genus', sort=False)
    n_genera = per_genus.ngroups

    for i, (genus, genus_df) in enumerate(per_genus):
        if i == 0 or (i+1) % 10000 == 0 or (i+1) == n_genera:
            print(f'Genus {i+1:,} / {n_genera:,}')

        selection_df = genus_df.head(n_species_per_genus)

        if genus == 'g__Halogranum':
            genus_accessions = selection_df.loc[
                selection_df['species'] != 's__Halogranum rubrum',
                'accession',
            ].tolist()
            genus_accessions.append(hgr_rubrum_accession)
            # Sanity check: three accessions are expected for this genus once the override is applied.
            assert len(genus_accessions) == 3
        else:
            genus_accessions = selection_df['accession'].tolist()

        accession_list.extend(genus_accessions)

    assert hgr_rubrum_accession in accession_list

    return accession_list
        

# Build the shortlist of accessions from the full metadata table,
# using the default cap on the number of species per genus.
accessions = make_selection(gtdb_metadata)

Genus 1 / 20,739
Genus 10,000 / 20,739
Genus 20,000 / 20,739
Genus 20,739 / 20,739


## Stats

In [5]:
# Restrict the metadata to the selected accessions, then count genomes per domain.
gtdb_metadata_shortlist = (
    gtdb_metadata
    .loc[lambda df: df['accession'].isin(accessions)]
    .reset_index(drop=True)
)
gtdb_metadata_shortlist['domain'].value_counts()

domain
d__Bacteria    50713
d__Archaea      3708
Name: count, dtype: int64

Some of the accessions in GTDB release 214 no longer existed on NCBI at download time, so the final number of species is slightly lower than the counts above:

- Archaea:   3,706
- Bacteria: 50,640

## Downloading genomes

Genomes were downloaded and processed using the following package, which was also created as part of this work: https://github.com/srom/assembly