# Bacterial dataset #1

## Protein domains

For all bacteria in `db_proka`:

- Retrieve all Pfam & TIGR domains present in at least 10% of all genomes (~5,000 out of 50,000)
- Binarization (presence or absence in genome)

## Phylogeny

- Create binary variables for all `gtdb_order`

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

cwd = os.getcwd()
if cwd.endswith('notebook'):
    os.chdir('..')
    cwd = os.getcwd()

In [2]:
sns.set_theme(palette='colorblind', font_scale=1.3)
palette = sns.color_palette().as_hex()

data_folder = Path('./data/')
assert data_folder.is_dir()

db_proka = Path('../db_proka/')
assert db_proka.is_dir()

## Load GTDB metadata file

In [3]:
gtdb_metadata = pd.read_csv(db_proka / 'gtdb_metadata.csv')
gtdb_metadata['domain'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[0].replace('d__', ''))
gtdb_metadata['gtdb_phylum'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[1].replace('p__', ''))
gtdb_metadata['gtdb_class'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[2].replace('c__', ''))
gtdb_metadata['gtdb_order'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[3].replace('o__', ''))
gtdb_metadata['gtdb_family'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[4].replace('f__', ''))
gtdb_metadata['gtdb_genus'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[5].replace('g__', ''))
gtdb_metadata['gtdb_species'] = gtdb_metadata['gtdb_taxonomy'].apply(lambda t: t.split(';')[6].replace('s__', ''))
gtdb_metadata = gtdb_metadata.set_index('ncbi_accession', drop=True)
bacterial_genomes = gtdb_metadata[gtdb_metadata['domain'] == 'Bacteria']

In [4]:
n_bacteria = len(bacterial_genomes)
print(f'Number of bacterial genomes: {n_bacteria:,}')

n_phyla = len(bacterial_genomes['gtdb_phylum'].unique())
print(f'Number of phyla: {n_phyla:,}')

n_class = len(bacterial_genomes['gtdb_class'].unique())
print(f'Number of class: {n_class:,}')

n_order = len(bacterial_genomes['gtdb_order'].unique())
print(f'Number of order: {n_order:,}')

n_families = len(bacterial_genomes['gtdb_family'].unique())
print(f'Number of families: {n_families:,}')

n_genera = len(bacterial_genomes['gtdb_genus'].unique())
print(f'Number of genera: {n_genera:,}')

n_species = len(bacterial_genomes['gtdb_species'].unique())
print(f'Number of species: {n_species:,}')

Number of bacterial genomes: 50,640
Number of phyla: 181
Number of class: 490
Number of order: 1,653
Number of families: 4,304
Number of genera: 19,144
Number of species: 50,640


### Genome selection

Keep only genomes at least 90% complete.

In [5]:
bacterial_index = bacterial_genomes[bacterial_genomes['checkm_completeness'] > 90].index

p = 100 * len(bacterial_index) / n_bacteria
print(f'Number of genomes at least 90% complete: {len(bacterial_index):,} out of {n_bacteria:,} ({p:.0f} %)')

Number of genomes at least 90% complete: 32,549 out of 50,700 (64 %)


## Process Pfam summary table

In [10]:
pfam_summary = pd.read_csv(db_proka / 'Pfam-A_summary.tsv.gz', sep='\t', index_col='assembly_accession')

In [7]:
pfam_summary = pfam_summary.drop(columns=['asm_name'])

pfam_bacteria = pfam_summary.loc[
    sorted(set(pfam_summary.index) & set(bacterial_index))
]

pfam_domains = pfam_bacteria.columns

print(f'Number bacterial records in Pfam: {len(pfam_bacteria):,}')
print(f'Number Pfam domains: {len(pfam_domains):,}')

Number bacterial records in Pfam: 32,507
Number Pfam domains: 19,632


### Domain selection

Get domains present at least in 10% of all genomes & at least not in one genome.

In [8]:
sum_df = (pfam_bacteria > 0).sum()
relevant_domains = pfam_bacteria.columns[
    (sum_df >= 0.1 * len(pfam_bacteria)) &
    (sum_df  < 1.0 * len(pfam_bacteria))
]
print(f'Number of Pfam domains present in at least 10% of all genomes: {len(relevant_domains):,}')

Number of Pfam domains present in at least 10% of all genomes: 3,981


In [9]:
binarized_dataset = (pfam_bacteria[relevant_domains] > 0).astype(int)
binarized_dataset.to_csv(data_folder / 'pfam_bacteria.csv')

## Process TIGR summary table

In [10]:
tigr_summary = pd.read_csv(db_proka / 'TIGR_summary.tsv.gz', sep='\t', index_col='assembly_accession')

tigr_summary = tigr_summary.drop(columns=['asm_name'])

tigr_bacteria = tigr_summary.loc[
    sorted(set(tigr_summary.index) & set(bacterial_index))
]

tigr_domains = tigr_bacteria.columns

print(f'Number bacterial records in TIGR: {len(tigr_bacteria):,}')
print(f'Number TIGR domains: {len(tigr_domains):,}')

Number bacterial records in TIGR: 32,507
Number TIGR domains: 4,488


In [11]:
TIGR_sum_df = (tigr_bacteria > 0).sum()
relevant_TIGR_domains = tigr_bacteria.columns[
    (TIGR_sum_df >= 0.1 * len(tigr_bacteria)) &
    (TIGR_sum_df  < 1.0 * len(tigr_bacteria))
]
print(f'Number of TIGR domains present in at least 10% of all genomes: {len(relevant_TIGR_domains):,}')

Number of TIGR domains present in at least 10% of all genomes: 1,551


In [12]:
TIGR_binarized_dataset = (tigr_bacteria[relevant_TIGR_domains] > 0).astype(int)
TIGR_binarized_dataset.to_csv(data_folder / 'TIGR_bacteria.csv')

## Phylogeny

In [6]:
tax_order_df = gtdb_metadata.loc[bacterial_index][['gtdb_order']]
tax_order_df.head()

Unnamed: 0_level_0,gtdb_order
ncbi_accession,Unnamed: 1_level_1
GCA_002774355.1,UBA2242
GCA_002779395.1,0-14-0-80-59-12
GCA_018897875.1,Desulfobaccales
GCA_016875015.1,Desulfobaccales
GCA_023229795.1,Desulfobaccales


In [7]:
tax_order_bin = pd.get_dummies(
    tax_order_df['gtdb_order'], 
    prefix='o', 
    prefix_sep='_',
).astype(int)
tax_order_bin.index = tax_order_df.index
tax_order_bin = tax_order_bin.reset_index().rename(
    columns={'ncbi_accession': 'assembly_accession'}
).set_index('assembly_accession', drop=True)

In [8]:
tax_order_bin.to_csv(data_folder / 'taxonomy_order_bacteria.csv')

### Full taxonomy

In [14]:
full_taxonomy = gtdb_metadata.loc[bacterial_index][[
    'gtdb_phylum', 'gtdb_class', 'gtdb_order', 'gtdb_family', 'gtdb_genus', 'gtdb_species'
]]

tax_col_and_prefixes = [
    ('gtdb_phylum', 'p__'), 
    ('gtdb_class', 'c__'), 
    ('gtdb_order', 'o__'), 
    ('gtdb_family', 'f__'), 
    ('gtdb_genus', 'g__'), 
    ('gtdb_species', 's__'),
]

for tax_col, prefix in tax_col_and_prefixes:
    full_taxonomy[tax_col] = full_taxonomy[tax_col].apply(
        lambda t: f'{prefix}{t}'
    )

full_taxonomy.head()

Unnamed: 0_level_0,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
ncbi_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GCA_002774355.1,p__Marinisomatota,c__UBA2242,o__UBA2242,f__UBA2242,g__0-14-0-20-45-22,s__0-14-0-20-45-22 sp002774355
GCA_002779395.1,p__Bacteroidota,c__0-14-0-80-59-12,o__0-14-0-80-59-12,f__0-14-0-80-59-12,g__0-14-0-80-59-12,s__0-14-0-80-59-12 sp002779395
GCA_018897875.1,p__Desulfobacterota,c__Desulfobaccia,o__Desulfobaccales,f__0-14-0-80-60-11,g__0-14-0-80-60-11,s__0-14-0-80-60-11 sp018897875
GCA_016875015.1,p__Desulfobacterota,c__Desulfobaccia,o__Desulfobaccales,f__0-14-0-80-60-11,g__0-14-0-80-60-11,s__0-14-0-80-60-11 sp016875015
GCA_023229795.1,p__Desulfobacterota,c__Desulfobaccia,o__Desulfobaccales,f__0-14-0-80-60-11,g__0-14-0-80-60-11,s__0-14-0-80-60-11 sp023229795


In [15]:
full_taxonomy = full_taxonomy.reset_index().rename(
    columns={'ncbi_accession': 'assembly_accession'}
).set_index('assembly_accession', drop=True)
full_taxonomy.head()

Unnamed: 0_level_0,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GCA_002774355.1,p__Marinisomatota,c__UBA2242,o__UBA2242,f__UBA2242,g__0-14-0-20-45-22,s__0-14-0-20-45-22 sp002774355
GCA_002779395.1,p__Bacteroidota,c__0-14-0-80-59-12,o__0-14-0-80-59-12,f__0-14-0-80-59-12,g__0-14-0-80-59-12,s__0-14-0-80-59-12 sp002779395
GCA_018897875.1,p__Desulfobacterota,c__Desulfobaccia,o__Desulfobaccales,f__0-14-0-80-60-11,g__0-14-0-80-60-11,s__0-14-0-80-60-11 sp018897875
GCA_016875015.1,p__Desulfobacterota,c__Desulfobaccia,o__Desulfobaccales,f__0-14-0-80-60-11,g__0-14-0-80-60-11,s__0-14-0-80-60-11 sp016875015
GCA_023229795.1,p__Desulfobacterota,c__Desulfobaccia,o__Desulfobaccales,f__0-14-0-80-60-11,g__0-14-0-80-60-11,s__0-14-0-80-60-11 sp023229795


In [16]:
full_taxonomy.to_csv(data_folder / 'taxonomy_bacteria.csv')