# Differential presence of CWB domains at genus level

Check for the presence of cell wall binding domains (CWB) in monoderm genera. 

In particular, we are interested in cases where most species, but not all, encode a specific CWB domain.

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

# When the kernel starts in a directory whose path ends with 'notebook', move
# one level up so the relative paths used later resolve from the parent
# directory. `cwd` always holds the resulting working directory.
if os.getcwd().endswith('notebook'):
    os.chdir('..')
cwd = os.getcwd()

In [2]:
# Plot styling: 'colorblind' palette with enlarged fonts; the active palette
# is also kept as a list of hex colour strings.
sns.set_theme(palette='colorblind', font_scale=1.3)
palette = sns.color_palette().as_hex()

# Input locations, relative to the working directory. Fail early if one of
# them is missing.
data_folder = Path('./data/')
db_proka = Path('../db_proka/')
for required_folder in (data_folder, db_proka):
    assert required_folder.is_dir()

## Load GTDB metadata file

In [24]:
gtdb_metadata = pd.read_csv(db_proka / 'gtdb_metadata.csv')

# (new column, rank prefix) pairs, listed in the order in which the ranks
# appear in the ';'-separated `gtdb_taxonomy` string.
taxonomy_ranks = [
    ('domain', 'd__'),
    ('gtdb_phylum', 'p__'),
    ('gtdb_class', 'c__'),
    ('gtdb_order', 'o__'),
    ('gtdb_family', 'f__'),
    ('gtdb_genus', 'g__'),
    ('gtdb_species', 's__'),
]

# One column per rank: take the rank's field and strip its prefix.
# `position` and `prefix` are bound as lambda defaults so that each lambda
# uses the values of its own iteration.
for position, (rank_column, prefix) in enumerate(taxonomy_ranks):
    gtdb_metadata[rank_column] = gtdb_metadata['gtdb_taxonomy'].apply(
        lambda t, position=position, prefix=prefix: t.split(';')[position].replace(prefix, '')
    )

# Rows are addressed by NCBI accession from here on.
gtdb_metadata = gtdb_metadata.set_index('ncbi_accession', drop=True)

## Focus on Bacillota (Firmicutes) & Actinomycetota

In [80]:
# Every GTDB phylum whose name contains one of the seed names
# (this also matches suffixed phyla such as 'Bacillota_A').
phylum_seeds = ['Bacillota', 'Actinomycetota']
monoderm_phyla = sorted(
    phylum
    for phylum in gtdb_metadata['gtdb_phylum'].unique()
    if any(seed in phylum for seed in phylum_seeds)
)

# Keep monoderm assemblies that are either flagged 'Complete Genome' by NCBI
# or have a CheckM completeness above 98.
in_monoderm_phylum = gtdb_metadata['gtdb_phylum'].isin(monoderm_phyla)
is_complete_genome = gtdb_metadata['ncbi_assembly_level'] == 'Complete Genome'
is_highly_complete = gtdb_metadata['checkm_completeness'] > 98
monoderms_cg = gtdb_metadata[in_monoderm_phylum & (is_complete_genome | is_highly_complete)]

print(f'Number of species: {len(monoderms_cg):,}')
print(f'Number of genera : {len(monoderms_cg["gtdb_genus"].unique()):,}')

Number of species: 5,475
Number of genera : 1,932


In [83]:
# Number of distinct accessions per genus, largest genera first.
# Result: one row per genus, with a single 'ncbi_accession' count column.
n_species_per_genus = (
    monoderms_cg
    .reset_index()
    .groupby('gtdb_genus')[['ncbi_accession']]
    .nunique()
    .sort_values('ncbi_accession', ascending=False)
)


In [84]:
# Minimum number of genomes a genus needs in order to be analysed. The same
# value drives both the filter and the printed label, so they cannot drift.
min_genomes_per_genus = 10

# Genera with at least `min_genomes_per_genus` genomes, in alphabetical order.
# (The filter previously used `== 10`, which contradicted the "or more" label
# and dropped any genus with more than 10 genomes.)
candidate_genera = sorted(
    n_species_per_genus.index[
        n_species_per_genus['ncbi_accession'] >= min_genomes_per_genus
    ].tolist()
)

print(f'Number of genera with {min_genomes_per_genus} complete genomes or more: {len(candidate_genera)}')

candidate_genera

Number of genera with 10 complete genomes or more: 106


['Actinokineospora',
 'Actinoplanes',
 'Actinosynnema',
 'Aerococcus',
 'Aeromicrobium',
 'Agrococcus',
 'Alicyclobacillus',
 'Amycolatopsis',
 'Anaerobacillus_A',
 'Arthrobacter_D',
 'Bacillus',
 'Bacillus_A',
 'Blautia_A',
 'Brevibacillus',
 'Carnobacterium_A',
 'Cellulomonas',
 'Cellulosimicrobium',
 'Choladocola',
 'Clostridium',
 'Clostridium_AD',
 'Clostridium_B',
 'Corynebacterium',
 'Cryobacterium',
 'DTU059',
 'Domibacillus',
 'Enterocloster',
 'Enterococcus',
 'Enterococcus_A',
 'Enterococcus_B',
 'Exiguobacterium',
 'Exiguobacterium_A',
 'Faecalibacterium',
 'Fictibacillus',
 'Frigoribacterium',
 'Galliscardovia',
 'Geodermatophilus',
 'Glutamicibacter',
 'Glycomyces',
 'Gordonia',
 'Gottfriedia',
 'Gracilibacillus',
 'Halanaerobium',
 'Halobacillus',
 'Halobacillus_A',
 'Herbiconiux',
 'Isoptericola',
 'Jeotgalibacillus',
 'Kocuria',
 'Kribbella',
 'Lactiplantibacillus',
 'Lactobacillus',
 'Leifsonia',
 'Lentibacillus',
 'Lentilactobacillus',
 'Lentzea',
 'Leucobacter',
 'L

## Cell wall binding domains landscape

In [49]:
# Names of the cell wall binding (CWB) domains of interest, grouped by family.
# They are used as column names when subsetting the Pfam summary table, and
# the list order defines the column order of the resulting tables.
cwb_domains = [
    # Peptidoglycan-binding
    'PG_binding_1', 'PG_binding_2', 'PG_binding_3',
    'AMIN', 'SPOR',
    # SH3 variants
    'SH3_1', 'SH3_2', 'SH3_3', 'SH3_4', 'SH3_5',
    'SH3_6', 'SH3_7', 'SH3_9', 'SH3_16',
    'LysM',
    # Choline-binding
    'Choline_bind_1', 'Choline_bind_2', 'Choline_bind_3',
    # Others
    'CW_binding_2', 'CW_7', 'PSA_CBD', 'ZoocinA_TRD', 'GW', 'OapA',
]

In [50]:
# Load the Pfam summary, a large table. Only the accession column and the CWB
# domain columns are parsed (`usecols`) instead of reading every column and
# discarding most of them afterwards, which keeps load time and memory down.
# Rows are indexed by assembly accession; the trailing selection puts the
# columns in the order of `cwb_domains` (`usecols` keeps the file's order).
cwb_summary = pd.read_csv(
    db_proka / 'Pfam-A_summary.tsv.gz',
    sep='\t',
    index_col='assembly_accession',
    usecols=['assembly_accession'] + cwb_domains,
)[cwb_domains]

In [85]:
# Taxonomic rank columns (taken from `monoderms_cg`) that precede the domain
# columns in the per-assembly table.
taxonomy_columns = [
    'domain',
    'gtdb_phylum',
    'gtdb_class',
    'gtdb_order',
    'gtdb_family',
    'gtdb_genus',
    'gtdb_species',
]

# Position of each genus in `candidate_genera`; used to group the rows genus
# by genus, in the same order as that list.
genus_rank = {genus: rank for rank, genus in enumerate(candidate_genera)}

# Taxonomy of every assembly that belongs to a candidate genus. The stable
# sort keeps the row order of `monoderms_cg` within each genus.
candidate_taxonomy = (
    monoderms_cg
    .loc[monoderms_cg['gtdb_genus'].isin(candidate_genera), taxonomy_columns]
    .sort_values('gtdb_genus', key=lambda genera: genera.map(genus_rank), kind='stable')
    .rename_axis('assembly_accession')
)

# Presence/absence matrix: 1 where the Pfam summary value is greater than 0,
# else 0. Built as a new frame rather than by assigning into a slice of
# `cwb_summary`. Selecting with a list of labels raises a KeyError if an
# accession is missing from the Pfam summary.
cwb_presence = (
    cwb_summary.loc[candidate_taxonomy.index.tolist(), cwb_domains] > 0
).astype(int)

# Final per-assembly table: taxonomy columns followed by the domain flags,
# indexed by assembly accession.
cwb_df = candidate_taxonomy.join(cwb_presence)
cwb_df.head()

Unnamed: 0_level_0,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species,PG_binding_1,PG_binding_2,PG_binding_3,...,LysM,Choline_bind_1,Choline_bind_2,Choline_bind_3,CW_binding_2,CW_7,PSA_CBD,ZoocinA_TRD,GW,OapA
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_000564855.1,Bacteria,Actinomycetota,Actinomycetia,Mycobacteriales,Pseudonocardiaceae,Actinokineospora,Actinokineospora spheciospongiae,1,0,0,...,1,0,0,0,0,0,0,0,0,0
GCF_001940455.1,Bacteria,Actinomycetota,Actinomycetia,Mycobacteriales,Pseudonocardiaceae,Actinokineospora,Actinokineospora bangkokensis,1,0,0,...,1,0,0,0,0,0,0,0,0,0
GCF_002934265.1,Bacteria,Actinomycetota,Actinomycetia,Mycobacteriales,Pseudonocardiaceae,Actinokineospora,Actinokineospora auranticolor,1,1,0,...,1,0,0,0,0,0,0,0,0,0
GCF_004362515.1,Bacteria,Actinomycetota,Actinomycetia,Mycobacteriales,Pseudonocardiaceae,Actinokineospora,Actinokineospora alba,1,1,0,...,1,0,0,0,0,0,0,0,0,0
GCF_014323725.1,Bacteria,Actinomycetota,Actinomycetia,Mycobacteriales,Pseudonocardiaceae,Actinokineospora,Actinokineospora xionganensis,1,1,0,...,1,0,0,0,0,0,0,0,0,0


In [86]:
# Per (phylum, genus): count the distinct assemblies, and sum each CWB domain
# column of `cwb_df` over the assemblies of the genus.
agg_dict = {'assembly_accession': 'nunique', **dict.fromkeys(cwb_domains, 'sum')}

cwb_genus_summary = (
    cwb_df
    .reset_index()
    .groupby(['gtdb_phylum', 'gtdb_genus'])
    .agg(agg_dict)
    .rename(columns={'assembly_accession': 'n_species'})
)

cwb_genus_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,n_species,PG_binding_1,PG_binding_2,PG_binding_3,AMIN,SPOR,SH3_1,SH3_2,SH3_3,SH3_4,...,LysM,Choline_bind_1,Choline_bind_2,Choline_bind_3,CW_binding_2,CW_7,PSA_CBD,ZoocinA_TRD,GW,OapA
gtdb_phylum,gtdb_genus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Actinomycetota,Actinokineospora,10,10,6,0,0,1,0,0,0,0,...,10,0,0,0,1,0,0,0,0,0
Actinomycetota,Actinoplanes,10,10,6,4,0,0,0,0,1,0,...,10,0,0,0,0,0,0,0,0,0
Actinomycetota,Actinosynnema,10,10,9,0,0,1,0,0,3,0,...,10,0,0,0,0,0,0,0,0,0
Actinomycetota,Aeromicrobium,10,7,0,0,0,0,0,0,7,0,...,10,0,0,0,0,0,0,0,0,0
Actinomycetota,Agrococcus,10,6,0,1,0,0,0,0,0,0,...,9,0,0,0,8,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bacillota_A,Faecalibacterium,10,10,0,0,0,1,0,0,3,2,...,10,0,0,2,0,0,0,0,0,0
Bacillota_A,Mobilitalea,10,10,2,0,0,7,0,0,10,7,...,10,4,1,4,1,0,0,0,2,0
Bacillota_A,Peptoniphilus_A,10,0,0,0,0,0,0,0,6,0,...,10,0,0,0,1,3,0,0,0,0
Bacillota_A,Peptostreptococcus,10,0,0,0,0,3,1,0,10,10,...,3,10,1,10,10,0,0,0,0,0


In [87]:
# Make sure the output sub-folder exists before writing: only `data_folder`
# itself is checked earlier, and `to_csv` does not create missing directories.
output_folder = data_folder / 'differential_presence'
output_folder.mkdir(parents=True, exist_ok=True)

# Per-assembly presence table and per-genus summary.
cwb_df.to_csv(output_folder / 'cwb_presence_assembly.csv')
cwb_genus_summary.to_csv(output_folder / 'cwb_presence_genus.csv')

In [92]:
# Per-assembly LysM / PG_binding_1 presence flags within the genus Streptococcus.
is_streptococcus = cwb_df['gtdb_genus'] == 'Streptococcus'
cwb_df.loc[is_streptococcus, taxonomy_columns + ['LysM', 'PG_binding_1']]

Unnamed: 0_level_0,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species,LysM,PG_binding_1
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GCA_002831545.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus suis_P,1,0
GCA_900637675.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus equi,0,0
GCF_000186445.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus agalactiae,1,0
GCF_000294495.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus suis,1,0
GCF_000785515.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius,1,0
GCF_001457635.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus pneumoniae,1,0
GCF_002055535.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus pyogenes,1,0
GCF_006739205.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus mutans,1,0
GCF_010120595.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus thermophilus,1,0
GCF_902702775.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus suis_W,1,0


In [91]:
# Species name and assembly-quality metadata for a single accession.
quality_columns = ['gtdb_species', 'ncbi_assembly_level', 'checkm_completeness']
gtdb_metadata.loc['GCA_900637675.1', quality_columns]

gtdb_species           Streptococcus equi
ncbi_assembly_level       Complete Genome
checkm_completeness                 98.06
Name: GCA_900637675.1, dtype: object

In [104]:
# Per-assembly LysM / PG_binding_1 presence flags within the genus Lactobacillus.
is_lactobacillus = cwb_df['gtdb_genus'] == 'Lactobacillus'
cwb_df.loc[is_lactobacillus, taxonomy_columns + ['LysM', 'PG_binding_1']]

Unnamed: 0_level_0,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species,LysM,PG_binding_1
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GCF_000014425.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus gasseri,1,1
GCF_000159355.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus johnsonii,1,1
GCF_000160855.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus helveticus,1,0
GCF_000160875.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus iners,0,1
GCF_001433875.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus delbrueckii,1,0
GCF_001436455.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus jensenii,1,0
GCF_001591845.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus acidophilus,1,0
GCF_002706375.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus amylovorus,1,0
GCF_003584685.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus paragasseri,1,1
GCF_018987235.1,Bacteria,Bacillota,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus crispatus,1,1
