# Streptococcus comparative genomics

Comparing protein domain presence when a specific doamin such as LysM is present vs when it's not.

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from Bio import Phylo

cwd = os.getcwd()
if cwd.endswith('notebook'):
    os.chdir('..')
    cwd = os.getcwd()

from src.cell_wall_binding_domains import cwb_domains

In [None]:
sns.set_theme(palette='colorblind', font_scale=1.3)
palette_colorblind = sns.color_palette('colorblind').as_hex()
palette_pastel = sns.color_palette('pastel').as_hex()

data_folder = Path('./data/')
assert data_folder.is_dir()

db_proka = Path('../db_proka/')
assert db_proka.is_dir()

gtdb_folder = Path('../data/gtdb_r220/')
assert gtdb_folder.is_dir()

strep_folder = gtdb_folder / 'Streptococcus'
assert strep_folder.is_dir()

## Load data

In [3]:
metadata_df = pd.read_csv(strep_folder / 'genomes_metadata.csv', index_col='assembly_accession')
metadata_df.head()

Unnamed: 0_level_0,accession,ambiguous_bases,checkm2_completeness,checkm2_contamination,checkm2_model,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_900636555.1,RS_GCF_900636555.1,0,100.0,0.14,Specific,100.0,0.0,475,o__Lactobacillales (UID544),267,...,19,59,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus gordonii
GCF_015908985.1,RS_GCF_015908985.1,0,100.0,0.23,Specific,100.0,0.0,475,o__Lactobacillales (UID544),267,...,19,56,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus suis
GCF_001266635.1,RS_GCF_001266635.1,0,100.0,0.06,Specific,99.82,0.0,524,f__Streptococcaceae (UID545),282,...,18,63,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus agalactiae
GCF_004154025.1,RS_GCF_004154025.1,0,99.99,0.13,Specific,99.85,0.0,676,g__Streptococcus (UID722),182,...,19,67,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus pyogenes
GCF_029011635.1,RS_GCF_029011635.1,0,100.0,0.2,Specific,100.0,0.18,524,f__Streptococcaceae (UID545),282,...,19,80,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus agalactiae


In [4]:
pfam_df = pd.read_csv(strep_folder / 'Streptococcus_all_proteins.pfam.csv', index_col='assembly_accession')
tigr_df = pd.read_csv(strep_folder / 'Streptococcus_all_proteins.tigr.csv', index_col='assembly_accession')

## LysM in S. equi

We spotted a pattern of differential presence of LysM in certain strains of _Steptococcus equi_.

In [44]:
accessions_s_equi = metadata_df[
    metadata_df['gtdb_species'] == 'Streptococcus equi'
].index

pfam_s_equi = pfam_df.loc[accessions_s_equi]
tigr_s_equi = tigr_df.loc[accessions_s_equi]

accessions_yes_LysM = set(pfam_s_equi[
    pfam_s_equi['hmm_query'] == 'LysM'
].index)

accessions_no_LysM = set(accessions_s_equi) - accessions_yes_LysM

print(f'Number of S. equi genomes:              {len(accessions_s_equi):,}')
print(f'Number of S. equi genones with LysM:    {len(accessions_yes_LysM):,}')
print(f'Number of S. equi genones without LysM: {len(accessions_no_LysM):,}')

Number of S. equi genomes:              38
Number of S. equi genones with LysM:    4
Number of S. equi genones without LysM: 34


In [45]:
extra_in_no = (
    (
        set(pfam_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values)
    ) - 
    (
        set(pfam_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values)
    )
)
len(extra_in_no)

263

In [46]:
pfam_no_lysM = pfam_s_equi.loc[sorted(accessions_no_LysM)]
tigr_no_lysM = tigr_s_equi.loc[sorted(accessions_no_LysM)]

domain_with_count = []
for domain in extra_in_no:
    if 'TIGR' in domain:
        df = tigr_no_lysM
    else:
        df = pfam_no_lysM

    count = len(set(df[df['hmm_query'] == domain].index))
    percent = float(np.round(100 * count / len(accessions_no_LysM), 1))

    domain_with_count.append((domain, count, percent))

domain_with_count = sorted(domain_with_count, key=lambda t: t[1], reverse=True)
domain_with_count[:20]

[('TIGR03426', 26, 76.5),
 ('Malt_amylase_C', 24, 70.6),
 ('AAA_10', 21, 61.8),
 ('FokI_D3', 21, 61.8),
 ('FokI_D1', 21, 61.8),
 ('FokI_dom_2', 21, 61.8),
 ('FokI_cleav_dom', 21, 61.8),
 ('MethyltransfD12', 20, 58.8),
 ('Polysacc_synt_3', 20, 58.8),
 ('TIGR00571', 19, 55.9),
 ('Phage_int_SAM_5', 19, 55.9),
 ('Gp58', 18, 52.9),
 ('Cellulase', 17, 50.0),
 ('RloB', 16, 47.1),
 ('T6_Ig_like', 16, 47.1),
 ('HTH_36', 16, 47.1),
 ('TIGR01637', 15, 44.1),
 ('TIGR01451', 15, 44.1),
 ('DUF1617', 14, 41.2),
 ('GA', 14, 41.2)]

In [47]:
missing_in_no = (
    (
        set(pfam_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values)
    ) - 
    (
        set(pfam_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values)
    )
)
len(missing_in_no)

16

In [48]:
pfam_yes_lysM = pfam_s_equi.loc[sorted(accessions_yes_LysM)]
tigr_yes_lysM = tigr_s_equi.loc[sorted(accessions_yes_LysM)]

missing_domain_with_count = []
for domain in missing_in_no:
    if 'TIGR' in domain:
        df = tigr_yes_lysM
    else:
        df = pfam_yes_lysM

    count = len(set(df[df['hmm_query'] == domain].index))
    percent = float(np.round(100 * count / len(accessions_yes_LysM), 1))

    missing_domain_with_count.append((domain, count, percent))

missing_domain_with_count = sorted(missing_domain_with_count, key=lambda t: t[1], reverse=True)
missing_domain_with_count[:20]

[('DUF6900', 4, 100.0),
 ('Amidase_3', 4, 100.0),
 ('LysM', 4, 100.0),
 ('DUF2326', 3, 75.0),
 ('MC7', 3, 75.0),
 ('CTD11', 3, 75.0),
 ('PhdYeFM_antitox', 3, 75.0),
 ('ToxN_toxin', 3, 75.0),
 ('SLFN-g3_helicase', 3, 75.0),
 ('MeaB', 3, 75.0),
 ('DEDD_Tnp_IS110', 2, 50.0),
 ('HNH_3', 1, 25.0),
 ('Imm63', 1, 25.0),
 ('DUF4145', 1, 25.0),
 ('DUF3781', 1, 25.0),
 ('TrbL', 1, 25.0)]

Not particularly conclusive at this level of analysis.