# Streptococcus comparative genomics

Comparing protein domain presence when a specific doamin such as LysM is present vs when it's not.

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from Bio import Phylo

cwd = os.getcwd()
if cwd.endswith('notebook'):
    os.chdir('..')
    cwd = os.getcwd()

from src.cell_wall_binding_domains import cwb_domains

In [2]:
sns.set_theme(palette='colorblind', font_scale=1.3)
palette_colorblind = sns.color_palette('colorblind').as_hex()
palette_pastel = sns.color_palette('pastel').as_hex()

data_folder = Path('./data/')
assert data_folder.is_dir()

db_proka = Path('../db_proka/')
assert db_proka.is_dir()

gtdb_folder = Path('../data/gtdb_r220/')
assert gtdb_folder.is_dir()

strep_folder = gtdb_folder / 'Streptococcus'
assert strep_folder.is_dir()

## Load data

In [3]:
metadata_df = pd.read_csv(strep_folder / 'genomes_metadata.csv', index_col='assembly_accession')
metadata_df.head()

Unnamed: 0_level_0,accession,ambiguous_bases,checkm2_completeness,checkm2_contamination,checkm2_model,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_900636555.1,RS_GCF_900636555.1,0,100.0,0.14,Specific,100.0,0.0,475,o__Lactobacillales (UID544),267,...,19,59,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus gordonii
GCF_015908985.1,RS_GCF_015908985.1,0,100.0,0.23,Specific,100.0,0.0,475,o__Lactobacillales (UID544),267,...,19,56,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus suis
GCF_001266635.1,RS_GCF_001266635.1,0,100.0,0.06,Specific,99.82,0.0,524,f__Streptococcaceae (UID545),282,...,18,63,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus agalactiae
GCF_004154025.1,RS_GCF_004154025.1,0,99.99,0.13,Specific,99.85,0.0,676,g__Streptococcus (UID722),182,...,19,67,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus pyogenes
GCF_029011635.1,RS_GCF_029011635.1,0,100.0,0.2,Specific,100.0,0.18,524,f__Streptococcaceae (UID545),282,...,19,80,0,Bacteria,Bacillota,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus agalactiae


In [4]:
pfam_df = pd.read_csv(strep_folder / 'Streptococcus_all_proteins.pfam.csv', index_col='assembly_accession')
pfam_df['gtdb_species'] = [metadata_df.loc[a, 'gtdb_species'] for a in pfam_df.index]

tigr_df = pd.read_csv(strep_folder / 'Streptococcus_all_proteins.tigr.csv', index_col='assembly_accession')
tigr_df['gtdb_species'] = [metadata_df.loc[a, 'gtdb_species'] for a in tigr_df.index]

## How many LysM per genomes?

In [5]:
lysM_df = pfam_df[pfam_df['hmm_query'] == 'LysM'].copy()

n_copies_per_species = lysM_df.reset_index()[
    ['gtdb_species', 'id', 'assembly_accession']
].groupby('gtdb_species').nunique()

n_copies_per_species['ratio'] = n_copies_per_species['id'] / n_copies_per_species['assembly_accession']
n_copies_per_species.sort_values('assembly_accession', ascending=False).head(10)

Unnamed: 0_level_0,id,assembly_accession,ratio
gtdb_species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Streptococcus pyogenes,271,271,1.0
Streptococcus agalactiae,611,197,3.101523
Streptococcus pneumoniae,313,158,1.981013
Streptococcus suis,268,109,2.458716
Streptococcus thermophilus,310,88,3.522727
Streptococcus dysgalactiae,57,43,1.325581
Streptococcus mutans,68,26,2.615385
Streptococcus iniae,14,14,1.0
Streptococcus suis_W,39,13,3.0
Streptococcus gordonii,52,13,4.0


In [6]:
s_pyogenes_repr = 'GCF_002055535.1'
df = pfam_df.loc[s_pyogenes_repr]
df[df['hmm_query'] == 'LysM']

Unnamed: 0_level_0,id,protein_id,hmm_accession,hmm_query,evalue,bitscore,accuracy,start,end,gtdb_species
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GCF_002055535.1,WP_010921965.1@GCF_002055535.1,WP_010921965.1,PF01476.25,LysM,1.6e-10,37.6,0.97,48,91,Streptococcus pyogenes


## LysM in S. equi

We spotted a pattern of differential presence of LysM in certain strains of _Steptococcus equi_.

In [7]:
accessions_s_equi = metadata_df[
    metadata_df['gtdb_species'] == 'Streptococcus equi'
].index

pfam_s_equi = pfam_df.loc[accessions_s_equi]
tigr_s_equi = tigr_df.loc[accessions_s_equi]

accessions_yes_LysM = set(pfam_s_equi[
    pfam_s_equi['hmm_query'] == 'LysM'
].index)

accessions_no_LysM = set(accessions_s_equi) - accessions_yes_LysM

print(f'Number of S. equi genomes:              {len(accessions_s_equi):,}')
print(f'Number of S. equi genones with LysM:    {len(accessions_yes_LysM):,}')
print(f'Number of S. equi genones without LysM: {len(accessions_no_LysM):,}')

Number of S. equi genomes:              38
Number of S. equi genones with LysM:    4
Number of S. equi genones without LysM: 34


In [8]:
extra_in_no = (
    (
        set(pfam_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values)
    ) - 
    (
        set(pfam_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values)
    )
)
len(extra_in_no)

263

In [9]:
pfam_no_lysM = pfam_s_equi.loc[sorted(accessions_no_LysM)]
tigr_no_lysM = tigr_s_equi.loc[sorted(accessions_no_LysM)]

domain_with_count = []
for domain in extra_in_no:
    if 'TIGR' in domain:
        df = tigr_no_lysM
    else:
        df = pfam_no_lysM

    count = len(set(df[df['hmm_query'] == domain].index))
    percent = float(np.round(100 * count / len(accessions_no_LysM), 1))

    domain_with_count.append((domain, count, percent))

domain_with_count = sorted(domain_with_count, key=lambda t: t[1], reverse=True)
domain_with_count[:20]

[('TIGR03426', 26, 76.5),
 ('Malt_amylase_C', 24, 70.6),
 ('FokI_D1', 21, 61.8),
 ('FokI_dom_2', 21, 61.8),
 ('AAA_10', 21, 61.8),
 ('FokI_D3', 21, 61.8),
 ('FokI_cleav_dom', 21, 61.8),
 ('MethyltransfD12', 20, 58.8),
 ('Polysacc_synt_3', 20, 58.8),
 ('TIGR00571', 19, 55.9),
 ('Phage_int_SAM_5', 19, 55.9),
 ('Gp58', 18, 52.9),
 ('Cellulase', 17, 50.0),
 ('RloB', 16, 47.1),
 ('HTH_36', 16, 47.1),
 ('T6_Ig_like', 16, 47.1),
 ('TIGR01637', 15, 44.1),
 ('TIGR01451', 15, 44.1),
 ('Bro-N', 14, 41.2),
 ('AAA_13', 14, 41.2)]

In [10]:
missing_in_no = (
    (
        set(pfam_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_yes_LysM)]['hmm_query'].values)
    ) - 
    (
        set(pfam_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values) |
        set(tigr_s_equi.loc[sorted(accessions_no_LysM)]['hmm_query'].values)
    )
)
len(missing_in_no)

16

In [11]:
pfam_yes_lysM = pfam_s_equi.loc[sorted(accessions_yes_LysM)]
tigr_yes_lysM = tigr_s_equi.loc[sorted(accessions_yes_LysM)]

missing_domain_with_count = []
for domain in missing_in_no:
    if 'TIGR' in domain:
        df = tigr_yes_lysM
    else:
        df = pfam_yes_lysM

    count = len(set(df[df['hmm_query'] == domain].index))
    percent = float(np.round(100 * count / len(accessions_yes_LysM), 1))

    missing_domain_with_count.append((domain, count, percent))

missing_domain_with_count = sorted(missing_domain_with_count, key=lambda t: t[1], reverse=True)
missing_domain_with_count[:20]

[('Amidase_3', 4, 100.0),
 ('LysM', 4, 100.0),
 ('DUF6900', 4, 100.0),
 ('SLFN-g3_helicase', 3, 75.0),
 ('CTD11', 3, 75.0),
 ('PhdYeFM_antitox', 3, 75.0),
 ('ToxN_toxin', 3, 75.0),
 ('MeaB', 3, 75.0),
 ('MC7', 3, 75.0),
 ('DUF2326', 3, 75.0),
 ('DEDD_Tnp_IS110', 2, 50.0),
 ('DUF3781', 1, 25.0),
 ('DUF4145', 1, 25.0),
 ('HNH_3', 1, 25.0),
 ('Imm63', 1, 25.0),
 ('TrbL', 1, 25.0)]

Not particularly conclusive at this level of analysis.

## Cell wall HMM search

HMM models from [Megrian et al., 2022](https://doi.org/10.1038/s41564-022-01257-y).

In [12]:
cw_df = pd.read_csv(strep_folder / 'Streptococcus_all_proteins.cell_wall.csv', index_col='assembly_accession')

# Keep only one model per protein – best scoring one.
cw_df = cw_df.sort_values(['id', 'bitscore'], ascending=[True, False]).drop_duplicates(['id'])

cw_df.head()

Unnamed: 0_level_0,id,protein_id,hmm_accession,hmm_query,evalue,bitscore,accuracy,start,end
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GCA_000013525.1,ABF35082.1@GCA_000013525.1,ABF35082.1,newDdlB,newDdlB,6.5e-08,28.6,0.77,126,303
GCA_000013525.1,ABF35084.1@GCA_000013525.1,ABF35084.1,newDdlB,newDdlB,3.5e-11,39.3,0.79,106,289
GCA_000013525.1,ABF35137.1@GCA_000013525.1,ABF35137.1,newFtsI,newFtsI,1.9e-21,73.0,0.79,329,642
GCA_000013525.1,ABF35311.1@GCA_000013525.1,ABF35311.1,newMraY,newMraY,1.4e-35,119.7,0.79,0,330
GCA_000013525.1,ABF35345.1@GCA_000013525.1,ABF35345.1,newMurD,newMurD,3.8e-05,19.8,0.77,0,117


In [13]:
cw_df_yes_lysM = cw_df.loc[sorted(accessions_yes_LysM)]

cw_yes_lysM_grouped = cw_df_yes_lysM[
    ['id', 'hmm_query']
].groupby(
    'hmm_query'
).nunique().sort_values('id', ascending=False).rename(
    columns={
        'id': 'count',
    }
)
cw_yes_lysM_grouped['ratio'] = cw_yes_lysM_grouped['count'] / len(accessions_yes_LysM)
cw_yes_lysM_grouped

Unnamed: 0_level_0,count,ratio
hmm_query,Unnamed: 1_level_1,Unnamed: 2_level_1
newDdlB,20,5.0
newFtsI,20,5.0
newMraW,12,3.0
newMurA,12,3.0
newMurD,12,3.0
newFtsA,8,2.0
newFtsW,8,2.0
newMraY,8,2.0
newMurE,8,2.0
newFtsZ,4,1.0


In [14]:
cw_df_no_lysM = cw_df.loc[sorted(accessions_no_LysM)]

cw_no_lysM_grouped = cw_df_no_lysM[
    ['id', 'hmm_query']
].groupby(
    'hmm_query'
).nunique().sort_values('id', ascending=False).rename(
    columns={
        'id': 'count',
    }
)
cw_no_lysM_grouped['ratio'] = cw_no_lysM_grouped['count'] / len(accessions_no_LysM)
cw_no_lysM_grouped

Unnamed: 0_level_0,count,ratio
hmm_query,Unnamed: 1_level_1,Unnamed: 2_level_1
newFtsI,170,5.0
newDdlB,169,4.970588
newMurA,102,3.0
newMurD,101,2.970588
newMraW,94,2.764706
newFtsW,68,2.0
newMraY,68,2.0
newFtsA,67,1.970588
newMurC,52,1.529412
newMurE,48,1.411765


In [15]:
set(cw_no_lysM_grouped.index) - set(cw_yes_lysM_grouped.index)

{'newMraZ'}

In [16]:
set(cw_yes_lysM_grouped.index) - set(cw_no_lysM_grouped.index)

set()

## eggNOG orthologs

In [17]:
eggNOG_annotations_path = strep_folder / 'Streptococcus_eggNOG_annotations.csv'

if not eggNOG_annotations_path.is_file():
    eggNOG_df = pd.read_csv(
        strep_folder / 'eggNOG' / 'Streptococcus.emapper.annotations',
        sep='\t',
        comment='#',
        header=None,
        names=[
            'protein_id', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 
            'COG_category', 'Description', 'Preferred_name', 'GOs', 'EC', 
            'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 
            'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs',
        ]
    )

    eggNOG_df['assembly_accession'] = eggNOG_df['protein_id'].apply(lambda v: v.split('@')[1])
    eggNOG_df = eggNOG_df.set_index('assembly_accession', drop=True).replace('-', np.nan)

    eggNOG_df.to_csv(eggNOG_annotations_path)

eggNOG_df = pd.read_csv(eggNOG_annotations_path, index_col='assembly_accession')
eggNOG_df.head()

Unnamed: 0_level_0,protein_id,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_ko,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_003963555.1,WP_126467658.1@GCF_003963555.1,1000570.HMPREF9966_1759,9.704e-82,271.0,"COG0716@1|root,COG0716@2|Bacteria,1V45R@1239|F...",2|Bacteria,C,Flavodoxin,,,...,,,,,,,,,,Flavodoxin_4
GCF_003963555.1,WP_126467724.1@GCF_003963555.1,862970.SAIN_1577,1.3129999999999999e-186,582.0,"COG1396@1|root,COG1396@2|Bacteria,1VIH9@1239|F...",2|Bacteria,K,Helix-turn-helix XRE-family like proteins,,,...,,,,,,,,,,"HTH_19,HTH_3,TPR_12,TPR_8"
GCF_003963555.1,WP_126467790.1@GCF_003963555.1,862970.SAIN_1636,2.776e-131,418.0,"COG1564@1|root,COG1564@2|Bacteria,1VA0W@1239|F...",2|Bacteria,H,"Thiamin pyrophosphokinase, vitamin B1 binding ...",thiN,,...,ko:K00949,"ko00730,ko01100,map00730,map01100",,R00619,"RC00002,RC00017","ko00000,ko00001,ko01000",,,,"TPK_B1_binding,TPK_catalytic"
GCF_003963555.1,WP_126467857.1@GCF_003963555.1,176090.SSIN_0693,1.181e-201,627.0,"COG3677@1|root,COG3677@2|Bacteria,1V4D1@1239|F...",2|Bacteria,L,ISXO2-like transposase domain,,,...,,,,,,,,,,"DDE_Tnp_IS1595,Zn_Tnp_IS1595"
GCF_003963555.1,WP_126467993.1@GCF_003963555.1,862969.SCI_1925,1.812e-256,791.0,"COG0612@1|root,COG0612@2|Bacteria,1TPN6@1239|F...",2|Bacteria,S,Peptidase M16 inactive,ymfF,,...,,,,,,,,,,"Peptidase_M16,Peptidase_M16_C"


In [18]:
s_equi_yes_lysM_instance = sorted(accessions_yes_LysM)[0]
metadata_df.loc[s_equi_yes_lysM_instance, 'ncbi_organism_name']

'Streptococcus equi subsp. zooepidemicus ATCC 35246'

In [19]:
df = pfam_df.loc[s_equi_yes_lysM_instance]
s_equi_yes_lysM_protein_id = df[df['hmm_query'] == 'LysM']['id'].values[0]
s_equi_yes_lysM_protein_id

'WP_014622634.1@GCF_000219765.1'

In [20]:
r = eggNOG_df[eggNOG_df['protein_id'] == s_equi_yes_lysM_protein_id].iloc[0]

print(r['eggNOG_OGs'])

r

COG0860@1|root,COG1388@1|root,COG0860@2|Bacteria,COG1388@2|Bacteria,1VINV@1239|Firmicutes,4HPIN@91061|Bacilli


protein_id                           WP_014622634.1@GCF_000219765.1
seed_ortholog                          1123299.AUKZ01000026_gene846
evalue                                                          0.0
score                                                         786.0
eggNOG_OGs        COG0860@1|root,COG1388@1|root,COG0860@2|Bacter...
max_annot_lvl                                            2|Bacteria
COG_category                                                      M
Description                      N-acetylmuramoyl-L-alanine amidase
Preferred_name                                                 yocH
GOs                                                             NaN
EC                                         3.2.1.1,3.5.1.28,3.5.2.6
KEGG_ko           ko:K01176,ko:K01448,ko:K02067,ko:K06385,ko:K17...
KEGG_Pathway      ko00311,ko00500,ko01100,ko01130,ko01501,ko0150...
KEGG_Module               M00210,M00627,M00628,M00669,M00670,M00727
KEGG_Reaction                    R02108,R02112,R

In [21]:
s_iniae_acc = 'GCF_030732225.1'
df = pfam_df.loc[s_iniae_acc]
s_iniae_yes_lysM_protein_id = df[df['hmm_query'] == 'LysM']['id'].values[0]

eggNOG_df[eggNOG_df['protein_id'] == s_iniae_yes_lysM_protein_id].iloc[0]

r2 = eggNOG_df[eggNOG_df['protein_id'] == s_iniae_yes_lysM_protein_id].iloc[0]

print(r2['eggNOG_OGs'])

r2

COG1388@1|root,COG1388@2|Bacteria,1V61Q@1239|Firmicutes,4HHT0@91061|Bacilli


protein_id                           WP_016355818.1@GCF_030732225.1
seed_ortholog                                       1346.DQ08_01735
evalue                                                          0.0
score                                                         705.0
eggNOG_OGs        COG1388@1|root,COG1388@2|Bacteria,1V61Q@1239|F...
max_annot_lvl                                            2|Bacteria
COG_category                                                      M
Description                                     LysM domain protein
Preferred_name                                                  sip
GOs                                                             NaN
EC                                                              NaN
KEGG_ko                                                         NaN
KEGG_Pathway                                                    NaN
KEGG_Module                                                     NaN
KEGG_Reaction                                   

In [22]:
accessions_with_S_protein = sorted(set(eggNOG_df[eggNOG_df['eggNOG_OGs'].str.contains('4HHT0')].index))
strep_with_S_protein = metadata_df.loc[accessions_with_S_protein]

In [23]:
strep_with_S_protein[strep_with_S_protein['gtdb_species'] == 'Streptococcus equi']

Unnamed: 0_level_0,accession,ambiguous_bases,checkm2_completeness,checkm2_contamination,checkm2_model,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [24]:
og_metadata = pd.read_csv(
    strep_folder / 'eggNOG' / 'e5.og_annotations.tsv',
    sep='\t', 
    header=None,
    names=['og', 'og_level', 'description'],
).drop_duplicates('og').set_index('og')
og_metadata.head()

Unnamed: 0_level_0,og_level,description
og,Unnamed: 1_level_1,Unnamed: 2_level_1
4HBBX,L,Recombinase
1XVC3,K,transcriptional regulator
2TDXG,S,
2QWWR,S,
37HD9,S,


In [25]:
def display_ogs(ogs):
    for og in ogs:
        desc = og_metadata.loc[og, 'description']
        print(f'{og}: {desc}')

In [26]:
def get_unique_ogs(eggNOG_df, accessions, og_whitelist=None):
    index = eggNOG_df.index

    og_union = set()
    for acc in accessions:
        if acc not in index:
            print(f'Not in index: {acc}')
            continue

        genme_ogs = eggNOG_df.loc[[acc]].set_index('protein_id')

        ogs = set()
        for protein_id in genme_ogs.index:
            protein_ogs = genme_ogs.loc[protein_id, 'eggNOG_OGs']

            candidate_ogs = {
                og_with_tax.split('@')[0]
                for og_with_tax in protein_ogs.split(',')
            }

            if og_whitelist is not None:
                candidate_ogs = candidate_ogs & og_whitelist
                
            ogs |= candidate_ogs

        og_union = og_union.union(ogs)

    return og_union

In [27]:
def get_ogs_present_in_all(eggNOG_df, accessions, og_whitelist=None):
    index = eggNOG_df.index

    og_intersection = None
    for acc in accessions:
        if acc not in index:
            print(f'Not in index: {acc}')
            continue

        genme_ogs = eggNOG_df.loc[[acc]].set_index('protein_id')

        ogs = set()
        for protein_id in genme_ogs.index:
            protein_ogs = genme_ogs.loc[protein_id, 'eggNOG_OGs']

            candidate_ogs = {
                og_with_tax.split('@')[0]
                for og_with_tax in protein_ogs.split(',')
            }

            if og_whitelist is not None:
                candidate_ogs = candidate_ogs & og_whitelist
                
            ogs |= candidate_ogs

        if og_intersection is None:
            og_intersection = ogs
        else:
            og_intersection &= ogs

    return og_intersection

### S-protein

 (OG: `4HHT0`)

LysM-containing protein that is essential for immune evasion in Group A Streptococci. 

Refs: 
- [Wierzbicki et al., 2019](https://doi.org/10.1016/j.celrep.2019.11.001)
- [Burnier et al., 2024](https://doi.org/10.1101/2024.11.08.622053)

This S-protein isn't present in _S. equi_ (group C Strep). Some _S. equi_ strains do however contain OG `4HPIN` (N-acetylmuramoyl-L-alanine amidase) along with a LysM domain.

Intriguing... I wonder if the strains of _S. equi_ with a PGH have re-acquired a LysM-containing protein for the purpose of targeting other Streptococci?

Below is a list of the unique OGs of proteins with LysM in Streptococcus.

In [28]:
genomes_with_lysM = sorted(set(pfam_df[pfam_df['hmm_query'] == 'LysM'].index))
protein_ids_with_lysM = sorted(pfam_df[pfam_df['hmm_query'] == 'LysM']['id'].unique())

ogs_list_str = eggNOG_df[eggNOG_df['protein_id'].isin(protein_ids_with_lysM)]['eggNOG_OGs'].unique()
unique_ogs = sorted({
    og_with_taxa.split('@')[0]
    for og_list in ogs_list_str
    for og_with_taxa in og_list.split(',')
})
len(unique_ogs)

88

In [29]:
unique_cogs = [og for og in unique_ogs if og.startswith('COG')]
display_ogs(unique_cogs)

COG0741: lytic transglycosylase activity
COG0860: N-acetylmuramoyl-L-alanine amidase activity
COG1388: cell wall organization
COG1652: LysM domain
COG1705: Flagellar rod assembly protein muramidase FlgJ
COG2304: hyaluronan metabolic process
COG2340: Cysteine-rich secretory protein family
COG3170: Tfp pilus assembly protein FimV
COG3583: domain protein
COG3757: hydrolase, family 25
COG3942: CHAP domain
COG5632: N-acetylmuramoyl-L-alanine amidase activity


In [30]:
acc = metadata_df[
    (metadata_df['gtdb_species'] == 'Streptococcus pyogenes') &
    (metadata_df['gtdb_representative'] == 't')
].index[0]
df = pfam_df.loc[acc]
protein_ids = df[df['hmm_query'] == 'LysM']['id'].unique()
assert len(protein_ids) == 1
protein_id = protein_ids[0]

print(protein_id)
eggNOG_df[eggNOG_df['protein_id'] == protein_id]['eggNOG_OGs'].iloc[0]

WP_010921965.1@GCF_002055535.1


'COG1388@1|root,COG1388@2|Bacteria,1V61Q@1239|Firmicutes,4HHT0@91061|Bacilli'

In [31]:
accs = sorted(metadata_df[
    metadata_df['gtdb_species'].isin([
        'Streptococcus pyogenes',
        'Streptococcus dysgalactiae',
    ])
].index)
df = pfam_df.loc[accs]
protein_ids_with_lysM = sorted(df[df['hmm_query'] == 'LysM']['id'].unique())

eggNOG_df_subset = eggNOG_df[eggNOG_df['protein_id'].isin(protein_ids_with_lysM)].copy()

unique_ogs = get_ogs_present_in_all(eggNOG_df_subset, accs)

display_ogs(unique_ogs)

1V61Q: LysM domain protein
COG1388: cell wall organization
4HHT0: LysM domain protein


In [32]:
accessions_with_4HHT0 = sorted(set(eggNOG_df[eggNOG_df['eggNOG_OGs'].str.contains('4HHT0@')].index))

species_with_4HHT0 = pd.DataFrame(metadata_df.loc[accessions_with_4HHT0]['gtdb_species'].value_counts(), columns=['count'])
species_with_4HHT0['total'] = [len(metadata_df[metadata_df['gtdb_species'] == s]) for s in species_with_4HHT0.index]
species_with_4HHT0['percent'] = (100 * species_with_4HHT0['count'] / species_with_4HHT0['total']).round(0)
species_with_4HHT0.head(20)

Unnamed: 0_level_0,count,total,percent
gtdb_species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Streptococcus pyogenes,271,271,100.0
Streptococcus agalactiae,196,197,99.0
Streptococcus pneumoniae,156,158,99.0
Streptococcus suis,108,109,99.0
Streptococcus dysgalactiae,43,43,100.0
Streptococcus iniae,13,14,93.0
Streptococcus suis_W,13,13,100.0
Streptococcus gordonii,13,13,100.0
Streptococcus salivarius,11,11,100.0
Streptococcus thermophilus,9,88,10.0


Let's see what's different between _S. equi_ strains without LysM and other strains with this S-protein. We'll exclude the S. equi strains with LysM for now.

In [33]:
ogs_no_lysM = get_ogs_present_in_all(eggNOG_df, sorted(accessions_no_LysM))
len(ogs_no_lysM)

4092

In [34]:
accessions_s_iniae_s_dysgalactiae = sorted(set(metadata_df[
    (metadata_df['gtdb_species'] == 'Streptococcus iniae') |
    (metadata_df['gtdb_species'] == 'Streptococcus dysgalactiae')
].index))

# og_yes_lysM = get_ogs_present_in_all(eggNOG_df, accessions_s_iniae_s_dysgalactiae)
og_yes_lysM = get_ogs_present_in_all(eggNOG_df, accessions_yes_LysM)
len(og_yes_lysM)

5660

In [35]:
missing_in_yes_lysM = ogs_no_lysM - og_yes_lysM
print('Gain:', len(ogs_no_lysM - og_yes_lysM), 'OGs')
display_ogs(sorted(missing_in_yes_lysM))

Gain: 28 OGs
1M8ZB: ABC-type amino acid transport signal transduction systems, periplasmic component domain
1M8ZC: Membrane protein involved in the export of O-antigen and teichoic acid
1M91B: Flavin transferase that catalyzes the transfer of the FMN moiety of FAD and its covalent binding to the hydroxyl group of a threonine residue in a target flavoprotein
1M96V: Periplasmic binding protein
1M9GP: leucine-rich protein
1M9KM: Specifically methylates the pseudouridine at position 1915 (m3Psi1915) in 23S rRNA
1MA2F: Nicotinamide mononucleotide transporter
1TQUG: ABC transporter
1TR9C: Flavin transferase that catalyzes the transfer of the FMN moiety of FAD and its covalent binding to the hydroxyl group of a threonine residue in a target flavoprotein
1UIJ9: ABC transporter
1V21G: nicotinamide mononucleotide transporter
1V3JM: Specifically methylates the pseudouridine at position 1915 (m3Psi1915) in 23S rRNA
1V649: COG2508, regulator of polyketide synthase expression
1W2RT: nan
2DEGR: nan
2

In [36]:
missing_in_no_lysM = og_yes_lysM - ogs_no_lysM
print(len(missing_in_no_lysM))

1596


In [37]:
m = metadata_df.loc[sorted(accessions_no_LysM)]
acc = m[m['gtdb_representative'] == 't'].index[0]

df = eggNOG_df.loc[acc]
df[df['eggNOG_OGs'].str.contains('1M8ZC')]

Unnamed: 0_level_0,protein_id,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_ko,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_900637675.1,VEH28842.1@GCA_900637675.1,40041.SZO_00080,6.156e-301,925.0,"COG2244@1|root,COG2244@2|Bacteria,1TNYX@1239|F...",2|Bacteria,S,Membrane protein involved in the export of O-a...,yabM,"GO:0005575,GO:0005623,GO:0005886,GO:0016020,GO...",...,"ko:K03328,ko:K06409",,,,,"ko00000,ko02000","2.A.66.2,2.A.66.2.14",,,"Polysacc_synt,Polysacc_synt_C"


In [38]:
df = eggNOG_df[eggNOG_df['eggNOG_OGs'].str.contains('1M8ZC')]

metadata_df.loc[sorted(set(df.index))]['gtdb_species'].value_counts()

gtdb_species
Streptococcus dysgalactiae    43
Streptococcus equi            37
Streptococcus halichoeri       1
Name: count, dtype: int64

## Focus on cell wall related pathways

In [39]:
kegg_pathways = [
    'map00550',  # Peptidoglycan biosynthesis
    'map00540',  # Lipopolysaccharide biosynthesis
]

COGs_only = False  # Focus on OGs with a COG ID (no internal eggNOG ones)

cw_ogs = set()
for kegg_pathway in kegg_pathways:
    ogs_str = eggNOG_df[
        eggNOG_df['KEGG_Pathway'].notnull() &
        eggNOG_df['KEGG_Pathway'].str.contains(kegg_pathway)
    ]['eggNOG_OGs'].values

    if COGs_only:
        cw_ogs |= {
            og
            for og_str in ogs_str
            for og_raw in og_str.split(',')
            if (og := og_raw.split('@')[0]).startswith('COG')
        }
    else:
        cw_ogs |= {
            og_raw.split('@')[0]
            for og_str in ogs_str
            for og_raw in og_str.split(',')
        }

len(cw_ogs)

461

In [40]:
cog_ogs = {
    og
    for og_str in eggNOG_df['eggNOG_OGs']
    for og_raw in og_str.split(',')
    if (og := og_raw.split('@')[0]).startswith('COG')
}

len(cog_ogs)

2190

In [41]:
# ogs_no_lysM = get_ogs_present_in_all(eggNOG_df, sorted(accessions_no_LysM), cw_ogs)
# ogs_no_lysM_all = get_unique_ogs(eggNOG_df, sorted(accessions_no_LysM), cw_ogs)

ogs_no_lysM = get_ogs_present_in_all(eggNOG_df, sorted(accessions_no_LysM), cog_ogs)
ogs_no_lysM_all = get_unique_ogs(eggNOG_df, sorted(accessions_no_LysM), cog_ogs)

len(ogs_no_lysM), len(ogs_no_lysM_all)

(860, 1230)

In [42]:
accessions = sorted(set(metadata_df[
    (metadata_df['gtdb_species'] == 'Streptococcus iniae') |
    (metadata_df['gtdb_species'] == 'Streptococcus pyogens') |
    (metadata_df['gtdb_species'] == 'Streptococcus dysgalactiae')
].index))

df = eggNOG_df.loc[accessions]
accessions = sorted(set(df[df['eggNOG_OGs'].str.contains('4HHT0')].index))

# og_yes_lysM = get_ogs_present_in_all(eggNOG_df, accessions, cw_ogs)
# og_yes_lysM_all = get_unique_ogs(eggNOG_df, accessions, cw_ogs)

og_yes_lysM = get_ogs_present_in_all(eggNOG_df, accessions, cog_ogs)
og_yes_lysM_all = get_unique_ogs(eggNOG_df, accessions, cog_ogs)

len(og_yes_lysM), len(og_yes_lysM_all)

(853, 1440)

In [43]:
missing_in_yes_lysM = ogs_no_lysM - og_yes_lysM_all
print('Gain:', len(missing_in_yes_lysM), 'OGs')
display_ogs(sorted(missing_in_yes_lysM))

Gain: 8 OGs
COG1345: cell adhesion
COG1376: peptidoglycan L,D-transpeptidase activity
COG1434: Gram-negative-bacterium-type cell wall biogenesis
COG3458: cephalosporin-C deacetylase activity
COG4495: Domain of unknown function (DUF4176)
COG4935: serine-type endopeptidase activity
COG5353: nan
COG5585: self proteolysis


In [44]:
m = metadata_df.loc[sorted(accessions_no_LysM)]
acc = m[m['gtdb_representative'] == 't'].index[0]
df = eggNOG_df.loc[acc]

protein_ids = set()
for og in sorted(missing_in_yes_lysM):
    protein_ids |= set(df[df['eggNOG_OGs'].str.contains(og)]['protein_id'].unique())

eggNOG_df[eggNOG_df['protein_id'].isin(sorted(protein_ids))]

Unnamed: 0_level_0,protein_id,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_ko,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_900637675.1,VEH30075.1@GCA_900637675.1,40041.SZO_17040,9.956e-58,199.0,"COG3458@1|root,COG3458@2|Bacteria,1TS02@1239|F...",2|Bacteria,Q,Acetyl xylan esterase (AXE1),cah,,...,ko:K01060,"ko00311,ko01130,map00311,map01130",,R03062,"RC00020,RC00041","ko00000,ko00001,ko01000",,,,AXE1
GCA_900637675.1,VEH32169.1@GCA_900637675.1,40041.SZO_11360,3.874e-315,964.0,"COG4935@1|root,COG4935@2|Bacteria,1UI0J@1239|F...",2|Bacteria,O,CotH kinase protein,cotH,,...,,,,,,,,,,CotH
GCA_900637675.1,VEH31027.1@GCA_900637675.1,40041.SZO_13680,3.983e-162,510.0,"COG5585@1|root,COG5585@2|Bacteria",2|Bacteria,,,,,...,ko:K11044,,,,,"ko00000,ko01000,ko02042",,,,"ADPrib_exo_Tox,PT-TG,Phage_min_cap2"
GCA_900637675.1,VEH34322.1@GCA_900637675.1,40041.SZO_17320,1.009e-92,307.0,"COG1345@1|root,COG1345@2|Bacteria,1URWC@1239|F...",2|Bacteria,N,LXG domain of WXG superfamily,fliD,"GO:0001539,GO:0005575,GO:0005576,GO:0005623,GO...",...,ko:K02407,"ko02040,map02040",,,,"ko00000,ko00001,ko02035",,,,"Flagellin_IN,FliD_C,FliD_N"
GCA_900637675.1,VEH31778.1@GCA_900637675.1,40041.SZO_12260,1.951e-84,280.0,"COG5353@1|root,COG5353@2|Bacteria",2|Bacteria,S,protein conserved in bacteria,ypmB,,...,,,,,,,,,,PepSY
GCA_900637675.1,VEH29995.1@GCA_900637675.1,40041.SZO_17320,2.667e-267,833.0,"COG1345@1|root,COG1345@2|Bacteria,1URWC@1239|F...",2|Bacteria,N,LXG domain of WXG superfamily,fliD,"GO:0001539,GO:0005575,GO:0005576,GO:0005623,GO...",...,ko:K02407,"ko02040,map02040",,,,"ko00000,ko00001,ko02035",,,,"Flagellin_IN,FliD_C,FliD_N"
GCA_900637675.1,VEH33832.1@GCA_900637675.1,40041.SZO_08060,8.211e-65,220.0,"COG4495@1|root,COG4495@2|Bacteria,1VIE1@1239|F...",2|Bacteria,S,Domain of unknown function (DUF4176),,,...,,,,,,,,,,DUF4176
GCA_900637675.1,VEH29160.1@GCA_900637675.1,40041.SZO_00810,1.507e-57,199.0,"COG4495@1|root,COG4495@2|Bacteria,1VFG4@1239|F...",2|Bacteria,S,Domain of unknown function (DUF4176),,,...,,,,,,,,,,DUF4176
GCA_900637675.1,VEH34822.1@GCA_900637675.1,40041.SZO_04710,2.247e-96,315.0,"COG1376@1|root,COG1376@2|Bacteria,1V4KP@1239|F...",2|Bacteria,M,ErfK YbiS YcfS YnhG,yciB,,...,,,,,,,,,,YkuD
GCA_900637675.1,VEH34825.1@GCA_900637675.1,40041.SZO_04710,0.0001537,44.0,"COG1376@1|root,COG1376@2|Bacteria,1V4KP@1239|F...",2|Bacteria,M,ErfK YbiS YcfS YnhG,yciB,,...,,,,,,,,,,YkuD


In [45]:
missing_in_no_lysM = og_yes_lysM - ogs_no_lysM_all
print('Loss:', len(missing_in_no_lysM), 'OGs')
display_ogs(sorted(missing_in_no_lysM))

Loss: 31 OGs
COG0010: Belongs to the arginase family
COG0390: UDP-glucose transmembrane transporter activity
COG0411: L-isoleucine import across plasma membrane
COG0415: DNA photolyase activity
COG0559: Belongs to the binding-protein-dependent transport system permease family
COG0574: phosphotransferase activity, paired acceptors
COG0578: glycerol-3-phosphate dehydrogenase (quinone) activity
COG0625: glutathione transferase activity
COG0727: PFAM Uncharacterised protein family (UPF0153)
COG0730: response to heat
COG0813: purine-nucleoside phosphorylase activity
COG1054: Rhodanese Homology Domain
COG1228: amidohydrolase
COG1252: NADH oxidation
COG1271: oxidase subunit
COG1575: vitamin K biosynthetic process
COG1600: epoxyqueuosine reductase activity
COG1620: lactate transmembrane transporter activity
COG1765: OsmC-like protein
COG1806: Bifunctional serine threonine kinase and phosphorylase involved in the regulation of the
COG1937: Protein conserved in bacteria
COG2072: flavoprotein inv

In [46]:
acc = metadata_df[(metadata_df['gtdb_species'] == 'Streptococcus pyogenes') & (metadata_df['gtdb_representative'] == 't')].index[0]
df = eggNOG_df.loc[acc]

protein_ids = set()
for og in sorted(missing_in_no_lysM):
    protein_ids |= set(df[df['eggNOG_OGs'].str.contains(og)]['protein_id'].unique())

eggNOG_df[eggNOG_df['protein_id'].isin(sorted(protein_ids))]

Unnamed: 0_level_0,protein_id,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_ko,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_002055535.1,WP_010922738.1@GCF_002055535.1,1314.HKU360_01888,6.300000000000001e-119,382.0,"COG3758@1|root,COG3758@2|Bacteria,1VH52@1239|F...",2|Bacteria,S,HutD,hutD,,...,ko:K09975,,,,,ko00000,,,,HutD
GCF_002055535.1,WP_015055933.1@GCF_002055535.1,1314.HKU360_00540,4.4529999999999997e-246,759.0,"COG1600@1|root,COG1600@2|Bacteria,1TP6Q@1239|F...",2|Bacteria,C,Catalyzes the conversion of epoxyqueuosine (oQ...,queG,"GO:0003674,GO:0003824,GO:0006139,GO:0006725,GO...",...,ko:K18979,,,,,"ko00000,ko01000,ko03016",,,,"DUF1730,Fer4_16,HEAT_2"
GCF_002055535.1,WP_010922741.1@GCF_002055535.1,1314.HKU360_01891,6.454e-220,681.0,"COG0010@1|root,COG0010@2|Bacteria,1TQR5@1239|F...",2|Bacteria,E,Catalyzes the conversion of N-formimidoyl-L-gl...,hutG,,...,ko:K01479,"ko00340,ko01100,map00340,map01100",M00045,R02285,"RC00221,RC00681","ko00000,ko00001,ko00002,ko01000",,,,Arginase
GCF_002055535.1,WP_029714211.1@GCF_002055535.1,1314.HKU360_01694,5.048e-183,572.0,"COG0730@1|root,COG0730@2|Bacteria,1V5K2@1239|F...",2|Bacteria,S,response to heat,XK27_00825,,...,ko:K07090,,,,,ko00000,,,,TauE
GCF_002055535.1,WP_010922563.1@GCF_002055535.1,1314.HKU360_01432,0.0,1206.0,"COG0578@1|root,COG0578@2|Bacteria,1TQJN@1239|F...",2|Bacteria,C,Belongs to the FAD-dependent glycerol-3-phosph...,glpD,"GO:0005575,GO:0005622,GO:0005623,GO:0005737,GO...",...,"ko:K00105,ko:K00111","ko00564,ko01110,map00564,map01110",,"R00846,R00848",RC00029,"ko00000,ko00001,ko01000",,,,"DAO,DAO_C"
GCF_002055535.1,WP_010922187.1@GCF_002055535.1,1314.HKU360_00730,3.2609999999999997e-226,699.0,"COG1054@1|root,COG1054@2|Bacteria,1TRG7@1239|F...",2|Bacteria,S,Belongs to the UPF0176 family,yceA,,...,ko:K07146,,,,,ko00000,,,,"Rhodanese,Rhodanese_C"
GCF_002055535.1,WP_029713996.1@GCF_002055535.1,1314.HKU360_00604,3.272e-198,617.0,"COG2321@1|root,COG2321@2|Bacteria,1TQKF@1239|F...",2|Bacteria,S,metalloprotease,ypfJ,"GO:0005575,GO:0005576",...,ko:K07054,,,,,ko00000,,,,Zn_peptidase
GCF_002055535.1,WP_002990147.1@GCF_002055535.1,1314.HKU360_00708,2.844e-151,477.0,"COG0813@1|root,COG0813@2|Bacteria,1TQPG@1239|F...",2|Bacteria,F,purine nucleoside phosphorylase,deoD,"GO:0003674,GO:0003824,GO:0004731,GO:0005488,GO...",...,"ko:K00772,ko:K03784","ko00230,ko00240,ko00270,ko00760,ko01100,ko0111...",M00034,"R01402,R01561,R01863,R01969,R02147,R02294,R022...","RC00033,RC00063,RC00122,RC02819","ko00000,ko00001,ko00002,ko01000",,,"iAPECO1_1312.APECO1_1997,iB21_1397.B21_04226,i...",PNP_UDP_1
GCF_002055535.1,WP_010922487.1@GCF_002055535.1,1314.HKU360_01278,1e-323,988.0,"COG0415@1|root,COG0415@2|Bacteria,1TQAK@1239|F...",2|Bacteria,L,Belongs to the DNA photolyase family,phr,,...,ko:K01669,,,,,"ko00000,ko01000,ko03400",,,,"DNA_photolyase,FAD_binding_7"
GCF_002055535.1,WP_010922735.1@GCF_002055535.1,1314.HKU360_01885,1.193e-197,615.0,"COG3643@1|root,COG3643@2|Bacteria,1TP5T@1239|F...",2|Bacteria,E,glutamate,ftcD,,...,ko:K00603,"ko00340,ko00670,ko01100,map00340,map00670,map0...",,"R02287,R03189","RC00165,RC00221,RC00223,RC00870","ko00000,ko00001,ko01000",,,,"FTCD,FTCD_N"


### YukD in S. equi with a LysM protein (but no S-protein)

In [47]:
df = eggNOG_df.loc[sorted(accessions_yes_LysM)]
set(df[df['eggNOG_OGs'].str.contains('COG1376')].index), accessions_yes_LysM


({'GCF_000219765.1', 'GCF_009676645.1', 'GCF_009676685.2', 'GCF_900636805.1'},
 {'GCF_000219765.1', 'GCF_009676645.1', 'GCF_009676685.2', 'GCF_900636805.1'})

They all have YukD – suggesting that their cell wall composition is the same as other S. equi without LysM proteins.

The LysM + Amidase they encode might be a weapon.

In [48]:
pfam_s_equi[pfam_s_equi['hmm_query'] == 'LysM']

Unnamed: 0_level_0,id,protein_id,hmm_accession,hmm_query,evalue,bitscore,accuracy,start,end,gtdb_species
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GCF_009676685.2,WP_014622634.1@GCF_009676685.2,WP_014622634.1,PF01476.25,LysM,5.4e-14,48.8,0.99,270,314,Streptococcus equi
GCF_900636805.1,WP_154803714.1@GCF_900636805.1,WP_154803714.1,PF01476.25,LysM,2.4e-15,53.1,0.98,269,313,Streptococcus equi
GCF_000219765.1,WP_014622634.1@GCF_000219765.1,WP_014622634.1,PF01476.25,LysM,5.3e-14,48.8,0.99,270,314,Streptococcus equi
GCF_009676645.1,WP_014622634.1@GCF_009676645.1,WP_014622634.1,PF01476.25,LysM,5.4e-14,48.8,0.99,270,314,Streptococcus equi
