# Figure 2 plots

In [6]:
import os
from pathlib import Path
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO

# When the kernel was started from a directory whose path ends in 'notebook',
# step up one level so that relative paths resolve from the parent folder.
if os.getcwd().endswith('notebook'):
    os.chdir('..')
cwd = os.getcwd()

In [7]:
# Global figure styling: 'colorblind' palette, white grid background,
# 'paper' context with enlarged fonts, Helvetica as the font family.
sns.set_palette('colorblind')
sns.set_style('whitegrid')
sns.set_context('paper', font_scale=1.8)
plt.rcParams.update({'font.family': 'Helvetica'})

# Hex colour codes of the currently active seaborn palette.
palette = sns.color_palette().as_hex()

# Input data folder, relative to the working directory; must already exist.
data_folder = Path('./data')
assert data_folder.is_dir()

# Output folder for figures, relative to the working directory; must already exist.
figures_folder = Path('./figures')
assert figures_folder.is_dir()

## Peptidase M23 homology tree

`Peptidase_M23` tree based on similarity to the `Peptidase_M23` domain of _Halogranum salarium_.

There are over 40,000 hits in bacteria for homology to `Peptidase_M23`. We are focusing on the 1,500 hits closest to archaea, and in particular _H. salarium_.

The file `data/putative_targets.csv`, from which the data are derived, is generated by `src/run_target_search_sequence.sh`.

In [9]:
# Load the putative target hits, indexed by the 'query' column, and preview them.
target_hits = pd.read_csv(
    data_folder.joinpath('putative_targets.csv'),
    index_col='query',
)
target_hits.head()

Unnamed: 0_level_0,target,evalue,bitscore,target_start,target_end,query_accession,query_protein_id,target_accession,target_domain,pfam_domain,...,gtdb_family,gtdb_genus,gtdb_species,ncbi_organism_name,pfam_domains,tigr_domains,biomes,is_self,self_score,normalized_score
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_001412335.1@KQC03133.1$PG_binding_3:75-146,KQC03133.1@GCA_001412335.1,5.7e-43,147.7,74,146,GCA_001412335.1,KQC03133.1,GCA_001412335.1,Archaea,PG_binding_3,...,Methanoregulaceae,SD8,SD8 sp001412335,Methanolinea sp. SDB,Glyco_hydro_108+PG_binding_3,,,True,147.7,100.0
GCA_001412335.1@KQC03133.1$PG_binding_3:75-146,JAEZHY010000039.1_63@GCA_023416335.1,3.7000000000000003e-17,66.2,87,158,GCA_001412335.1,KQC03133.1,GCA_023416335.1,Bacteria,PG_binding_3,...,Syntrophobacteraceae,SbD1,SbD1 sp023416335,Proteobacteria bacterium,Glyco_hydro_108+PG_binding_3,,,False,147.7,44.82
GCA_001412335.1@KQC03133.1$PG_binding_3:75-146,SPD73798.1@GCA_900258555.1,2.3e-16,63.6,87,157,GCA_001412335.1,KQC03133.1,GCA_900258555.1,Bacteria,PG_binding_3,...,Desulfatiglandaceae,UBA5623,UBA5623 sp900258555,uncultured Desulfobacterium sp.,Glyco_hydro_108+PG_binding_3,,,False,147.7,43.06
GCA_001412335.1@KQC03133.1$PG_binding_3:75-146,DIKH01000035.1_16@GCA_002424495.1,2.4e-16,63.6,87,157,GCA_001412335.1,KQC03133.1,GCA_002424495.1,Bacteria,PG_binding_3,...,Desulfatiglandaceae,UBA5623,UBA5623 sp002424495,Desulfobacteraceae bacterium UBA5623,Glyco_hydro_108+PG_binding_3,,,False,147.7,43.06
GCA_001412335.1@KQC03133.1$PG_binding_3:75-146,MCE5336962.1@GCA_021372055.1,3.1e-16,63.3,89,161,GCA_001412335.1,KQC03133.1,GCA_021372055.1,Bacteria,PG_binding_3,...,Syntrophobacteraceae,SbD1,SbD1 sp021372055,Desulfobacteraceae bacterium,Glyco_hydro_108+PG_binding_3,,,False,147.7,42.86


In [20]:
# Assembly accession of Halogranum salarium B-1; compared against the
# 'query_accession' column of target_hits to select this organism's queries.
h_salarium_accession = 'GCF_000283335.1'

In [21]:
# Sorted list of the distinct query identifiers (index labels of target_hits)
# whose query assembly is the H. salarium accession.
h_salarium_queries = sorted(
    target_hits.index[
        (target_hits['query_accession'] == h_salarium_accession).to_numpy()
    ].unique()
)
h_salarium_queries

['GCF_000283335.1@WP_009365375.1$Glucosaminidase:67-181',
 'GCF_000283335.1@WP_009365375.1$PG_binding_1:203-260',
 'GCF_000283335.1@WP_009365375.1$PG_binding_1:273-327',
 'GCF_000283335.1@WP_157572895.1$PG_binding_1:197-254',
 'GCF_000283335.1@WP_157572895.1$PG_binding_1:270-321',
 'GCF_000283335.1@WP_157572895.1$Peptidase_M23:59-156']

In [26]:
# All hits for the Peptidase_M23 domain query of H. salarium protein WP_157572895.1.
# Selecting with a list keeps the result a DataFrame even if only one row matches.
h_salarium_m23_hits = target_hits.loc[['GCF_000283335.1@WP_157572895.1$Peptidase_M23:59-156']]
# Discard targets whose identifier starts with 'MGYP'.
h_salarium_m23_hits = h_salarium_m23_hits[~h_salarium_m23_hits['target'].str.startswith('MGYP')].copy()

# Keep the first 1,500 remaining rows.
# NOTE(review): this presumes putative_targets.csv lists each query's hits
# best-first — confirm against the script that generates the file.
top_targets_m23 = h_salarium_m23_hits.iloc[:1500].copy()
top_targets_m23_targets = set(top_targets_m23['target'].unique())

# Collect the database records whose id is one of the selected targets.
to_save_m23 = [
    record
    for record in SeqIO.parse(data_folder / 'pgh_db.fasta', 'fasta')
    if record.id in top_targets_m23_targets
]

# Create the output folder if it is missing so the cell also runs on a fresh checkout.
m23_fasta_path = data_folder / 'figure2' / 'peptidase_M23.fasta'
m23_fasta_path.parent.mkdir(parents=True, exist_ok=True)
with m23_fasta_path.open('w') as f_out:
    # SeqIO.write returns the number of records written; print it as a sanity check.
    print(SeqIO.write(to_save_m23, f_out, 'fasta'))

1500


A tree is created from these sequences using script `src/tree/make_tree.sh`.

Tree is available at `data/figure2/peptidase_M23.treefile`.

The tree is displayed and annotated with software [iTOL](https://itol.embl.de).

The final figure is a zoomed-in view of the full tree: `figures/figure2/Peptidase_M23_tree.pdf`.

### Archaea with Peptidase M23 domain

In [30]:
pgh_df = pd.read_csv(data_folder / 'pgh_proteins.csv')

# Archaeal proteins whose domain architecture string contains Peptidase_M23.
m23_archaea = pgh_df.loc[
    pgh_df['domain'].eq('Archaea')
    & pgh_df['pgh_architecture'].str.contains('Peptidase_M23')
]
# Report the number of distinct assemblies among those proteins.
print(
    'Number of archaeal strains with Peptidase M23 proteins: '
    f"{m23_archaea['assembly_accession'].nunique(dropna=False):,}"
)
m23_archaea

Number of archaeal strains with Peptidase M23 proteins: 10


Unnamed: 0,assembly_accession,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species,ncbi_organism_name,protein_id,pgh_architecture
16,GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_049971528.1,PG_binding_1+Peptidase_M23
18,GCF_900156425.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus litoreus,Haladaptatus litoreus,WP_076427578.1,PG_binding_1+Peptidase_M23
29,GCF_020618475.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus sp020618475,Haladaptatus sp. PSR5,WP_255766967.1,PG_binding_1+Peptidase_M23
30,GCF_021083305.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus sp021083305,Haladaptatus sp. DYF46,WP_231184600.1,PG_binding_1+Peptidase_M23
52,GCF_000283335.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloferacaceae,Halogranum,Halogranum rubrum,Halogranum salarium B-1,WP_157572895.1,PG_binding_1+Peptidase_M23
129,GCA_023144115.1,Archaea,Nanoarchaeota,Nanoarchaeia,Woesearchaeales,JAGLGH01,JAGLGH01,JAGLGH01 sp023144115,Nanoarchaeota archaeon,MCK5630884.1,LysM+Peptidase_M23
131,GCA_016211165.1,Archaea,Nanoarchaeota,Nanoarchaeia,Woesearchaeales,JAGWAF01,JACQSU01,JACQSU01 sp016211165,Candidatus Woesearchaeota archaeon,MBI4449356.1,PG_binding_1+Peptidase_M23
134,GCA_007117065.1,Archaea,Nanoarchaeota,Nanoarchaeia,Woesearchaeales,SKIA01,SKIA01,SKIA01 sp007117065,Candidatus Woesearchaeota archaeon,SKHI01000019.1_5,PG_binding_3+Glyco_hydro_108+Peptidase_M23
136,GCA_002499185.1,Archaea,Nanoarchaeota,Nanoarchaeia,Pacearchaeales,UBA284,UBA284,UBA284 sp002499185,Candidatus Pacearchaeota archaeon UBA284,DAKN01000010.1_36,LysM+Peptidase_M23
148,GCA_002254765.1,Archaea,Thermoplasmatota,EX4484-6,EX4484-6,EX4484-36,EX4484-36,EX4484-36 sp002254765,Thermoplasmatales archaeon ex4484_36,OYT49404.1,LysM+Peptidase_M23


### Archaea with PGB1 domain

In [33]:
pgh_df = pd.read_csv(data_folder / 'pgh_proteins.csv')

# Archaeal proteins whose domain architecture string contains PG_binding_1.
# regex=False: the domain name is matched as a literal substring.
pgb1_archaea = pgh_df[
    (pgh_df['domain'] == 'Archaea') &
    (pgh_df['pgh_architecture'].str.contains('PG_binding_1', regex=False))
]
# Count distinct assemblies; the label names the domain that was filtered on.
print(f'Number of archaeal strains with PG_binding_1 proteins: {len(pgb1_archaea.assembly_accession.unique()):,}')
pgb1_archaea

Number of archaeal strains with Peptidase M23 proteins: 24


Unnamed: 0,assembly_accession,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species,ncbi_organism_name,protein_id,pgh_architecture
5,GCA_014729705.1,Archaea,Altiarchaeota,Altiarchaeia,IMC4,SCGC-AAA252-I15,WJJK01,WJJK01 sp014729705,Candidatus Altiarchaeales archaeon,MBD3260201.1,PG_binding_1+Glyco_hydro_25
12,GCA_016840645.1,Archaea,Asgardarchaeota,Sifarchaeia,Borrarchaeales,Borrarchaeaceae,Borrarchaeum,Borrarchaeum sp016840645,Candidatus Borrarchaeota archaeon,JAEORY010000361.1_1,PG_binding_1+Amidase_2
14,GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_049969648.1,PG_binding_1+Amidase_2
16,GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_049971528.1,PG_binding_1+Peptidase_M23
17,GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_158413663.1,PG_binding_1+Glucosaminidase
18,GCF_900156425.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus litoreus,Haladaptatus litoreus,WP_076427578.1,PG_binding_1+Peptidase_M23
20,GCF_900156425.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus litoreus,Haladaptatus litoreus,WP_076430381.1,PG_binding_1+Amidase_2
21,GCF_900156425.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus litoreus,Haladaptatus litoreus,WP_175609653.1,PG_binding_1+Glucosaminidase
23,GCF_020700235.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus pallidirubidus,Haladaptatus pallidirubidus,WP_227776313.1,PG_binding_1+Amidase_2
24,GCF_020700235.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus pallidirubidus,Haladaptatus pallidirubidus,WP_265338400.1,PG_binding_1+Glucosaminidase


In [37]:
# Per GTDB phylum: number of distinct assemblies and distinct proteins
# carrying a PG_binding_1 domain, phyla with the most assemblies first.
(
    pgb1_archaea
    .groupby('gtdb_phylum')[['assembly_accession', 'protein_id']]
    .nunique()
    .sort_values('assembly_accession', ascending=False)
)

Unnamed: 0_level_0,assembly_accession,protein_id
gtdb_phylum,Unnamed: 1_level_1,Unnamed: 2_level_1
Halobacteriota,12,18
Nanoarchaeota,4,8
Thermoplasmatota,3,3
Thermoproteota,3,3
Altiarchaeota,1,1
Asgardarchaeota,1,1
