# Streptococcus tree

- Genomes: all complete *Streptococcus* genomes from GTDB r220
- Tree: built with [GTDB-Tk de-novo](https://ecogenomics.github.io/GTDBTk/commands/de_novo_wf.html) pipeline.

Below is the code to annotate the tree with the presence or absence of different cell wall binding domains.

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
# `plt` is the conventional alias for matplotlib.pyplot; binding the top-level
# `matplotlib` package to that name makes calls such as `plt.subplots()` fail.
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import Phylo

# If the working directory path ends with 'notebook', step one level up before
# the `src` imports below (presumably so that they, and the relative data paths
# used later, resolve from the project root - TODO confirm).
cwd = os.getcwd()
if cwd.endswith('notebook'):
    os.chdir('..')
    cwd = os.getcwd()

from src.cell_wall_binding_domains import cwb_domains
from src.tree.tree_util import prune_leaves_with_unknown_id
from src.tree.itol_annotation import itol_labels, itol_colored_ranges, itol_binary_annotations, hex_to_rgba

In [3]:
# Global seaborn styling, plus hex colour lists for the two palettes used below.
sns.set_theme(palette='colorblind', font_scale=1.3)
palette_colorblind, palette_pastel = (
    sns.color_palette(palette_name).as_hex() for palette_name in ('colorblind', 'pastel')
)

# Input locations, all relative to the current working directory.
data_folder = Path('./data/')
db_proka = Path('../db_proka/')
gtdb_folder = Path('../data/gtdb_r220/')
strep_folder = gtdb_folder / 'Streptococcus'

# Stop right away if any of the expected directories is missing.
for required_folder in (data_folder, db_proka, gtdb_folder, strep_folder):
    assert required_folder.is_dir()

## Load metadata

In [4]:
# Genome metadata table, indexed by the 'assembly_accession' column.
metadata_path = strep_folder / 'genomes_metadata.csv'
metadata_df = pd.read_csv(metadata_path, index_col='assembly_accession')
metadata_df.shape[0]

1228

## Load & prune tree

In [5]:
# Parse the decorated GTDB-Tk tree, stored in Newick format.
raw_tree_path = strep_folder / 'GTDB-Tk_de_novo' / 'Streptococcus.bac120.decorated.tree'
raw_tree = Phylo.read(raw_tree_path, 'newick')

# Leaves whose name starts with 'GC' are renamed to the first two
# underscore-separated fields of that name; the shortened ids are collected.
strep_ids = set()
for leaf in raw_tree.get_terminals():
    if not leaf.name.startswith('GC'):
        continue
    strep_id = '_'.join(leaf.name.split('_')[:2])
    leaf.name = strep_id
    strep_ids.add(strep_id)

# Expect exactly as many distinct ids as there are metadata rows.
assert len(strep_ids) == len(metadata_df)

tree = prune_leaves_with_unknown_id(raw_tree, strep_ids)

# After pruning, the number of leaves must also match the metadata row count.
assert sum(1 for _ in tree.get_terminals()) == len(metadata_df)

def label_common_ancestor(tree, leaf_ids, label):
    """Name the common ancestor of the given leaves.

    Args:
        tree: Tree object exposing `find_any(name)` and `common_ancestor(clades)`
            (a Bio.Phylo tree in this notebook).
        leaf_ids: Iterable of leaf names to look up in `tree`.
        label: Name assigned to the clade returned by `tree.common_ancestor`.

    Raises:
        ValueError: If `leaf_ids` is empty, or if at least one id has no
            matching clade in `tree`.
    """
    leaf_ids = list(leaf_ids)
    if not leaf_ids:
        raise ValueError(f'Cannot label {label!r}: no leaf ids were given')

    leaf_clades = [tree.find_any(name) for name in leaf_ids]

    # find_any yields None when nothing matches; report the offending ids here
    # instead of handing None to common_ancestor.
    missing_ids = [name for name, clade in zip(leaf_ids, leaf_clades) if clade is None]
    if missing_ids:
        raise ValueError(
            f'Cannot label {label!r}: {len(missing_ids)} leaf id(s) not found in tree: {missing_ids}'
        )

    common_ancestor = tree.common_ancestor(leaf_clades)
    common_ancestor.name = label

# The ten most frequent 'gtdb_species' values (value_counts orders by descending count).
top_10_strep_species = metadata_df['gtdb_species'].value_counts().head(10).index.tolist()
for species in top_10_strep_species:
    ids = metadata_df[metadata_df['gtdb_species'] == species].index.tolist()
    label_common_ancestor(tree, ids, f's__{species}')

strep_tree_path = strep_folder / 'tree' / 'Streptococcus.tree'
# Make sure the output folder exists so that a fresh run does not fail on the write below.
strep_tree_path.parent.mkdir(parents=True, exist_ok=True)
with strep_tree_path.open('w') as f_out:
    Phylo.write([tree], f_out, 'phyloxml')

# Re-read the tree from the file that was just written.
tree = Phylo.read(strep_tree_path, 'phyloxml')

## Annotate

### Labels

In [6]:
# One [accession, display label] pair per metadata row; the label is the
# organism name followed by the accession in square brackets.
organism_names = metadata_df['ncbi_organism_name']
labels = [
    [accession, f'{organism_names.loc[accession]} [{accession}]']
    for accession in metadata_df.index
]

itol_labels(labels, strep_folder / 'tree' / 'labels.txt')

### Colored ranges

Covering the ten most represented *Streptococcus* species (by number of genomes).

In [7]:
# One range row per species: the internal node named 's__<species>' is used as
# both start and end of the range, filled with a pastel colour at alpha 0.25.
# Species names in the list are distinct (they come from a value_counts index),
# so the position from enumerate equals the position list.index would return.
strep_species_colored_ranges = []
for color_index, species in enumerate(top_10_strep_species):
    node_id = f's__{species}'
    color = palette_pastel[color_index]
    range_row = [node_id, node_id, hex_to_rgba(color, 0.25), '', '', '', '', species, '', '30', 'bold-italic']
    strep_species_colored_ranges.append(range_row)

itol_colored_ranges(
    strep_species_colored_ranges,
    output_path=strep_folder / 'tree' / 'species_colored_range.txt',
    range_type='box',
    range_cover='tree',
    dataset_label='Species',
)

### Binary annotations: presence or absence of CWB domains

In [8]:
# Per-genome table of cell wall binding domain columns, indexed by 'assembly_accession'.
cwb_bins_path = strep_folder / 'Streptococcus_cell_wall_binding.csv'
cwb_bins = pd.read_csv(cwb_bins_path, index_col='assembly_accession')
cwb_bins.head()

Unnamed: 0_level_0,gtdb_species,PG_binding_1,PG_binding_2,PG_binding_3,AMIN,SPOR,SH3_1,SH3_2,SH3_3,SH3_4,...,Choline_bind_1,Choline_bind_2,Choline_bind_3,CW_binding_2,CW_7,PSA_CBD,ZoocinA_TRD,GW,OapA,WxL
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000013525.1,Streptococcus pyogenes,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCA_000014305.1,Streptococcus suis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCA_000014325.1,Streptococcus suis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCA_000188715.1,Streptococcus dysgalactiae,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCA_000211095.1,Streptococcus pneumoniae,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0


In [9]:
# Column sum of every domain, in the order given by `cwb_domains`;
# only domains with a strictly positive total are kept.
cwb_totals = [(cwb, cwb_bins[cwb].sum()) for cwb in cwb_domains]
cwb_present_in_strep_with_count = [(cwb, int(total)) for cwb, total in cwb_totals if total > 0]

# Highest total first; the sort is stable, so ties keep their relative order.
ranked_cwb = sorted(cwb_present_in_strep_with_count, key=lambda pair: pair[1], reverse=True)
cwb_present_in_strep = [cwb for cwb, _ in ranked_cwb]
cwb_present_in_strep

['LysM',
 'SH3_5',
 'Choline_bind_3',
 'Choline_bind_1',
 'Choline_bind_2',
 'ZoocinA_TRD',
 'CW_7',
 'GW',
 'SH3_3']

In [10]:
# Domain identifier -> display name. Several identifiers may share one display name.
cwb_to_name = {
    'LysM': 'Lysine Motif (LysM)',
    'SH3_5': 'Src Homology 3 (SH3)',
    'Choline_bind_3': 'Choline binding domain',
    'Choline_bind_1': 'Choline binding domain',
    'Choline_bind_2': 'Choline binding domain',
    'ZoocinA_TRD': 'Zoocin A target recognition domain',
    'CW_7': 'Cell wall binding motif 7 (CW_7)',
    'GW': 'GlyTryp dipeptide domain (GW)',
    'SH3_3': 'Src Homology 3 (SH3)',
}

# Reverse mapping: display name -> list of identifiers, filled in sorted
# identifier order so that each list is itself sorted.
name_to_cwb = {}
for cwb in sorted(cwb_to_name):
    name_to_cwb.setdefault(cwb_to_name[cwb], []).append(cwb)

In [11]:
# Display names in order of first appearance, each listed once
# (dict keys keep insertion order and drop repeats).
final_cwb_list = list(dict.fromkeys(cwb_to_name[cwb] for cwb in cwb_present_in_strep))

final_cwb_list

['Lysine Motif (LysM)',
 'Src Homology 3 (SH3)',
 'Choline binding domain',
 'Zoocin A target recognition domain',
 'Cell wall binding motif 7 (CW_7)',
 'GlyTryp dipeptide domain (GW)']

In [12]:
# One row per accession (sorted): the accession followed by one flag per display
# name, '1' when any identifier grouped under that name has a value > 0 and
# '-1' otherwise.
cwb_binary_presence_data = []
for accession in sorted(cwb_bins.index):
    presence_row = [accession]
    for cwb_label in final_cwb_list:
        # Build the full list first so every grouped column is looked up, as in a plain loop.
        hits = [cwb_bins.loc[accession, cwb] > 0 for cwb in name_to_cwb[cwb_label]]
        presence_row.append('1' if any(hits) else '-1')
    cwb_binary_presence_data.append(presence_row)

n_domains = len(final_cwb_list)
field_shapes = [1] * n_domains
field_colors = [palette_colorblind[position] for position in range(n_domains)]

itol_binary_annotations(
    data=cwb_binary_presence_data,
    output_path=strep_folder / 'tree' / 'cwb_binary_presence.txt',
    field_shapes=field_shapes,
    field_labels=final_cwb_list,
    dataset_label='CWB',
    field_colors=field_colors,
    legend_title='Cell wall binding domains',
    height_factor=10,
)