In [110]:
import os
from pathlib import Path
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import Phylo

# The `src` imports and the relative ./data and ./figures paths used below are
# resolved from the working directory, so step up one level when the kernel
# was started in a directory whose path ends with 'notebook'.
if os.getcwd().endswith('notebook'):
    os.chdir(os.pardir)
cwd = os.getcwd()

from src.tree.itol_annotation import itol_labels, itol_color_strip_annotations, itol_binary_annotations
from src.tree.tree_util import prune_leaves_with_unknown_id
from src.utils.palette import palette_20

In [100]:
# --- Plot styling applied to the whole notebook ---
sns.set_palette('colorblind')
sns.set_style('whitegrid')
sns.set_context('paper', font_scale=1.8)
plt.rc('font', family='Helvetica')

# Hex codes of the currently active seaborn palette (set just above).
palette = sns.color_palette().as_hex()

# --- Input / output locations, relative to the working directory ---
data_folder = Path('./data')
figures_folder = Path('./figures')

# Both folders must already exist; fail here rather than in a later cell.
assert all(folder.is_dir() for folder in (data_folder, figures_folder))

# Haloarchaeal PGH tree

## Make Halobacteriota subtree from the wider archaeal GTDB tree

In [101]:
# GTDB genome metadata, indexed by NCBI assembly accession (GCA_… / GCF_…).
gtdb_metadata = pd.read_csv(data_folder / 'gtdb_metadata.csv', index_col='ncbi_accession')

# Keep only genomes assigned to the phylum Halobacteriota; .copy() makes the
# subset an independent frame rather than a slice of `gtdb_metadata`.
halo_genomes = gtdb_metadata[gtdb_metadata['gtdb_phylum'] == 'Halobacteriota'].copy()

# Later cells look up single values with `halo_genomes.loc[accession, column]`,
# which only returns a scalar when every accession occurs exactly once.
assert halo_genomes.index.is_unique, (
    'Duplicate ncbi_accession values found in the Halobacteriota metadata'
)

halo_genome_accessions = set(halo_genomes.index)

print(f'Number of genomes in Halobacteriota: {len(halo_genomes):,}')
halo_genomes.head()

Number of genomes in Halobacteriota: 739


Unnamed: 0_level_0,accession,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,coding_density,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
ncbi_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000224475.1,GB_GCA_000224475.1,0,97.28,1.71,417,f__Halobacteriaceae (UID96),263,0.0,3138251,86.140952,...,19,46,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloferacaceae,Halolamina,Halolamina sp000224475
GCA_000306725.1,GB_GCA_000306725.1,0,99.84,0.0,228,p__Euryarchaeota (UID49),153,0.0,2638662,85.872449,...,19,53,0,Archaea,Halobacteriota,Methanosarcinia,Methanosarcinales,Methanosarcinaceae,Methanolobus,Methanolobus psychrophilus
GCA_000336895.1,GB_GCA_000336895.1,58,99.92,0.4,395,f__Halobacteriaceae (UID85),250,0.0,3601508,86.843865,...,19,66,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloarculaceae,Haloarcula,Haloarcula argentinensis
GCA_000337115.1,GB_GCA_000337115.1,127,98.8,0.0,417,f__Halobacteriaceae (UID96),263,0.0,3319737,85.224715,...,19,48,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Natrialbaceae,Natrinema,Natrinema thermotolerans
GCA_000415965.1,GB_GCA_000415965.1,0,91.51,0.38,417,f__Halobacteriaceae (UID96),263,100.0,2516031,72.393333,...,17,44,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloferacaceae,Haloquadratum,Haloquadratum walsbyi_A


In [102]:
# Inspect the Halogranum genomes: GCF_000283335.1 (GTDB species
# 'Halogranum rubrum') is given a custom leaf label in the labels cell below.
halo_genomes.loc[halo_genomes['gtdb_genus'].eq('Halogranum')]

Unnamed: 0_level_0,accession,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,coding_density,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species
ncbi_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_000283335.1,RS_GCF_000283335.1,0,98.61,0.76,417,f__Halobacteriaceae (UID96),263,0.0,3886827,86.521866,...,19,67,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloferacaceae,Halogranum,Halogranum rubrum
GCF_900103715.1,RS_GCF_900103715.1,0,99.57,0.76,417,f__Halobacteriaceae (UID96),263,0.0,3302031,87.582685,...,19,53,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloferacaceae,Halogranum,Halogranum gelatinilyticum
GCF_900110465.1,RS_GCF_900110465.1,0,99.38,1.77,417,f__Halobacteriaceae (UID96),263,0.0,4347805,83.842362,...,19,59,0,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haloferacaceae,Halogranum,Halogranum amylolyticum


In [108]:
# Table of PGH proteins, one row per protein.
pgh_df = pd.read_csv(data_folder / 'pgh_proteins.csv')

# Restrict to Halobacteriota and key the rows by the assembly each protein
# comes from. One assembly can carry several PGH proteins, so this index is
# not unique.
is_halobacteriota = pgh_df['gtdb_phylum'] == 'Halobacteriota'
halo_pgh = pgh_df.loc[is_halobacteriota].set_index('assembly_accession')

# Distinct assemblies that have at least one PGH protein.
halo_pgh_accessions = set(halo_pgh.index)

print(f'Number of PGH proteins in Halobacteriota: {len(halo_pgh):,}')
halo_pgh.head()

Number of PGH proteins in Halobacteriota: 59


Unnamed: 0_level_0,domain,gtdb_phylum,gtdb_class,gtdb_order,gtdb_family,gtdb_genus,gtdb_species,ncbi_organism_name,protein_id,pgh_architecture
assembly_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_049969648.1,PG_binding_1+Amidase_2
GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_049969681.1,SH3_3+NLPC_P60
GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_049971528.1,PG_binding_1+Peptidase_M23
GCF_000710615.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus cibarius,Haladaptatus cibarius D43,WP_158413663.1,PG_binding_1+Glucosaminidase
GCF_900156425.1,Archaea,Halobacteriota,Halobacteria,Halobacteriales,Haladaptataceae,Haladaptatus,Haladaptatus litoreus,Haladaptatus litoreus,WP_076427578.1,PG_binding_1+Peptidase_M23


In [104]:
# Read the full archaeal reference tree.
# NOTE(review): the file is parsed as PhyloXML; GTDB normally distributes
# ar53 trees as Newick, so confirm this file was converted beforehand.
archaeal_tree = Phylo.read(data_folder / 'ar53.tree', 'phyloxml')

# Leaf names carry a GB_ / RS_ prefix in front of the assembly accession.
# Strip it only when it is actually present, so that a leaf without the
# prefix is left intact instead of losing its first three characters.
for leaf in archaeal_tree.get_terminals():
    leaf.name = re.sub(r'^(?:GB|RS)_', '', leaf.name)

# Keep only the Halobacteriota leaves.
# NOTE(review): whether the helper prunes `archaeal_tree` in place or returns
# a copy is not visible from this notebook.
halo_tree = prune_leaves_with_unknown_id(archaeal_tree, halo_genome_accessions)
halo_tree_accessions = sorted([leaf.name for leaf in halo_tree.get_terminals()])

# Every genome in the metadata subset must be present as a leaf, otherwise the
# annotation files written below would reference leaves that do not exist.
missing_from_tree = set(halo_genome_accessions) - set(halo_tree_accessions)
assert len(missing_from_tree) == 0, (
    f'{len(missing_from_tree):,} Halobacteriota genomes are missing from the tree, '
    f'e.g. {sorted(missing_from_tree)[:5]}'
)

# Make sure the output folder for this figure exists before writing into it.
(data_folder / 'figureS1').mkdir(parents=True, exist_ok=True)

# Trailing semicolon suppresses the return value in the cell output.
Phylo.write([halo_tree], data_folder / 'figureS1' / 'halo.tree', 'phyloxml');

## Make annotations

## Leaf names

In [105]:
# Leaf labels default to the GTDB species name. Accessions listed here get a
# hand-picked label instead.
# NOTE(review): GTDB names GCF_000283335.1 'Halogranum rubrum'; the override
# presumably reflects the NCBI organism/strain name - confirm.
label_overrides = {
    'GCF_000283335.1': 'Halogranum salarium B-1',
}

# Iterate in sorted order: iterating the set directly gives a row order that
# can change between kernel sessions, which makes the written file
# non-reproducible.
labels = []
for accession in sorted(halo_genome_accessions):
    if accession in label_overrides:
        name = label_overrides[accession]
    else:
        name = halo_genomes.loc[accession, 'gtdb_species']

    labels.append([accession, name])

itol_labels(labels, data_folder / 'figureS1' / 'halo_labels.txt')

## GTDB class colour strip

In [106]:
# Distinct GTDB classes in order of first appearance in the metadata; the
# position of a class in this list selects its colour in the next cell.
classes = pd.unique(halo_genomes['gtdb_class']).tolist()
classes

['Halobacteria',
 'Methanosarcinia',
 'Archaeoglobi',
 'Methanomicrobia',
 'Syntropharchaeia',
 'Methanonatronarchaeia',
 'Methanosarcinia_A',
 'Methanocellia',
 'UBA148',
 'Bog-38',
 'Methanoliparia']

In [107]:
# Each class takes the palette colour at its position in `classes`, so the
# palette must provide at least one colour per class.
assert len(classes) <= len(palette_20), (
    f'{len(classes)} GTDB classes but only {len(palette_20)} colours in palette_20'
)
class_colours = {
    class_name: palette_20[position]
    for position, class_name in enumerate(classes)
}

# One [accession, colour, class] row per genome. Sorted iteration keeps the
# row order of the written file stable between kernel sessions (plain set
# iteration does not).
class_data = []
for accession in sorted(halo_genome_accessions):
    gtdb_class = halo_genomes.loc[accession, 'gtdb_class']
    class_data.append([accession, class_colours[gtdb_class], gtdb_class])

itol_color_strip_annotations(
    data=class_data,
    output_path=data_folder / 'figureS1' / 'halo_class.txt',
    dataset_label='GTDB class',
    legend_title='GTDB class',
    color_branches=True,
)

## Annotate genomes with PGH presence/absence

In [111]:
# PGH assemblies that are not in the Halobacteriota metadata cannot be placed
# on the tree. Report them instead of dropping them silently (for example, an
# assembly may appear under a different accession in the two tables).
unmatched_pgh_accessions = halo_pgh_accessions - halo_genome_accessions
if unmatched_pgh_accessions:
    print(
        f'Warning: {len(unmatched_pgh_accessions):,} assemblies with PGH proteins '
        'are not in the Halobacteriota metadata and will not be annotated'
    )

# One [accession, flag] row per genome: '1' when the assembly has at least one
# PGH protein, '-1' otherwise.
# NOTE(review): in iTOL's binary dataset format 1 draws a filled shape and -1
# omits the shape; this assumes the helper writes the values through unchanged.
# Sorted iteration keeps the row order of the written file stable between
# kernel sessions (plain set iteration does not).
pgh_data = []
for accession in sorted(halo_genome_accessions):
    label = '1' if accession in halo_pgh_accessions else '-1'
    pgh_data.append([accession, label])

itol_binary_annotations(
    data=pgh_data,
    output_path=data_folder / 'figureS1' / 'halo_pgh.txt',
    field_shapes=[1],  # presumably an iTOL shape code - confirm against the helper
    field_labels=['PGH'],
    dataset_label='PGH proteins',
    field_colors=[palette_20[12]],
    legend_title='PGH proteins',
)