## Analysis of pyani results: generation of ANI and TETRA heatmaps for *Rosenbergiella* spp.


Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

In [None]:
# Make sure the output directory for the pyani reports exists.
# os.makedirs creates any missing parent directories in a single call, and
# exist_ok=True makes this cell safe to re-run when the directory is already there.
os.makedirs('/home/sergio/TFM1/reports/pyani', exist_ok=True) # include here your preferred path

In [None]:
# Creation of a data frame from the ANI matrix
# Note that this matrix contains 106 rows and 106 columns (one per genome analyzed)

path1 = '/home/sergio/TFM1/pyani' # path with the outputs of pyani (ANI matrix)

# Read the matrix file directly instead of scanning the directory listing:
# if the file is missing, read_csv raises FileNotFoundError right here, rather
# than leaving df1 undefined (or stale from a previous run of the notebook).
# The file is tab-separated and its first column is used as the row index.
df1 = pd.read_csv(os.path.join(path1, 'ANIm_percentage_identity.tab'), sep='\t', index_col=0)

# To express all values as percentages with two decimal positions
df1 = (df1 * 100).round(2)
df1

In [None]:
# Creation of a data frame from the TETRA matrix

path2 = '/home/sergio/TFM1/pyani_tetra' # path with the outputs of pyani (TETRA matrix)

# Read the matrix file directly instead of scanning the directory listing:
# if the file is missing, read_csv raises FileNotFoundError right here, rather
# than leaving df2 undefined (or stale from a previous run of the notebook).
# The file is tab-separated and its first column is used as the row index.
df2 = pd.read_csv(os.path.join(path2, 'TETRA_correlations.tab'), sep='\t', index_col=0)

# To express all values with two decimal positions
df2 = df2.round(2)
df2

In [None]:
# Keep only the genomes whose label matches 'Rosenbergiella', on both axes of the ANI matrix
df1 = df1.filter(regex='Rosenbergiella', axis=0)
df1 = df1.filter(regex='Rosenbergiella', axis=1)

# Give one genome a shorter name; the same mapping is applied to row and column labels
ani_short_names = {
    'Rosenbergiella_nectarea_IMG-taxon_2651870171_annotated_assembly': 'Rosenbergiella_nectarea_8N4',
}
df1 = df1.rename(index=ani_short_names, columns=ani_short_names)

# Replace underscores with spaces in every label
df1.index = df1.index.str.replace('_', ' ')
df1.columns = df1.columns.str.replace('_', ' ')

# Alphabetical order: columns first, then rows
ani_sorted_columns = sorted(df1.columns)
df1 = df1[ani_sorted_columns]
df1 = df1.sort_index()

# Save the resulting matrix as a tab-separated file, keeping the row labels
df1.to_csv('/home/sergio/TFM1/reports/pyani/ani_Rbg_matrix.tsv', sep="\t", index=True)

In [None]:
# Keep only the genomes whose label matches 'Rosenbergiella', on both axes of the TETRA matrix
df2 = df2.filter(regex='Rosenbergiella', axis=0)
df2 = df2.filter(regex='Rosenbergiella', axis=1)

# Give one genome a shorter name; the same mapping is applied to row and column labels
tetra_short_names = {
    'Rosenbergiella_nectarea_IMG-taxon_2651870171_annotated_assembly': 'Rosenbergiella_nectarea_8N4',
}
df2 = df2.rename(index=tetra_short_names, columns=tetra_short_names)

# Replace underscores with spaces in every label
df2.index = df2.index.str.replace('_', ' ')
df2.columns = df2.columns.str.replace('_', ' ')

# Alphabetical order: columns first, then rows
tetra_sorted_columns = sorted(df2.columns)
df2 = df2[tetra_sorted_columns]
df2 = df2.sort_index()

# Save the resulting matrix as a tab-separated file, keeping the row labels
df2.to_csv('/home/sergio/TFM1/reports/pyani/TETRA_Rbg_matrix.tsv', sep="\t", index=True)

In [None]:
# Definition of masks to display numerical values only in the upper triangle

def _upper_triangle_labels(frame, decimals, full_value, full_label):
    """Return a string array with the annotations for a heatmap of `frame`.

    Cells on or above the main diagonal hold the value rounded to `decimals`
    positions; cells below the diagonal hold an empty string. Labels equal to
    `full_value` (e.g. '100.0') are shortened to `full_label` (e.g. '100').
    """
    values = np.round(frame.to_numpy(dtype=np.float64), decimals)
    labels = values.astype("U")
    labels[labels == full_value] = full_label
    # Blank the lower triangle by position rather than by looking for '0.0':
    # a genuine value in the upper triangle that rounds to 0.0 must stay visible.
    upper = np.triu(np.ones(values.shape, dtype=bool))
    labels[~upper] = ''
    return labels

mask1 = _upper_triangle_labels(df1, 1, '100.0', '100')  # ANI, one decimal position
mask2 = _upper_triangle_labels(df2, 2, '1.0', '1')      # TETRA, two decimal positions

In [None]:
# ANI (left panel) and TETRA (right panel) heatmaps drawn side by side in one figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,10))

# Settings common to both panels; fmt="s" formats the annotations as strings
heatmap_kws = dict(cmap="YlGnBu", fmt="s", square=True)

# Left panel: ANI values annotated with mask1, colour bar placed on top
g = sns.heatmap(
    df1,
    annot=mask1,
    cbar_kws={"label":"% ANI", "location":"top", "use_gridspec" : False},
    ax=ax1,
    **heatmap_kws,
)
g.set_xticklabels(g.get_xticklabels(), rotation=45, horizontalalignment='right')

# Right panel: TETRA values annotated with mask2; the row labels are not drawn here
g2 = sns.heatmap(
    df2,
    annot=mask2,
    yticklabels=False,
    cbar_kws={"label":"TETRA", "location":"top", "use_gridspec" : False},
    ax=ax2,
    **heatmap_kws,
)
g2.set_xticklabels(g2.get_xticklabels(), rotation=45, horizontalalignment='right')

# Export the whole figure as a PDF, trimming the surrounding white space
fig.savefig('/home/sergio/TFM1/reports/pyani/ani_TETRA_Rbg_heatmap.pdf', bbox_inches='tight')