## Analysis of CompareM results: generation of AAI and OF heatmaps for *Rosenbergiella* spp.

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import seaborn as sns

In [None]:
# Create the output folders for the CompareM reports (include here your preferred path).
# os.makedirs also creates any missing parent folder, and exist_ok=True makes the
# cell safe to re-run without a separate (race-prone) existence check.
os.makedirs('/home/sergio/TFM1/reports/comparem/', exist_ok=True)

In [None]:
path = '/home/sergio/TFM1/comparem/' # path to the folder containing the outputs of CompareM

Rbg = 'Rosenbergiella'

# One [genomeA, genomeB, mean_AAI, OF] record (all strings) per pairwise
# comparison in which both genomes belong to the genus stored in Rbg
mylist_Rbg = []

# The genus is taken as the leading run of uppercase letters followed by
# lowercase letters at the start of the genome name; compiled once, outside the loops
genus_pattern = re.compile(r"([A-Z]*[a-z]*)")

for foldername in os.listdir(path):
    # Only subfolders that actually contain an 'aai_summary.tsv' file are parsed
    summary_file = os.path.join(path, foldername, 'aai_summary.tsv')
    if not os.path.isfile(summary_file):
        continue

    with open(summary_file) as f:
        next(f, None)  # skip the header line (tolerates an empty file)
        for line in f:
            # Drop the line terminator so that the last field (OF) is clean
            line = line.rstrip("\r\n")
            if not line:
                continue  # ignore blank lines

            # Each data line must have exactly eight tab-separated fields;
            # a malformed line raises ValueError here
            genomeA, genes_in_A, genomeB, genes_in_B, orthologous_genes, mean_AAI, std_AAI, OF = line.split("\t")

            genusA = genus_pattern.match(genomeA).group(1)
            genusB = genus_pattern.match(genomeB).group(1)

            if genusA == Rbg and genusB == Rbg:
                mylist_Rbg.append([genomeA, genomeB, mean_AAI, OF])

In [None]:
# Gather the two genome names (first and second field) of every stored comparison
names_seen = [name for element in mylist_Rbg for name in (element[0], element[1])]

# Keep each genome name only once (the resulting order is arbitrary)
genome_names_Rbg = list(set(names_seen))

In [None]:
# Build the symmetric genome-by-genome AAI matrix.
# Cells are written with .loc[row, column]: chained indexing
# (df[column][row] = value) may assign into a temporary copy and is silently
# ignored when pandas runs with Copy-on-Write.
aai_Rbg = pd.DataFrame(columns = genome_names_Rbg, index = genome_names_Rbg)
for genome_a, genome_b, mean_aai, _of in mylist_Rbg:
    aai_value = np.float64(mean_aai)
    # Each pair is stored in both orientations so the matrix is symmetric
    aai_Rbg.loc[genome_b, genome_a] = aai_value
    aai_Rbg.loc[genome_a, genome_b] = aai_value
# Every genome is set to 100 against itself (main diagonal)
for i in aai_Rbg.index:
    aai_Rbg.loc[i, i] = np.int64(100)

In [None]:
# Shorten the long name of one genome, both as column and as row label
short_names = {'Rosenbergiella_nectarea_IMG-taxon_2651870171_annotated_assembly': 'Rosenbergiella_nectarea_8N4'}
aai_Rbg = aai_Rbg.rename(columns=short_names, index=short_names)

# Show the genome names with spaces instead of underscores
aai_Rbg.columns = aai_Rbg.columns.str.replace('_', ' ')
aai_Rbg.index = aai_Rbg.index.str.replace('_', ' ')

# Alphabetical order: first the columns, then the rows
ordered_columns = sorted(aai_Rbg.columns)
aai_Rbg = aai_Rbg[ordered_columns]
aai_Rbg = aai_Rbg.sort_index()
aai_Rbg

In [None]:
# Save the AAI matrix, row labels included, as a tab-separated file
aai_matrix_tsv = '/home/sergio/TFM1/reports/comparem/aai_Rbg_matrix.tsv'
aai_Rbg.to_csv(aai_matrix_tsv, sep="\t", index=True)

In [None]:
# AAI heatmap, drawn from the matrix previously exported as .tsv
# (the first column of the file holds the row labels)
aai_matrix_file = '/home/sergio/TFM1/reports/comparem/aai_Rbg_matrix.tsv'
df1b = pd.read_csv(aai_matrix_file, sep='\t', index_col=0)

fig = plt.figure(figsize=(6, 6))
ax = fig.subplots()
g = sns.heatmap(df1b, cmap="YlGnBu", square=True, ax=ax)

# Tilt the x-axis labels by 45 degrees, right-aligned
x_labels = g.get_xticklabels()
g.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')

In [None]:
# Build the symmetric genome-by-genome OF matrix.
# Cells are written with .loc[row, column]: chained indexing
# (df[column][row] = value) may assign into a temporary copy and is silently
# ignored when pandas runs with Copy-on-Write.
of_Rbg = pd.DataFrame(columns = genome_names_Rbg, index = genome_names_Rbg)
for genome_a, genome_b, _mean_aai, of in mylist_Rbg:
    of_value = np.float64(of)
    # Each pair is stored in both orientations so the matrix is symmetric
    of_Rbg.loc[genome_b, genome_a] = of_value
    of_Rbg.loc[genome_a, genome_b] = of_value
# Every genome is set to 100 against itself (main diagonal)
for i in of_Rbg.index:
    of_Rbg.loc[i, i] = np.int64(100)

In [None]:
# Shorten the long name of one genome, both as column and as row label
short_names = {'Rosenbergiella_nectarea_IMG-taxon_2651870171_annotated_assembly': 'Rosenbergiella_nectarea_8N4'}
of_Rbg = of_Rbg.rename(columns=short_names, index=short_names)

# Show the genome names with spaces instead of underscores
of_Rbg.columns = of_Rbg.columns.str.replace('_', ' ')
of_Rbg.index = of_Rbg.index.str.replace('_', ' ')

# Alphabetical order: first the columns, then the rows
ordered_columns = sorted(of_Rbg.columns)
of_Rbg = of_Rbg[ordered_columns]
of_Rbg = of_Rbg.sort_index()

# Divide every value by 100 (the diagonal, set to 100 earlier, becomes 1)
of_Rbg = of_Rbg / 100

In [None]:
# Save the OF matrix, row labels included, as a tab-separated file
of_matrix_tsv = '/home/sergio/TFM1/reports/comparem/OF_Rbg_matrix.tsv'
of_Rbg.to_csv(of_matrix_tsv, sep="\t", index=True)

In [None]:
# OF heatmap, drawn from the matrix previously exported as .tsv
# (the first column of the file holds the row labels)
of_matrix_file = '/home/sergio/TFM1/reports/comparem/OF_Rbg_matrix.tsv'
df2b = pd.read_csv(of_matrix_file, sep='\t', index_col=0)

fig = plt.figure(figsize=(6, 6))
ax = fig.subplots()
g2 = sns.heatmap(df2b, cmap="YlGnBu", square=True, ax=ax)

# Tilt the x-axis labels by 45 degrees, right-aligned
x_labels = g2.get_xticklabels()
g2.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')

In [None]:
# Text annotations for the heatmaps: the numerical values are kept only in the
# upper triangle (diagonal included); np.triu zeroes the lower triangle and
# those zeros are then replaced by empty strings

# AAI: one decimal place; '100.0' is shortened to '100'
aai_upper = np.triu(aai_Rbg).astype(np.float64)
mask1 = np.round(aai_upper, 1).astype("U")
mask1[mask1 == '0.0'] = ''
mask1[mask1 == '100.0'] = '100'

# OF: two decimal places; '1.0' is shortened to '1'
of_upper = np.triu(of_Rbg).astype(np.float64)
mask2 = np.round(of_upper, 2).astype("U")
mask2[mask2 == '0.0'] = ''
mask2[mask2 == '1.0'] = '1'

In [None]:
# Both heatmaps side by side in a single figure, each with its colour bar on top
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10,10))

aai_cbar = {"label":"% AAI", "location":"top", "use_gridspec" : False}
of_cbar = {"label":"OF", "location":"top", "use_gridspec" : False}
tick_style = dict(rotation=45, horizontalalignment='right')

# Left panel: AAI, annotated with the pre-formatted strings in mask1
g = sns.heatmap(df1b, cmap="YlGnBu", annot=mask1, fmt="s", square=True, cbar_kws=aai_cbar, ax=ax1)
g.set_xticklabels(g.get_xticklabels(), **tick_style)

# Right panel: OF, annotated with mask2; y tick labels are switched off here
g2 = sns.heatmap(df2b, cmap="YlGnBu", annot=mask2, fmt="s", square=True, yticklabels=False, cbar_kws=of_cbar, ax=ax2)
g2.set_xticklabels(g2.get_xticklabels(), **tick_style)

# Export the combined figure as PDF
combined_pdf = '/home/sergio/TFM1/reports/comparem/aai_of_Rbg_heatmap.pdf'
fig.savefig(combined_pdf, bbox_inches='tight')