## Analysis of GGDC results: generation of an *is*DDH heatmap for *Rosenbergiella* spp.

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Create the output directory for the GGDC reports (include here your preferred path).
# os.makedirs creates any missing parent directory as well, and exist_ok=True
# makes the call a no-op when the directory is already there, so no separate
# (race-prone) os.path.exists() check is needed.
os.makedirs('/home/sergio/TFM1/reports/ggdc/', exist_ok=True)

In [None]:
# Generation of a data frame from the isDDH results
#
# Every file in `path` whose name ends in ".csv" is read line by line. The
# first two lines of each file are skipped (presumably header rows -- confirm
# against the GGDC output format) and every remaining non-blank line must hold
# exactly 15 comma-separated fields; any other field count raises ValueError.
# All values are stored as the strings read from the file.

path = '/home/sergio/TFM1/GGDC/' # path with the outputs of the Genome-to-Genome Distance Calculator (GGDC)

column_names = ['Index','Query genome','Reference genome',
                'DDH 1','Model C.I. 1','Distance 1','Prob. DDH >= 70% 1',
                'DDH 2','Model C.I. 2','Distance 2','Prob. DDH >= 70% 2',
                'DDH 3','Model C.I. 3','Distance 3','Prob. DDH >= 70% 3','G+C difference']

# Captures everything before the last ".fa" (".fa", ".fasta", ...). The dot is
# escaped so that a name merely containing "fa" is not truncated.
fasta_extension = re.compile(r'(.+)\.fa')


def _strip_fasta_extension(name):
    """Return `name` without its ".fa*" suffix, or unchanged if it has none."""
    found = fasta_extension.match(name)
    return found.group(1) if found else name


mydict = {}        # row number (starting at 1) -> {column name: value}
genome_names = []

n = 0
# sorted() gives a reproducible row order; os.listdir() order is arbitrary
for filename in sorted(os.listdir(path)):
    if not filename.endswith('.csv'):
        continue
    # the with-statement closes the file; no explicit close() is required
    with open(os.path.join(path, filename)) as myFile:
        for num, line in enumerate(myFile, 1):
            # skip the two leading lines and any blank (e.g. trailing) line
            if num <= 2 or not line.strip():
                continue
            n += 1
            # rstrip instead of a '(.+)\n' regex: the last line of a file
            # may legitimately lack a trailing newline
            (query_genome, ref_genome,
             DDH1, CI1, Dist1, ProbDDH1,
             DDH2, CI2, Dist2, ProbDDH2,
             DDH3, CI3, Dist3, ProbDDH3,
             GC_difference) = line.rstrip('\n').split(',')
            query_genome = _strip_fasta_extension(query_genome)
            ref_genome = _strip_fasta_extension(ref_genome)
            mydict[n] = {
                'Query genome': query_genome, 'Reference genome': ref_genome,
                'DDH 1': DDH1, 'Model C.I. 1': CI1, 'Distance 1': Dist1, 'Prob. DDH >= 70% 1': ProbDDH1,
                'DDH 2': DDH2, 'Model C.I. 2': CI2, 'Distance 2': Dist2, 'Prob. DDH >= 70% 2': ProbDDH2,
                'DDH 3': DDH3, 'Model C.I. 3': CI3, 'Distance 3': Dist3, 'Prob. DDH >= 70% 3': ProbDDH3,
                'G+C difference': GC_difference,
            }
            genome_names.append(query_genome)

# Unique query genomes, in a deterministic (alphabetical) order
genome_names = sorted(set(genome_names))
# One row per parsed line, one column per field
df = pd.DataFrame.from_dict(mydict).T
df

In [None]:
# Keep only the genome pair and the DDH estimate of the second formula
# (the recommended one: http://ggdc.dsmz.de/faq.php#qggdc15).
# Columns that are absent from df are silently left out.
target_columns = ['Query genome','Reference genome','DDH 2']
df2 = df.reindex(columns=[name for name in target_columns if name in df.columns])
df2

In [None]:
# Square isDDH matrix: one column per query genome, one row per reference genome
isDDH = pd.DataFrame(columns = genome_names, index = genome_names)
newlist = df2.to_numpy().tolist()
for query, reference, ddh in newlist:
    # A single .loc assignment is required here: the chained form
    # isDDH[query][reference] = ... writes to a temporary object under pandas
    # Copy-on-Write and leaves the matrix untouched.
    isDDH.loc[reference, query] = np.float64(ddh)
# Each genome compared with itself is set to 100 on the diagonal
for i in isDDH.index:
    isDDH.loc[i, i] = np.int64(100)

# To modify one of the genome names (to make it shorter) and replace underscores by spaces
isDDH = isDDH.rename(columns={'Rosenbergiella_nectarea_IMG-taxon_2651870171_annotated_assembly': 'Rosenbergiella_nectarea_8N4'},
               index={'Rosenbergiella_nectarea_IMG-taxon_2651870171_annotated_assembly': 'Rosenbergiella_nectarea_8N4'})
isDDH.columns = isDDH.columns.str.replace('_', ' ')
isDDH.index = isDDH.index.str.replace('_', ' ')

# To reorder columns and rows by alphabetical order
isDDH = isDDH[sorted(isDDH.columns)].sort_index()

# To export the isDDH matrix as a .tsv file (row labels included)
isDDH.to_csv('/home/sergio/TFM1/reports/ggdc/isDDH.tsv', sep = "\t", index = True)
isDDH

In [None]:
# Annotation labels for the heatmap: np.triu keeps the diagonal and the upper
# triangle of isDDH and zeroes everything below it; the values are then turned
# into text and every '0' label is blanked, so that numbers are displayed only
# in the upper triangle.
mask = np.triu(isDDH).astype("U")
mask[mask == '0'] = ''

In [None]:
# To generate the isDDH heatmap
# The matrix is read back from the exported .tsv file; `mask` supplies the
# text written inside the cells (fmt="s" because the labels are strings).
df3 = pd.read_csv('/home/sergio/TFM1/reports/ggdc/isDDH.tsv', sep='\t', index_col=0)
fig, ax = plt.subplots(figsize=(6, 6))
colorbar_options = {
    "label": "% DDH",
    "location": "top",
    "use_gridspec": False,
    "shrink": 0.75,
}
g = sns.heatmap(
    df3,
    cmap="YlGnBu",
    annot=mask,
    fmt="s",
    square=True,
    cbar_kws=colorbar_options,
    ax=ax,
)
# Slanted, right-aligned x tick labels
g.set_xticklabels(g.get_xticklabels(), rotation=45, ha='right')
fig.savefig('/home/sergio/TFM1/reports/ggdc/isDDH_heatmap.pdf', bbox_inches='tight')