## Make Taxonomy Heatmap

In [19]:
from IPython.core.display import display
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
from os.path import abspath,join
taxonomy_folder = abspath("../output/taxonomy_summary/")
output_folder = abspath('../output/')



First let's define a function to make and output a heatmap graphic. We'll then run several versions for each taxonomic level.

In [20]:
def make_taxonomy_heatmap(input_path,output_path,raw_data_path, log2_transform = True,\
  log_scaling_0_replacement_value = -16, cmap = "mako", 
  row_cluster=False, col_cluster = False,\
  z_score = None, dpi=300,figsize = (480,120),fontsize=2):
    """Generate a taxonomy heatmap
    input_path -- a taxonomy csv file
    output_path -- the graph to output
    raw_data_path -- path to save raw data .tsv to
    log2_transform -- if True, log2 transform data (e.g. 50% => -2, 25% => -3, etc) and 
      replace 0 values (which cannot be log transformed) with a low number specified by log_scaling_0_replacement_value
    log_scaling_0_replacement_value -- what to replace 0 values with.
    cmap - colorscheme to use. Examples: "Blues","Blues_r","veridis",etc
    taxonomy_csv -- the path to the input taxonomy csv
    output_name -- the name of the output file
    z_score -- can be False (no Z-score normalization), 1 (normalize column by z-score), or 0 (normalize rows)
    standard_score -- can be False (no standard score normalization), 1 (normalize columns), or 0 (normalize rows)
    """
    
    #Load the data and set an index
    data = pd.read_csv(input_path)
    labels = pd.DataFrame(data.loc[:,"index"])
    data.rename(columns={"Unassigned;__":"d__Unassigned"},inplace=True)
    #Remove metadata columns by selecting only taxonomy columns
    data = data.loc[:, data.columns.str.startswith("d__")]
    
    #Add back on the feature labels
    data = labels.join(data,how="left")
    data.rename(columns={"index":"SampleID"},inplace=True)
    data.set_index("SampleID",inplace=True)
    
    #Since these data are unrarified, normalize by proportion in each sample
    data = data.div(data.sum(axis=1), axis=0)
    
    if log2_transform:
        #take the log of the dataframe, substituting 0's (which are undefined in log space)
        #hattip to stackoverflow: 
        #https://stackoverflow.com/questions/49207688/pandas-efficiently-avoid-0s-when-taking-log-of-cells-in-dataframe
        def get_log(df):
            return (df.mask(df == 0).applymap(math.log2).fillna(log_scaling_0_replacement_value))
        data = get_log(data)
        
        

    #Switch rows and colums
    data = data.transpose(copy=True)
    
    #Save exact copy of data used in figure
    #AFTER all relevant transformations have been applied 
    #(other than clustering and z_score if selected)
    print("Saving raw data to output_path:")
    data.to_csv(raw_data_path,sep="\t")
    
      
    #Make the graph
    plt.clf()
    graph = sns.clustermap(data = data,cmap=cmap,metric = "correlation",z_score=z_score,xticklabels=1,yticklabels=1,
      row_cluster=row_cluster,col_cluster=col_cluster)
    
 
    plt.setp(graph.ax_heatmap.get_yticklabels(), fontsize=fontsize)
    
    #Save result
    print("Saving file to output path:",output_path)
    graph.savefig(output_path,
            dpi=dpi, figsize=figsize)
    plt.clf()
    
    

In [21]:

    
levels_to_analyze = [2,3,4,5,7] 
for level in levels_to_analyze:
    print(f"Analyzing taxonomy at level {level}")
    current_file = f"level-{level}.csv"
    input_path = join(taxonomy_folder,current_file)
    
    #Use smaller font for more specific levels
    fontsize = round(14.0/level)
    
    #Output log transformed results
    
    #Set the cmap to a dark colorscheme which looks better with 
    #log data
    cmap = "mako"
    
    output_path = join(output_folder,f'heatmap_of_{current_file}_log2.jpg')
    raw_data_path = join(output_folder,f'raw_data_of_{current_file}_log2.tsv')
    make_taxonomy_heatmap(input_path,output_path,raw_data_path,log2_transform=True,fontsize=fontsize,cmap=cmap)
    
    output_path = join(output_folder,f'heatmap_of_{current_file}_log2_row_cluster.jpg')
    raw_data_path = join(output_folder,f'raw_data_of_{current_file}_log2_row_cluster.tsv')
    make_taxonomy_heatmap(input_path,output_path,raw_data_path,log2_transform=True,row_cluster=True,fontsize=fontsize,cmap=cmap)
   
    #Output raw clustermaps
    #Use a light colorscheme which makes it easier to see very small numbers as faint 'bands'
    cmap = "Blues"
    output_path = join(output_folder,f'heatmap_of_{current_file}.jpg')
    raw_data_path = join(output_folder,f'raw_data_of_{current_file}.tsv')
    make_taxonomy_heatmap(input_path,output_path,raw_data_path,log2_transform=False,fontsize=fontsize,cmap=cmap)
    
    output_path = join(output_folder,f'heatmap_of_{current_file}_row_cluster.jpg')
    raw_data_path = join(output_folder,f'raw_data_of_{current_file}_row_cluster.tsv')
    make_taxonomy_heatmap(input_path,output_path,raw_data_path,log2_transform=False,row_cluster=True,fontsize=fontsize,cmap=cmap)
    
    

Analyzing taxonomy at level 2
Saving raw data to output_path:
Saving file to output path: /Users/jzaneveld/Dropbox/Zaneveld_Lab_Organization/Projects/Padilla_Gamino_Disease/MWS/output/heatmap_of_level-2.csv_log2.jpg
Saving raw data to output_path:
Saving file to output path: /Users/jzaneveld/Dropbox/Zaneveld_Lab_Organization/Projects/Padilla_Gamino_Disease/MWS/output/heatmap_of_level-2.csv_log2_row_cluster.jpg
Saving raw data to output_path:
Saving file to output path: /Users/jzaneveld/Dropbox/Zaneveld_Lab_Organization/Projects/Padilla_Gamino_Disease/MWS/output/heatmap_of_level-2.csv.jpg
Saving raw data to output_path:
Saving file to output path: /Users/jzaneveld/Dropbox/Zaneveld_Lab_Organization/Projects/Padilla_Gamino_Disease/MWS/output/heatmap_of_level-2.csv_row_cluster.jpg
Analyzing taxonomy at level 3
Saving raw data to output_path:
Saving file to output path: /Users/jzaneveld/Dropbox/Zaneveld_Lab_Organization/Projects/Padilla_Gamino_Disease/MWS/output/heatmap_of_level-3.csv_log2.

<Figure size 432x288 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>