## This notebook breaks down classification systems by number of molecules in each group

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from plotnine import *
import matplotlib.pyplot as plt
from matplotlib import rc

# Global matplotlib text settings: sans-serif family resolved to Arial, 16 pt
font_settings = {
    'family': 'sans-serif',
    'sans-serif': ['Arial'],
    'size': 16,
}
rc('font', **font_settings)

# Use font type 42 (TrueType, per the matplotlib rcParams docs) for both
# vector backends, PDF and PostScript
for backend in ('pdf', 'ps'):
    rc(backend, fonttype=42)

- Inputs

In [2]:
# Project root: two directory levels above the current working directory
p_root_dir = Path.cwd().parents[1]

# Output directory for the figures written by this notebook.
# The components are joined with "/" instead of being embedded in a single
# backslash-separated string: in a normal string literal "\f" is a form-feed
# escape (and "\c" an invalid escape), which corrupted the directory name,
# and backslashes are not path separators on POSIX systems.
p_analysis = p_root_dir / "6_figures" / "figure_2" / "classification_bar_charts"

# Compound metadata
p_compounds = p_root_dir / "5_data/compounds_ids.csv"
p_chem_class = p_root_dir / "5_data/custom_classification_v2.csv"
p_pathways = p_root_dir / "5_data/pathways_v2.csv"

- Load and merge class data

In [3]:
# Compound table indexed by internal_id; rows whose hmdb_primary value is
# 'custom' are excluded and only the name_short column is kept
compounds = pd.read_csv(p_compounds, index_col='internal_id')
compounds = compounds.loc[compounds['hmdb_primary'] != 'custom', ['name_short']]

# Annotation tables, indexed the same way
chem_class = pd.read_csv(p_chem_class, index_col='internal_id')
pathways = pd.read_csv(p_pathways, index_col='internal_id')

# Left-join each annotation table onto the compounds via the shared index.
# The annotation tables' own name_short column is dropped first so the merged
# frame carries a single name_short column (the one from `compounds`).
class_data = compounds.merge(
    chem_class.drop(columns='name_short'),
    how='left',
    left_index=True,
    right_index=True,
)
pathway_data = compounds.merge(
    pathways.drop(columns='name_short'),
    how='left',
    left_index=True,
    right_index=True,
)

- Sort rows so that they organise correctly in the plot:

Coarse classes are ordered alphabetically; within each coarse class, fine classes are ordered by the number of molecules they contain

In [4]:
# Give every compound a unit column so rows can be counted per fine class
class_data['n_molecules'] = 1

# One row per fine class: its first coarse_class value and its row count
aggregations = {'coarse_class': 'first', 'n_molecules': 'count'}
grouped_df = class_data.groupby(['fine_class']).agg(aggregations)

# Coarse classes in alphabetical order
coarse_classes = np.sort(class_data.coarse_class.unique())

# Within each coarse class, list its fine classes from largest to smallest
per_coarse_order = []
for coarse in coarse_classes:
    members = grouped_df[grouped_df.coarse_class == coarse]
    ranked = members.sort_values('n_molecules', ascending=False)
    per_coarse_order.append(ranked.index.values)
custom_order = list(np.concatenate(per_coarse_order))  # flatten to a plain list

# Position of each row's fine class within that overall ordering
class_data['order'] = list(map(custom_order.index, class_data.fine_class))

# Reorder the ungrouped rows by descending position
df_sorted = class_data.sort_values('order', ascending=False)

# Make fine_class categorical, with categories in order of first appearance.
# Without this step plotnine will ignore row order in the dataframe.
df_sorted['fine_class'] = pd.Categorical(
    df_sorted['fine_class'],
    categories=df_sorted['fine_class'].unique(),
)

- Plot 

In [5]:
# Fill colour for each coarse chemical class
# (Paul Tol palette for colour blindness)
colours = {
    "Amines": '#DDCC77',                                   # yellow
    "Amino acids, peptides, and analogues": '#332288',     # blue
    "Carbohydrates": '#117733',                            # green
    "Carboxylic acids": '#44AA99',                         # emerald
    "Lipids and lipid-like molecules": '#AA4499',          # magenta
    "Nucleosides, nucleotides, and analogues": '#CC6677',  # light red
    "Vitamins and cofactors": '#88CCEE',                   # light blue
}

In [6]:
# Horizontal bar chart: one bar per fine class (bar length = number of rows),
# filled by the coarse class it belongs to
p = (
    ggplot(df_sorted, aes(x='fine_class'))
    + geom_bar(aes(fill='coarse_class'))
    + coord_flip()
    + labs(x="Chemical subclass", y="Number of compounds", fill="Chemical class")
    + scale_fill_manual(colours)  # custom palette defined in `colours`
    + theme_classic()
    + theme(
        aspect_ratio=1.7,
        text=element_text(family='sans-serif', size=16),
        title=element_text(family='sans-serif', size=18),
        legend_position=(0.22, -0.02),
    )
    + guides(fill=guide_legend(ncol=2))
)

# Write the figure to the analysis directory as both PNG and PDF
png_path = p_analysis / "molecules_per_chemical_class.png"
pdf_path = p_analysis / "molecules_per_chemical_class.pdf"
p.save(png_path)
save_as_pdf_pages([p], pdf_path)



- Repeat for pathway plot

In [7]:
# Give every compound a unit column so rows can be counted per fine pathway
pathway_data['n_molecules'] = 1

# One row per fine pathway: its first coarse_path value and its row count
aggregations = {'coarse_path': 'first', 'n_molecules': 'count'}
grouped_df = pathway_data.groupby(['fine_path']).agg(aggregations)

# Coarse pathways in alphabetical order
coarse_paths = np.sort(pathway_data.coarse_path.unique())

# Within each coarse pathway, list its fine pathways from largest to smallest
per_coarse_order = []
for coarse in coarse_paths:
    members = grouped_df[grouped_df.coarse_path == coarse]
    ranked = members.sort_values('n_molecules', ascending=False)
    per_coarse_order.append(ranked.index.values)
custom_order = list(np.concatenate(per_coarse_order))  # flatten to a plain list

# Position of each row's fine pathway within that overall ordering
pathway_data['order'] = list(map(custom_order.index, pathway_data.fine_path))

# Reorder the ungrouped rows by descending position
df_sorted = pathway_data.sort_values('order', ascending=False)

# Make fine_path categorical, with categories in order of first appearance
df_sorted['fine_path'] = pd.Categorical(
    df_sorted['fine_path'],
    categories=df_sorted['fine_path'].unique(),
)

# Horizontal bar chart: one bar per fine pathway (bar length = number of rows),
# filled by the coarse pathway it belongs to
p = (
    ggplot(df_sorted, aes(x='fine_path'))
    + geom_bar(aes(fill='coarse_path'))
    + coord_flip()
    + labs(x="Metabolic pathway", y="Number of compounds", fill=" ")
    # Sequential grey palette with direction=-1 (RdPu was noted as an alternative)
    + scale_fill_brewer(type="seq", palette="Greys", direction=-1)
    + theme_classic()
    + theme(
        aspect_ratio=1.96,
        text=element_text(family='sans-serif', size=16),
        title=element_text(family='sans-serif', size=18),
        legend_position=(0.12, 0.01),
    )
    + guides(fill=guide_legend(ncol=2))
)

# Write the figure to the analysis directory as both PNG and PDF
png_path = p_analysis / "molecules_per_pathway.png"
pdf_path = p_analysis / "molecules_per_pathway.pdf"
p.save(png_path)
save_as_pdf_pages([p], pdf_path)

