## This notebook breaks down classification systems by number of molecules in each group

In [1]:
from pathlib import Path
from definitions import ROOT_DIR
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

- Inputs

In [7]:
# All paths are resolved relative to the parent of `definitions.ROOT_DIR`.
p_root_dir = Path(ROOT_DIR).parent

# Output folder for the figures saved by this notebook.
# The components are joined with `/` instead of a literal backslash so the
# path is valid on every OS ("\q" is also an invalid string escape sequence).
p_analysis = p_root_dir / "6_plots" / "q1_plots"

# standards
p_compounds = p_root_dir / "5_data_analysis/compounds_ids.csv"                # compound ids and short names
p_chem_class = p_root_dir / "5_data_analysis/custom_classification.csv"       # custom chemical classification
p_alt_class = p_root_dir / "5_data_analysis/alternative_classification.csv"   # alternative classification systems
p_pathways = p_root_dir / "5_data_analysis/pathways.csv"                      # pathway annotation

- Load and merge class data

In [9]:
# Compound table: drop rows whose hmdb_primary is 'custom' and keep only the short name
compounds = pd.read_csv(p_compounds, index_col='internal_id')
compounds = compounds.loc[compounds.hmdb_primary != 'custom', ['name_short']]

# Annotation tables, each indexed by internal_id
chem_class = pd.read_csv(p_chem_class, index_col='internal_id')
alt_class = pd.read_csv(p_alt_class, index_col='internal_id')
pathways = pd.read_csv(p_pathways, index_col='internal_id')

# Left-join every annotation table onto the compound list by index.
# Their own name_short column is dropped first so it is not duplicated.
df = compounds
for annotation in (chem_class, alt_class, pathways):
    df = df.merge(
        annotation.drop(columns='name_short'),
        how='left',
        left_index=True,
        right_index=True,
    )

- Sort rows so that they organise correctly in the plot:

Coarse classes are sorted alphabetically; fine classes are sorted by the number of molecules they contain

In [10]:
# Helper column: one per compound, counted per group below
# (the pathway cell further down aggregates on it as well)
df['n_molecules'] = 1

# The merges above are left joins, so a compound can end up without a class.
# Such rows cannot be ordered; stop with a readable message instead of a
# cryptic sorting/lookup error further down.
missing_class = df[['coarse_class', 'fine_class']].isna().any(axis=1)
if missing_class.any():
    raise ValueError(
        "Cannot order compounds without a chemical class: "
        f"{df.index[missing_class].tolist()}"
    )

# One row per fine class: its coarse class and the number of molecules in it
grouped_df = df.groupby(['fine_class']).agg({
    'coarse_class':'first',
    'n_molecules':'count'
})

# Sort coarse classes in alphabetical order
coarse_classes = np.sort(df.coarse_class.unique())

# Within each coarse class, fine classes by decreasing number of molecules
custom_order = [grouped_df[grouped_df.coarse_class == x].sort_values('n_molecules', ascending=False).index.values for x in coarse_classes]
custom_order = list(np.concatenate(custom_order)) #convert to list

# Get the correct row order for ungrouped dataframe.
# A dict gives constant-time lookups instead of scanning the list for every row.
order_lookup = {fine_class: position for position, fine_class in enumerate(custom_order)}
order_column = [order_lookup[x] for x in df.fine_class]
df['order'] = order_column

# Reorder rows of dataframe (descending, because coord_flip draws the first
# category at the bottom of the plot)
df_sorted = df.sort_values('order', ascending = False)

# Convert class column values to categorical
# Without this step plotnine will ignore row order in the dataframe
df_sorted['fine_class'] = pd.Categorical(df_sorted.fine_class, categories=pd.unique(df_sorted.fine_class))

- Plot 

In [11]:
# Horizontal bar chart: one bar per fine class (bar length = number of rows),
# filled according to the coarse class it belongs to
p = (
    ggplot(df_sorted, aes(x='fine_class'))
    + geom_bar(aes(fill='coarse_class'))
    + coord_flip()
    + labs(x="Chemical subclass", y="Number of compounds", fill="Chemical class")
    + scale_fill_brewer(type="qual", palette="Dark2")
)

# Write the figure to the output folder
p.save(p_analysis / "molecules_per_chemical_class.png")



- Repeat for pathway plot

In [12]:
# The pathway table was left-joined, so a compound can end up without a pathway.
# Such rows cannot be ordered; stop with a readable message instead of a
# cryptic sorting/lookup error further down.
missing_path = df[['coarse_path', 'fine_path']].isna().any(axis=1)
if missing_path.any():
    raise ValueError(
        "Cannot order compounds without a pathway annotation: "
        f"{df.index[missing_path].tolist()}"
    )

# One row per fine pathway: its coarse pathway and the number of molecules in it
# (relies on the n_molecules helper column added to df in the chemical-class cell)
grouped_df = df.groupby(['fine_path']).agg({
    'coarse_path':'first',
    'n_molecules':'count'
})

# Sort coarse pathways in alphabetical order
coarse_paths = np.sort(df.coarse_path.unique())

# Within each coarse pathway, fine pathways by decreasing number of molecules
custom_order = [grouped_df[grouped_df.coarse_path == x].sort_values('n_molecules', ascending=False).index.values for x in coarse_paths]
custom_order = list(np.concatenate(custom_order)) #convert to list

# Get the correct row order for ungrouped dataframe.
# A dict gives constant-time lookups instead of scanning the list for every row.
order_lookup = {fine_path: position for position, fine_path in enumerate(custom_order)}
order_column = [order_lookup[x] for x in df.fine_path]
df['order'] = order_column

# Reorder rows of dataframe (descending, because coord_flip draws the first
# category at the bottom of the plot)
df_sorted = df.sort_values('order', ascending = False)

# Convert path column values to categorical
# Without this step plotnine will ignore row order in the dataframe
df_sorted['fine_path'] = pd.Categorical(df_sorted.fine_path, categories=pd.unique(df_sorted.fine_path))

# Plot: one horizontal bar per fine pathway, filled by its coarse pathway
p = (ggplot(df_sorted)        
 + aes(x='fine_path')   
 + geom_bar(aes(fill = 'coarse_path')) 
 + coord_flip()
 + labs(x = "Metabolic pathway", y = "Number of compounds", fill = "Metabolic cluster")
 + scale_fill_brewer(type="seq", palette="YlGnBu")
)

p.save(p_analysis / "molecules_per_pathway.png")



- Make simple bar charts for alternative classification systems

In [19]:
# Columns holding the alternative classification systems
alt_class_columns = ('morgan_class', 'hmdb_func_group_class', 'property_class')

# One plain horizontal count chart per classification column,
# saved under a file name derived from the column name
for cat in alt_class_columns:
    p = (
        ggplot(df, aes(x=cat))
        + geom_bar()
        + coord_flip()
        + labs(x=cat, y="Number of compounds")
    )
    p.save(p_analysis / f"molecules_per_{cat}.png")

