In [2]:
from molmap import loadmap
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model
from sklearn.metrics import mean_squared_error, log_loss
import warnings, os

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from copy import copy
from tqdm import tqdm


warnings.filterwarnings("ignore")

from matplotlib.ticker import FormatStrFormatter
#gfc = gfc.rename(columns = {'AUS':"AUS", 'CHI':'CHN', 'FRA':'FRA', 'GER':'DEU', 'USA':'USA'})

sns.set(style='white',  font='sans-serif', font_scale=2)  

In [158]:
def _add_q_level(p):
    '''
    ***P ≤ 0.0001, ***0.0001 < P ≤ 0.001, **: 0.001 <P ≤ 0.01, 
    *: 0.01 < P ≤ 0.05, not significant (ns): 0.05 < P ≤ 1.
    '''
    if p <= 1e-4:
        return '****'
    
    elif 1e-4 < p <= 1e-3:
        return '***'    
    
    elif 1e-3 < p <= 1e-2:
        return '**'
    
    elif 1e-2 < p <= 0.05:
        return '*'    
    
    elif  p > 0.05:
        return 'ns'
    
def _add_fc_level(fc):
    if fc < 0:
        return '↓' 
    elif fc>0:
        return '↑'
    elif fc==0:
        return '0'
    
    
# def _add_fc_level2(fc):
#     if 1 >= fc > 0:
#         return '↑' 
#     elif 2 >= fc > 1:
#         return '↑↑'
    
#     elif 3 >= fc > 2:
#         return '↑↑↑'   
    
#     elif 4 >= fc > 3:
#         return '↑↑↑↑'
    
#     elif fc > 4:
#         return '↑↑↑↑↑'
    
#     if 0 >= fc > -1:
#         return '↓' 
#     elif -1 >= fc > -2 :
#         return '↓↓'
    
#     elif -2 >= fc > -3:
#         return '↓↓↓'   
    
#     elif -3 >= fc > -4:
#         return '↓↓↓↓'
    
#     elif fc < -4:
#         return '↓↓↓↓↓'
    
    
def _add_fc_level2(fc):
    
    if fc > 0:
        if fc <= 0.5 :
            return '~↑'     

        elif 1.5 >= fc > 0.5:
            return '↑' 

        elif 2.5 >= fc > 1.5:
            return '↑↑'

        elif 3.5 >= fc > 2.5:
            return '↑↑↑'   

        elif 4.5 >= fc > 3.5:
            return '↑↑↑↑'

        elif fc > 4.5:
            return '↑↑↑↑↑'
    elif fc == 0.0 :
        return '~'
    
    else:
        if fc >=-0.5 :
            return '~↓'     
        
        elif -0.5 >= fc > -1.5 :
            return '↓'   

        elif -1.5 >= fc > -2.5 :
            return '↓↓'

        elif -2.5 >= fc > -3.5:
            return '↓↓↓'   

        elif -3.5 >= fc > -4.5:
            return '↓↓↓↓'

        elif fc < -4.5:
            return '↓↓↓↓↓'

    
def _get_fc(df):
    return df.apply(lambda x:x.unique()).apply(lambda x:np.mean(x))

In [159]:
task = 'Cirrhosis'
data_path = '../01_data/species_level/%s/' % (task)

# abundance.tsv is read with its first column as the index; after the
# transpose each row lines up positionally with one row of labels.txt and
# each column is named by a '|'-separated taxonomy string.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'), sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'), sep='\t', header=None)
dfy.columns = ['group']
dfx = dfa.T.reset_index(drop=True)
# The 1e-8 offset keeps the logarithm finite for zero entries.
dfx = np.log(dfx + 1e-8)

a = dfx.join(dfy)
from scikit_posthocs import posthoc_wilcoxon, posthoc_mannwhitney
from matplotlib.colors import LogNorm, Normalize

## calculate q-values
# One pairwise Mann-Whitney call per feature; the value kept is the
# (first group, second group) entry of the returned matrix.
# NOTE(review): p_adjust='fdr_bh' is passed to each per-feature call, so the
# correction presumably spans only the group pairs of that one feature and
# not the set of features - confirm this is the intended multiple-testing
# correction.
q = []
for col in dfx.columns:
    res = posthoc_mannwhitney(a=a, val_col=col, group_col='group', p_adjust='fdr_bh')  # , use_continuity=False
    # Purely positional lookup. `res.iloc[0][1]` relied on an integer key
    # being treated as a position on a label-indexed Series, which is
    # deprecated in recent pandas.
    adj_p = res.iloc[0, 1]
    q.append(adj_p)
dfq = pd.Series(q, index=dfx.columns).to_frame(name='q')

## calculate generalized fold change
# Despite its name, `median` holds what _get_fc returns per group: the mean
# of the distinct log-abundance values of every feature.
median = a.groupby('group').apply(_get_fc)
fc = median.loc['cirrhosis'] - median.loc['n']
dffc = fc.to_frame(name='gfc')


## obtain info
# Split each feature name 'k__..|p__..|..' into {rank prefix: taxon} and
# expand it into one column per rank.
df1 = pd.Series(dffc.index, index=dffc.index).apply(
    lambda name: dict([part.split('__') for part in name.split('|')])
).apply(pd.Series)
level_dict = {'k':'kingdom', 'p':'phylum', 'c':'class' ,'o':'order' ,'f':'family' ,'g': 'genus','s': 'species'}
df1 = df1.rename(columns=level_dict)
dfgs = df1[['genus', 'species']]


# Join the feature-importance table (keyed by its 'v' column) with the
# fold change, q-value and taxonomy, ordered by decreasing importance.
dfg = pd.read_csv('./Cirrhosis_results/feature_imp_results.csv', index_col=0)
df_gfc_q = dffc.join(dfq).join(dfgs)
dff = dfg.set_index('v').join(df_gfc_q).sort_values('avg_imp', ascending=False)[[ 'genus', 'species', 'x', 'y', 'avg_imp','q', 'gfc']]
dff.avg_imp = dff.avg_imp.round(1)
dff['q_l'] = dff.q.apply(_add_q_level)
dff['fc_l'] = dff.gfc.apply(_add_fc_level)
dff['fc_l2'] = dff.gfc.apply(_add_fc_level2)
dff['fc(q)'] = dff.fc_l2 +'('+ dff.q_l + ')'
dff.to_excel('./Cirrhosis_results/GFI_final_%s.xlsx' % task)

# Ranked view of the 20 most important features.
x = dff.head(20)[['species', 'x', 'y', 'avg_imp','fc(q)']].reset_index(drop=True)
x['rank'] = x.index+1
x[['species', 'x', 'y', 'avg_imp', 'rank','fc(q)']]

Unnamed: 0,species,x,y,avg_imp,rank,fc(q)
0,Clostridium_symbiosum,1,8,7.5,1,↑(****)
1,Haemophilus_parainfluenzae,3,0,7.1,2,↑↑(****)
2,Veillonella_dispar,3,1,4.7,3,↑↑↑(****)
3,Bacteroides_dorei,3,12,4.5,4,↓(**)
4,Bacteroides_coprocola,3,16,4.3,5,↓(*)
5,Veillonella_unclassified,2,0,4.2,6,↑↑↑↑(****)
6,Clostridium_citroniae,2,10,4.1,7,~↑(ns)
7,Ruminococcus_gnavus,1,5,3.7,8,↑(****)
8,Veillonella_parvula,1,0,3.4,9,↑↑↑(****)
9,Bacteroides_ovatus,1,11,3.2,10,~↓(ns)


In [165]:
task = 'IBD'
data_path = '../01_data/species_level/%s/' % (task)

# abundance.tsv is read with its first column as the index; after the
# transpose each row lines up positionally with one row of labels.txt and
# each column is named by a '|'-separated taxonomy string.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'), sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'), sep='\t', header=None)
dfy.columns = ['group']
dfx = dfa.T.reset_index(drop=True)
# The 1e-8 offset keeps the logarithm finite for zero entries.
dfx = np.log(dfx + 1e-8)

a = dfx.join(dfy)
from scikit_posthocs import posthoc_wilcoxon, posthoc_mannwhitney
from matplotlib.colors import LogNorm, Normalize

## calculate q-values
# One pairwise Mann-Whitney call per feature; the value kept is the
# (first group, second group) entry of the returned matrix.
# NOTE(review): p_adjust='fdr_bh' is passed to each per-feature call, so the
# correction presumably spans only the group pairs of that one feature and
# not the set of features - confirm this is the intended multiple-testing
# correction.
q = []
for col in dfx.columns:
    res = posthoc_mannwhitney(a=a, val_col=col, group_col='group', p_adjust='fdr_bh')  # , use_continuity=False
    # Purely positional lookup. `res.iloc[0][1]` relied on an integer key
    # being treated as a position on a label-indexed Series, which is
    # deprecated in recent pandas.
    adj_p = res.iloc[0, 1]
    q.append(adj_p)
dfq = pd.Series(q, index=dfx.columns).to_frame(name='q')

## calculate generalized fold change
# Despite its name, `median` holds what _get_fc returns per group: the mean
# of the distinct log-abundance values of every feature.
median = a.groupby('group').apply(_get_fc)
fc = median.loc['ibd'] - median.loc['n']
dffc = fc.to_frame(name='gfc')


## obtain info
# Split each feature name 'k__..|p__..|..' into {rank prefix: taxon} and
# expand it into one column per rank.
df1 = pd.Series(dffc.index, index=dffc.index).apply(
    lambda name: dict([part.split('__') for part in name.split('|')])
).apply(pd.Series)
level_dict = {'k':'kingdom', 'p':'phylum', 'c':'class' ,'o':'order' ,'f':'family' ,'g': 'genus','s': 'species'}
df1 = df1.rename(columns=level_dict)
dfgs = df1[['genus', 'species']]


# Join the feature-importance table (keyed by its 'v' column) with the
# fold change, q-value and taxonomy, ordered by decreasing importance.
dfg = pd.read_csv('./IBD_results/feature_imp_results.csv', index_col=0)
df_gfc_q = dffc.join(dfq).join(dfgs)
dff = dfg.set_index('v').join(df_gfc_q).sort_values('avg_imp', ascending=False)[[ 'genus', 'species', 'x', 'y', 'avg_imp','q', 'gfc']]
dff.avg_imp = dff.avg_imp.round(1)
dff['q_l'] = dff.q.apply(_add_q_level)
dff['fc_l'] = dff.gfc.apply(_add_fc_level)
dff['fc_l2'] = dff.gfc.apply(_add_fc_level2)
dff['fc(q)'] = dff.fc_l2 +'('+ dff.q_l + ')'
dff.to_excel('./IBD_results/GFI_final_%s.xlsx' % task)

# Ranked view of the 20 most important features, indexed by species name.
x = dff.head(20)[['species', 'x', 'y', 'avg_imp','fc(q)']].reset_index(drop=True)
x['rank'] = x.index+1
x[['species', 'x', 'y', 'avg_imp', 'rank','fc(q)']].set_index('species')

Unnamed: 0_level_0,x,y,avg_imp,rank,fc(q)
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alistipes_finegoldii,7,10,10.3,1,↓↓↓(****)
Coprococcus_sp_ART55_1,7,6,8.7,2,↓↓↓↓↓(***)
Ruminococcus_lactaris,8,10,7.2,3,↓↓↓(***)
Acidaminococcus_unclassified,5,2,4.6,4,~↑(*)
Subdoligranulum_unclassified,6,6,4.6,5,↓↓(**)
Odoribacter_splanchnicus,6,11,3.6,6,↓(**)
Eubacterium_ventriosum,6,7,3.4,7,↓(**)
Butyrivibrio_crossotus,9,4,3.0,8,↓↓↓↓↓(***)
Eubacterium_rectale,5,7,2.7,9,↓(ns)
Akkermansia_muciniphila,7,9,2.6,10,↓↓(****)


In [166]:
task = 'T2D'
data_path = '../01_data/species_level/%s/' % (task)

# abundance.tsv is read with its first column as the index; after the
# transpose each row lines up positionally with one row of labels.txt and
# each column is named by a '|'-separated taxonomy string.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'), sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'), sep='\t', header=None)
dfy.columns = ['group']
dfx = dfa.T.reset_index(drop=True)
# The 1e-8 offset keeps the logarithm finite for zero entries.
dfx = np.log(dfx + 1e-8)

a = dfx.join(dfy)
from scikit_posthocs import posthoc_wilcoxon, posthoc_mannwhitney
from matplotlib.colors import LogNorm, Normalize

## calculate q-values
# One pairwise Mann-Whitney call per feature; the value kept is the
# (first group, second group) entry of the returned matrix.
# NOTE(review): p_adjust='fdr_bh' is passed to each per-feature call, so the
# correction presumably spans only the group pairs of that one feature and
# not the set of features - confirm this is the intended multiple-testing
# correction.
q = []
for col in dfx.columns:
    res = posthoc_mannwhitney(a=a, val_col=col, group_col='group', p_adjust='fdr_bh')  # , use_continuity=False
    # Purely positional lookup. `res.iloc[0][1]` relied on an integer key
    # being treated as a position on a label-indexed Series, which is
    # deprecated in recent pandas.
    adj_p = res.iloc[0, 1]
    q.append(adj_p)
dfq = pd.Series(q, index=dfx.columns).to_frame(name='q')

## calculate generalized fold change
# Despite its name, `median` holds what _get_fc returns per group: the mean
# of the distinct log-abundance values of every feature.
median = a.groupby('group').apply(_get_fc)
fc = median.loc['t2d'] - median.loc['n']
dffc = fc.to_frame(name='gfc')


## obtain info
# Split each feature name 'k__..|p__..|..' into {rank prefix: taxon} and
# expand it into one column per rank.
df1 = pd.Series(dffc.index, index=dffc.index).apply(
    lambda name: dict([part.split('__') for part in name.split('|')])
).apply(pd.Series)
level_dict = {'k':'kingdom', 'p':'phylum', 'c':'class' ,'o':'order' ,'f':'family' ,'g': 'genus','s': 'species'}
df1 = df1.rename(columns=level_dict)
dfgs = df1[['genus', 'species']]


# Join the feature-importance table (keyed by its 'v' column) with the
# fold change, q-value and taxonomy, ordered by decreasing importance.
dfg = pd.read_csv('./T2D_results/feature_imp_results.csv', index_col=0)
df_gfc_q = dffc.join(dfq).join(dfgs)
dff = dfg.set_index('v').join(df_gfc_q).sort_values('avg_imp', ascending=False)[[ 'genus', 'species', 'x', 'y', 'avg_imp','q', 'gfc']]
dff.avg_imp = dff.avg_imp.round(1)
dff['q_l'] = dff.q.apply(_add_q_level)
dff['fc_l'] = dff.gfc.apply(_add_fc_level)
dff['fc_l2'] = dff.gfc.apply(_add_fc_level2)
dff['fc(q)'] = dff.fc_l2 +'('+ dff.q_l + ')'
dff.to_excel('./T2D_results/GFI_final_%s.xlsx' % task)

# Ranked view of the 20 most important features, indexed by species name.
x = dff.head(20)[['species', 'x', 'y', 'avg_imp','fc(q)']].reset_index(drop=True)
x['rank'] = x.index+1
x[['species', 'x', 'y', 'avg_imp', 'rank','fc(q)']].set_index('species')

Unnamed: 0_level_0,x,y,avg_imp,rank,fc(q)
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Butyrivibrio_unclassified,5,19,12.7,1,~↓(***)
Bacteroides_vulgatus,15,21,5.4,2,~↓(**)
Lachnospiraceae_bacterium_1_1_57FAA,12,20,5.1,3,~↓(ns)
Bacteroidales_bacterium_ph8,8,21,3.7,4,~↓(ns)
Adlercreutzia_equolifaciens,4,22,3.6,5,~↓(ns)
Lachnospiraceae_bacterium_3_1_46FAA,4,21,3.2,6,~↓(ns)
Bacteroides_plebeius,13,19,2.9,7,↓(ns)
Clostridium_bartlettii,1,16,2.6,8,~↑(ns)
Prevotella_copri,14,19,2.6,9,~↓(ns)
Faecalibacterium_prausnitzii,2,23,2.5,10,↓(***)


In [167]:
    
def _get_fc(df):
    return df.apply(lambda x:x.unique()).apply(lambda x:np.mean(x))

task = 'Obesity'
data_path = '../01_data/species_level/%s/' % (task)

# abundance.tsv is read with its first column as the index; after the
# transpose each row lines up positionally with one row of labels.txt and
# each column is named by a '|'-separated taxonomy string.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'), sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'), sep='\t', header=None)
dfy.columns = ['group']
dfx = dfa.T.reset_index(drop=True)
# The 1e-8 offset keeps the logarithm finite for zero entries.
dfx = np.log(dfx + 1e-8)

a = dfx.join(dfy)
from scikit_posthocs import posthoc_wilcoxon, posthoc_mannwhitney
from matplotlib.colors import LogNorm, Normalize

## calculate q-values
# One pairwise Mann-Whitney call per feature; the value kept is the
# (first group, second group) entry of the returned matrix.
# NOTE(review): p_adjust='fdr_bh' is passed to each per-feature call, so the
# correction presumably spans only the group pairs of that one feature and
# not the set of features - confirm this is the intended multiple-testing
# correction.
q = []
for col in dfx.columns:
    res = posthoc_mannwhitney(a=a, val_col=col, group_col='group', p_adjust='fdr_bh')  # , use_continuity=False
    # Purely positional lookup. `res.iloc[0][1]` relied on an integer key
    # being treated as a position on a label-indexed Series, which is
    # deprecated in recent pandas.
    adj_p = res.iloc[0, 1]
    q.append(adj_p)
dfq = pd.Series(q, index=dfx.columns).to_frame(name='q')

## calculate generalized fold change
# Despite its name, `median` holds what _get_fc returns per group: the mean
# of the distinct log-abundance values of every feature.
median = a.groupby('group').apply(_get_fc)
fc = median.loc['obesity'] - median.loc['leaness']
dffc = fc.to_frame(name='gfc')


## obtain info
# Split each feature name 'k__..|p__..|..' into {rank prefix: taxon} and
# expand it into one column per rank.
df1 = pd.Series(dffc.index, index=dffc.index).apply(
    lambda name: dict([part.split('__') for part in name.split('|')])
).apply(pd.Series)
level_dict = {'k':'kingdom', 'p':'phylum', 'c':'class' ,'o':'order' ,'f':'family' ,'g': 'genus','s': 'species'}
df1 = df1.rename(columns=level_dict)
dfgs = df1[['genus', 'species']]


# Join the feature-importance table (keyed by its 'v' column) with the
# fold change, q-value and taxonomy, ordered by decreasing importance.
dfg = pd.read_csv('./Obesity_results/feature_imp_results.csv', index_col=0)
df_gfc_q = dffc.join(dfq).join(dfgs)
dff = dfg.set_index('v').join(df_gfc_q).sort_values('avg_imp', ascending=False)[[ 'genus', 'species', 'x', 'y', 'avg_imp','q', 'gfc']]
dff.avg_imp = dff.avg_imp.round(1)
dff['q_l'] = dff.q.apply(_add_q_level)
dff['fc_l'] = dff.gfc.apply(_add_fc_level)
dff['fc_l2'] = dff.gfc.apply(_add_fc_level2)
dff['fc(q)'] = dff.fc_l2 +'('+ dff.q_l + ')'
dff.to_excel('./Obesity_results/GFI_final_%s.xlsx' % task)

# Ranked view of the 20 most important features, indexed by species name.
x = dff.head(20)[['species', 'x', 'y', 'avg_imp','fc(q)']].reset_index(drop=True)
x['rank'] = x.index+1
x[['species', 'x', 'y', 'avg_imp', 'rank','fc(q)']].set_index('species')

Unnamed: 0_level_0,x,y,avg_imp,rank,fc(q)
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ruminococcus_bromii,3,16,16.1,1,↑(*)
Oxalobacter_formigenes,5,15,7.0,2,~↓(*)
Butyrivibrio_crossotus,7,14,5.6,3,~↓(*)
Lachnospiraceae_bacterium_8_1_57FAA,8,15,3.7,4,~↑(**)
Sutterella_wadsworthensis,5,12,2.6,5,↑(ns)
Clostridium_sp_L2_50,7,15,2.6,6,~↑(*)
Bacteroides_pectinophilus,6,15,2.2,7,↓(*)
Akkermansia_muciniphila,5,14,2.2,8,↓(*)
Ruminococcus_lactaris,2,20,2.0,9,~↓(**)
Alistipes_finegoldii,4,11,1.8,10,~↑(ns)


In [169]:

task = 'CRC'
data_path = '../01_data/species_level/%s/' % (task)

# abundance.tsv is read with its first column as the index; after the
# transpose each row lines up positionally with one row of labels.txt and
# each column is named by a '|'-separated taxonomy string.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'), sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'), sep='\t', header=None)
dfy.columns = ['group']
dfx = dfa.T.reset_index(drop=True)
# The 1e-8 offset keeps the logarithm finite for zero entries.
dfx = np.log(dfx + 1e-8)

a = dfx.join(dfy)
from scikit_posthocs import posthoc_wilcoxon, posthoc_mannwhitney
from matplotlib.colors import LogNorm, Normalize

# Drop the 'adenoma' samples from the joined frame so only the remaining
# groups are compared; `dfx` and `dfy` themselves are left unfiltered.
a = a[a.group != 'adenoma']

## calculate q-values
# One pairwise Mann-Whitney call per feature; the value kept is the
# (first group, second group) entry of the returned matrix.
# NOTE(review): p_adjust='fdr_bh' is passed to each per-feature call, so the
# correction presumably spans only the group pairs of that one feature and
# not the set of features - confirm this is the intended multiple-testing
# correction.
q = []
for col in dfx.columns:
    try:
        res = posthoc_mannwhitney(a=a, val_col=col, group_col='group', p_adjust='fdr_bh')  # , use_continuity=False
        # Purely positional lookup. `res.iloc[0][1]` relied on an integer
        # key being treated as a position on a label-indexed Series, which
        # is deprecated in recent pandas.
        adj_p = res.iloc[0, 1]
    except Exception:
        # Best effort: a feature whose test cannot be computed gets q = 1.
        # `Exception` instead of a bare `except` so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        adj_p = 1
    q.append(adj_p)
dfq = pd.Series(q, index=dfx.columns).to_frame(name='q')

## calculate generalized fold change
# Despite its name, `median` holds what _get_fc returns per group: the mean
# of the distinct log-abundance values of every feature.
median = a.groupby('group').apply(_get_fc)
fc = median.loc['cancer'] - median.loc['n']
dffc = fc.to_frame(name='gfc')


## obtain info
# Split each feature name 'k__..|p__..|..' into {rank prefix: taxon} and
# expand it into one column per rank.
df1 = pd.Series(dffc.index, index=dffc.index).apply(
    lambda name: dict([part.split('__') for part in name.split('|')])
).apply(pd.Series)
level_dict = {'k':'kingdom', 'p':'phylum', 'c':'class' ,'o':'order' ,'f':'family' ,'g': 'genus','s': 'species'}
df1 = df1.rename(columns=level_dict)
dfgs = df1[['genus', 'species']]


# Join the feature-importance table (keyed by its 'v' column) with the
# fold change, q-value and taxonomy, ordered by decreasing importance.
# This task ranks by the 'cancer_avg_imp' column.
dfg = pd.read_csv('./CRC_results/feature_imp_results.csv', index_col=0)
df_gfc_q = dffc.join(dfq).join(dfgs)
dff = dfg.set_index('v').join(df_gfc_q).sort_values('cancer_avg_imp', ascending=False)[[ 'genus', 'species', 'x', 'y', 'cancer_avg_imp','q', 'gfc']]
dff.cancer_avg_imp = dff.cancer_avg_imp.round(1)
dff['q_l'] = dff.q.apply(_add_q_level)
dff['fc_l'] = dff.gfc.apply(_add_fc_level)
dff['fc_l2'] = dff.gfc.apply(_add_fc_level2)
dff['fc(q)'] = dff.fc_l2 +'('+ dff.q_l + ')'
dff.to_excel('./CRC_results/GFI_final_%s.xlsx' % task)

# Ranked view of the 20 most important features, indexed by species name.
x = dff.head(20)[['species', 'x', 'y', 'cancer_avg_imp','fc(q)']].reset_index(drop=True)
x['rank'] = x.index+1
x[['species', 'x', 'y', 'cancer_avg_imp', 'rank','fc(q)']].set_index('species')

Unnamed: 0_level_0,x,y,cancer_avg_imp,rank,fc(q)
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Peptostreptococcus_stomatis,10,8,5.4,1,↑↑↑↑↑(****)
Bacteroides_xylanisolvens,19,11,4.8,2,~↓(ns)
Parvimonas_unclassified,12,8,3.7,3,↑↑↑(**)
Gemella_morbillorum,11,9,3.6,4,↑↑↑(**)
Ruminococcus_bromii,19,5,3.5,5,↓(*)
Streptococcus_salivarius,17,8,3.5,6,↓(**)
Collinsella_aerofaciens,19,12,3.0,7,~↓(ns)
Bacteroides_fragilis,14,4,2.9,8,↑(*)
Bacteroides_finegoldii,20,13,2.8,9,~↓(ns)
Veillonella_unclassified,13,7,2.8,10,↑(ns)


In [170]:
# Display the label frame currently bound to `dfy`.
dfy

Unnamed: 0,group
0,n
1,n
2,n
3,cancer
4,n
...,...
129,n
130,adenoma
131,n
132,cancer
