In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu

In [None]:
# Demo data are published at https://github.com/shuzhao-li/khipu/tree/main/testdata/
# Read the tab-separated table with its first column as the row index, then preview the first rows.
ecoli = pd.read_table(
    "../../Datasets/ecoli_pos.tsv",
    index_col=0,
    sep="\t",
)
ecoli.head()

In [None]:
# t-test of each row between 12C and 13C samples
# data are log2 transformed in this test to have more normal distribution; +1 to avoid log2(0)


def ttest(row, group_a=slice(3, 6), group_b=slice(6, 9)):
    """Return the two-sample t-test p-value between two groups of values in one row.

    Both groups are log2(x + 1) transformed before testing; the +1 avoids log2(0).

    Parameters
    ----------
    row : pandas.Series
        One row of the feature table, as passed by ``DataFrame.apply(..., axis=1)``.
    group_a, group_b : slice, optional
        Positional slices selecting the two groups inside ``row``. The defaults
        keep the original hard-coded positions 3:6 and 6:9 (per the notebook's
        comment these are the 12C and 13C samples -- verify against the table layout).

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.ttest_ind``.
    """
    # .iloc states the positional intent explicitly. The float cast keeps np.log2
    # working when apply(axis=1) hands over an object-dtype row (mixed column types).
    log_a = np.log2(row.iloc[group_a].astype(float) + 1)
    log_b = np.log2(row.iloc[group_b].astype(float) + 1)
    _, p = ttest_ind(log_a, log_b)
    return p

# Score every feature (row) with ttest, then rank so the smallest p-values come first.
pvalues_featurelist = ecoli.apply(ttest, axis=1).sort_values()
pvalues_featurelist.head(10)

In [None]:
# The series is sorted by ascending p-value, so its first index label is the top feature.
most_significant = pvalues_featurelist.index[0]
print(most_significant)

# Read the table again without an index column so 'id_number' stays a regular field.
ecoli2 = pd.read_table("../../Datasets/ecoli_pos.tsv", sep="\t")

# Map each feature id to its full record (column name -> value).
ft_lookup = {record['id_number']: record for record in ecoli2.to_dict(orient='records')}
# Report only the size: printing the whole lookup would flood the cell output.
print(f"{len(ft_lookup)} features in ft_lookup")

# Sample columns are recognised by substring match on the column name.
all_samples = [col for col in ft_lookup[most_significant] if "Ecoli" in col]
print(all_samples)
C12_samples = [col for col in all_samples if "12C" in col]
C13_samples = [col for col in all_samples if "13C" in col]


In [None]:
# For plotting, seaborn is an excellent choice: it natively produces many of the plots
# a metabolomics data scientist would like to make.
# seaborn is a wrapper around matplotlib, so everything you know about matplotlib
# applies to seaborn too (with some tweaking).

# Generate some example data from a seeded generator so the figure is reproducible
rng = np.random.default_rng(42)
data = rng.normal(10, 2, 100)

# Draw both layers on one explicit Axes instead of relying on pyplot's current figure
fig, ax = plt.subplots()

# Create the box plot; outlier markers are hidden because every point is drawn below
sns.boxplot(data=data, showfliers=False, ax=ax)

# Add the individual points on top
sns.stripplot(data=data, color="black", size=4, jitter=0.2, ax=ax)

# Titles and axis labels are plain matplotlib
ax.set_title("Example BoxPlot")
ax.set_ylabel("Intensity")

plt.show()

In [None]:
# Let's look at the most significant feature, split by labelling group.

feature = ft_lookup[most_significant]
# Same log2(x + 1) transform that was used for the t-test
values = [np.log2(feature[x] + 1) for x in all_samples]
# One group label per sample, in the same order as `values`
groups = [
    "12C" if x in C12_samples else "13C" if x in C13_samples else "other"
    for x in all_samples
]

fig, ax = plt.subplots()
# Keyword arguments avoid the version-dependent meaning of a positional first argument
sns.boxplot(x=groups, y=values, showfliers=False, ax=ax)
sns.stripplot(x=groups, y=values, color="black", ax=ax)
ax.set_title(most_significant)
# The plotted values are log-transformed, so label the axis accordingly
ax.set_ylabel("log2(intensity + 1)")
plt.show()

In [None]:
# Volcano plot: log2 fold change (x) against -log10 p-value (y), one point per feature.

ALPHA = 0.05   # significance threshold on the raw (uncorrected) p-value
FC_CUTOFF = 2  # fold-change cutoff on the linear scale (2 = two-fold)
# `fc` below is a difference of log2 means, so compare it against the cutoff in log2 units
LOG2_FC_CUTOFF = np.log2(FC_CUTOFF)

fold_changes = []
p_values = []  # note: holds -log10(p), i.e. the y coordinate of the plot
colors = []
for ft in ft_lookup.values():
    # Transform once and reuse for both the fold change and the t-test
    log_C12 = np.log2(np.array([ft[x] for x in C12_samples]) + 1)
    log_C13 = np.log2(np.array([ft[x] for x in C13_samples]) + 1)
    fc = np.mean(log_C12) - np.mean(log_C13)
    _, p_val = ttest_ind(log_C12, log_C13)
    fold_changes.append(fc)
    p_values.append(-np.log10(p_val))
    # Red = passes both the significance and the fold-change threshold
    is_hit = p_val < ALPHA and abs(fc) > LOG2_FC_CUTOFF
    colors.append('r' if is_hit else 'k')

fig, ax = plt.subplots()
ax.scatter(fold_changes, p_values, c=colors)
ax.set_xlabel("log2 fold change (12C / 13C)")
ax.set_ylabel("-log10(p)")
plt.show()

In [None]:
# Now let's make a clustermap. A clustermap is like a heatmap, but the samples are
# organized based on some linkage between features.
# With the default parameters we get something like this:

log2_intensities = np.log2(ecoli2[all_samples] + 1)
sns.clustermap(log2_intensities)
plt.suptitle("clustermap log2 transformed")

In [None]:
# In many cases the z-score is a more appropriate value on which to do the linkage;
# fortunately, seaborn can compute it for you.
# z_score=1 standardises each column (here: each sample); z_score=0 would standardise each row.

# We can also add column colors to show the grouping by a variable of interest,
# in this case the labelling status of the samples: black = 12C, red = all other samples.
col_colors = ['k' if x in C12_samples else 'r' for x in all_samples]
sns.clustermap(np.log2(ecoli2[all_samples]+1), z_score=1, col_colors=col_colors)
# Title distinguishes this figure from the plain log2 clustermap
plt.suptitle("clustermap log2 transformed, z-scored per sample")

# We can see that samples clearly cluster on the basis of labelling status, which is expected.