In [31]:
import os
import pandas as pd
import json
from functools import reduce

In [7]:
# Work inside the pipeline's output directory so that every relative path used
# in this notebook (params.json, matrix.tsv, output/...) resolves there.
# os.environ[...] fails fast with a KeyError naming the variable when
# OUTPUT_DIR is unset; os.getenv() would return None and os.chdir(None) would
# raise an opaque TypeError instead.
os.chdir(os.environ["OUTPUT_DIR"])
params_path = "./params.json"

In [10]:
# Load the run parameters. JSON text is UTF-8, so decode it explicitly rather
# than with the platform's locale-dependent default encoding.
with open(params_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# data['treatment'] + data['control'] 

In [11]:
def get_metadata_group(db_dict, group, group_name):
    """Build a two-column table labelling every sample of one group.

    Args:
        db_dict: Mapping whose ``group`` entry is an iterable of sample
            records; each record must provide a ``'sample_name'`` key.
        group: Key of the group to read from ``db_dict``.
        group_name: Label written into the ``group`` column of every row.

    Returns:
        pandas.DataFrame with the columns ``sample_name`` and ``group``, one
        row per record found in ``db_dict[group]``.
    """
    rows = []
    for record in db_dict[group]:
        rows.append((record['sample_name'], group_name))
    return pd.DataFrame(rows, columns=['sample_name', 'group'])

def get_metadata(params):
    """Assemble the sample-to-group table for all configured groups.

    Args:
        params: Parameter mapping. ``params["groups_name"]`` maps each group
            key to the label to use for it, and ``params[<group key>]`` holds
            that group's sample records (see ``get_metadata_group``).

    Returns:
        pandas.DataFrame indexed by ``sample_name`` with a single ``group``
        column, containing the rows of every group in iteration order.
    """
    per_group_tables = []
    for group_key, group_label in params["groups_name"].items():
        per_group_tables.append(get_metadata_group(params, group_key, group_label))
    combined = pd.concat(per_group_tables, ignore_index=True)
    return combined.set_index("sample_name")

In [12]:
def get_one_df(file, sample_key):
    """Read one taxonomic profile and return its abundances as one column.

    The file is parsed as header-less, tab-separated text in which ``#``
    starts a comment; it must contain exactly four columns (presumably a
    MetaPhlAn-style profile — confirm against the producing tool).

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
        sample_key: Name given to the resulting abundance column.

    Returns:
        pandas.DataFrame indexed by ``clade_name`` with a single column named
        ``sample_key`` holding the relative abundances.
    """
    raw_columns = ["clade_name", "ncbi_tax_id", "relative_abundance", "additional_species"]
    profile = pd.read_csv(file, sep="\t", comment="#", header=None)
    profile.columns = raw_columns
    return (
        profile
        .rename(columns={"relative_abundance": sample_key})
        .drop(columns=["additional_species", "ncbi_tax_id"])
        .set_index(["clade_name"])
    )

In [13]:
# Flatten the per-group sample lists into a single list of sample records.
# A nested comprehension replaces sum(list_of_lists, []), which re-copies the
# accumulated list on every addition (quadratic in the number of samples).
# NOTE(review): groups are taken from params["groups"] here but from
# params["groups_name"] in get_metadata — confirm both describe the same groups.
samples = [sample for group in params["groups"] for sample in params[group]]
# One single-column abundance table per sample, named after the sample.
df_list = [get_one_df(item['profile'], item['sample_name']) for item in samples]

In [14]:
# Fold all per-sample tables into one clade-by-sample matrix: each step
# outer-joins the next table on the shared clade_name index. Clades absent
# from a sample come out as NaN and are replaced with an abundance of 0.
df = reduce(
    lambda left, right: pd.merge(left, right, how="outer", left_index=True, right_index=True),
    df_list,
).fillna(0)

In [15]:
# Sample-to-group lookup table, indexed by sample_name.
metadata = get_metadata(params)

In [17]:
# Drop every row whose clade name contains the 't__' rank prefix (presumably
# the strain/SGB level — confirm against the profiler's output format).
# Masking the index directly avoids the reset_index/query/set_index round trip
# and DataFrame.query's engine-dependent support for `.str` method calls.
# The match is a literal substring test, and the cell is safe to re-run.
df = df.loc[~df.index.str.contains("t__", regex=False, na=False)]

In [19]:
# Transpose so that samples become rows, then attach each sample's group label
# from the metadata table. The inner join keeps only samples present in both.
df_merge = pd.merge(df.T,metadata,left_index=True,right_index=True, how="inner" )

In [20]:
# Sanity check: rows are samples, columns are the clade columns plus 'group'.
df_merge.shape

(20, 1567)

In [21]:
# Rotate the trailing column (the 'group' label appended by the merge) to the
# front, then transpose so that it becomes the first row and every sample
# becomes a column.
n_columns = df_merge.shape[1]
group_first = [-1] + list(range(n_columns - 1))
df_res = df_merge.iloc[:, group_first].T
# The former column labels now live in the index; expose them as a regular
# column named 'sample_name'.
df_res = df_res.reset_index()
df_res = df_res.rename(columns={"index": "sample_name"})

In [22]:
# Write the matrix as tab-separated text (without the pandas index column);
# this file is the input of the lefse_format_input.py step. Then display it.
df_res.to_csv("matrix.tsv",sep="\t",index=False)
df_res

Unnamed: 0,sample_name,OCC1,OCC6,OCC7,OCC4,OCC3,OCC9,OCC10,OCC5,OCC2,...,YCC2,YCC3,YCC7,YCC6,YCC5,YCC1,YCC9,YCC8,YCC4,YCC10
0,group,OCC,OCC,OCC,OCC,OCC,OCC,OCC,OCC,OCC,...,YCC,YCC,YCC,YCC,YCC,YCC,YCC,YCC,YCC,YCC
1,k__Bacteria,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2,k__Bacteria|p__Actinobacteria,0.0,0.0,0.01801,0.0,0.01584,0.00343,0.025,0.0,0.00464,...,0.12092,0.03552,0.19177,1.5576,0.13716,0.01796,0.16938,3.29313,0.14616,1.69004
3,k__Bacteria|p__Actinobacteria|c__Actinomycetia,0.0,0.0,0.01786,0.0,0.0,0.0,0.00616,0.0,0.00319,...,0.00112,0.0,0.00519,0.01399,0.00326,0.0,0.01059,0.02068,0.00031,0.00781
4,k__Bacteria|p__Actinobacteria|c__Actinomycetia...,0.0,0.0,0.0,0.0,0.0,0.0,0.00238,0.0,0.0,...,0.00112,0.0,0.00519,0.01399,0.00326,0.0,0.01059,0.02068,0.00031,0.00781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,k__Bacteria|p__Verrucomicrobia|c__Verrucomicro...,0.0,0.2551,4.32336,0.0,0.0,0.0,0.01828,0.0,0.0,...,1.40818,1.92033,2.25224,2.39966,1.23006,0.72735,0.95369,0.59731,0.18048,0.43558
1563,k__Bacteria|p__Verrucomicrobia|c__Verrucomicro...,0.0,0.2551,4.32336,0.0,0.0,0.0,0.01828,0.0,0.0,...,1.40818,1.92033,2.25224,2.39966,1.23006,0.72735,0.95369,0.59731,0.18048,0.43558
1564,k__Bacteria|p__Verrucomicrobia|c__Verrucomicro...,0.0,0.2551,4.32336,0.0,0.0,0.0,0.01828,0.0,0.0,...,1.40818,1.92033,2.25224,2.39966,1.23006,0.72735,0.95369,0.59731,0.18048,0.43558
1565,k__Bacteria|p__Verrucomicrobia|c__Verrucomicro...,0.0,0.2551,4.32336,0.0,0.0,0.0,0.01828,0.0,0.0,...,1.40818,1.92033,2.25224,2.39966,1.23006,0.72735,0.95369,0.59731,0.18048,0.43558


In [23]:
# Convert matrix.tsv into LEfSe's input file matrix.in.
# NOTE(review): per the LEfSe documentation, -c / -s / -u select the class,
# subclass and subject rows and -o sets the normalization value — confirm
# against the installed version. Class and subclass both point at row 2 here
# (the 'group' row of matrix.tsv); subjects come from row 1 (the header row).
!lefse_format_input.py matrix.tsv	 matrix.in -c 2 -s 2 -u 1 -o 1000000

In [None]:
# Run the LEfSe analysis. The plotting and inspection cells of this notebook
# read ./matrix.res, so the result is written there; previously this cell only
# wrote output/matrix.tsv and nothing in the notebook produced matrix.res.
# output/ is created if missing, and a copy of the result is kept at
# output/matrix.tsv, the path this cell originally wrote.
!mkdir -p output && lefse_run.py matrix.in matrix.res && cp matrix.res output/matrix.tsv

In [25]:
# Debug aid: print the current working directory.
!pwd

/ssd1/wy/workspace2/nextflow_workspace/289364b1-295c-4710-833e-d68ec7c8918e/d0908e76-edfb-4f4a-9d47-dc1095ffcf59/9b2f5e85-9f3a-43ff-b4c4-293a12cac969


In [27]:
# Render the cladogram from matrix.res into output/matrix.cladogram.pdf at 300 dpi.
# MPLBACKEND=Agg selects matplotlib's non-interactive backend, so no display is needed.
!MPLBACKEND=Agg  lefse_plot_cladogram.py matrix.res output/matrix.cladogram.pdf   --dpi 300
# Optional styling flags, kept for reference (currently not passed to the command):
# \
#   --format pdf \
#   --dpi 600 \
#   --label_font_size 14 \
#   --title_font_size 16 \
#   --max_point_size 40 \
#   --colored_labels 1 \
#   --class_legend_vis 0 \
#   --class_legend_font_size 12

clade_sep parameter too large, lowered to 0.200225830078125


In [30]:
# Plot the LEfSe results from matrix.res into output/matrix.pdf at 300 dpi,
# again using matplotlib's non-interactive Agg backend.
!MPLBACKEND=Agg lefse_plot_res.py matrix.res output/matrix.pdf   --dpi 300

In [28]:
# from IPython.display import Image, display
# # Display the image
# display(Image(filename="./matrix.cladogram.png"))

In [32]:
# Inspect the raw LEfSe result table. The explicit `!` shell escape is used
# because a bare `cat` depends on IPython's automagic/alias handling being
# enabled, which is not guaranteed in every front end.
!cat matrix.res

k__Bacteria.p__Actinobacteria	3.0887054078715375	YCC	2.7895283737226015	0.0002561292392215266
k__Bacteria.p__Actinobacteria.c__Actinomycetia	1.0208445521836733			-
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Bifidobacteriales	1.0208445521836733			0.00220164792740509
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Bifidobacteriales.f__Bifidobacteriaceae	1.0208445521836733			0.00220164792740509
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Bifidobacteriales.f__Bifidobacteriaceae.g__Bifidobacterium	1.0208445521836733			0.00220164792740509
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Bifidobacteriales.f__Bifidobacteriaceae.g__Bifidobacterium.s__Bifidobacterium_pseudolongum	1.0208445521836733			0.00220164792740509
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Corynebacteriales	0.6168254994123914			-
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Corynebacteriales.f__Corynebacteriaceae	0.6168254994123914			-
k__Bacteria.p__Actinobacteria.c__Actinomycetia.o__Corynebact