# Reformatting PICRUSt2 results into csv files for prediction and concordance analyses

Quang Nguyen.   
Last updated 2022-04-27. 

In [1]:
import pandas as pd

Process CRC data set

In [61]:
# Load the CRC PICRUSt2 pathway-abundance table and the matching QIIME2 metadata.
# header=None keeps the file's first row as data and index_col=0 makes the first
# column the index; the next cell transposes and relies on that layout.
feat = pd.read_csv(
    "../output/picrust2/crc_16s/pathways_out/path_abun_unstrat.tsv.gz",
    sep="\t",
    header=None,
    index_col=0,
)
meta = pd.read_csv("../metadata/crc_qiime2_metadata.tsv", sep="\t")

In [62]:
# Transpose so each row is one sample; the former index label "pathway" becomes
# a column, which is renamed to "sample-id" to match the metadata key.
feat = feat.T.rename(columns={"pathway": "sample-id"})

In [63]:
# Class balance of the diagnosis labels before any filtering.
meta["diagnosis"].value_counts()

Cancer           53
Normal           50
Small adenoma    25
Large adenoma    13
Name: diagnosis, dtype: int64

In [64]:
# Preview the first five samples of the transposed pathway table.
feat.head(n=5)

Unnamed: 0,sample-id,1CMET2-PWY,3-HYDROXYPHENYLACETATE-DEGRADATION-PWY,ALL-CHORISMATE-PWY,ANAEROFRUCAT-PWY,ANAGLYCOLYSIS-PWY,ARG+POLYAMINE-SYN,ARGDEG-PWY,ARGORNPROST-PWY,ARGSYN-PWY,...,THISYN-PWY,THREOCAT-PWY,THRESYN-PWY,TRNA-CHARGING-PWY,TRPSYN-PWY,TYRFUMCAT-PWY,UBISYN-PWY,UDPNAGSYN-PWY,VALDEG-PWY,VALSYN-PWY
1,DE-013,61973.91933251762,67.85731800195911,673.6141185916941,60077.10084855479,75998.57921049336,12238.506185741428,77.88788425889851,5417.421675205796,42005.34332144665,...,47573.82974530525,138.689713491819,73957.86487062651,65015.8404828085,54262.87288743421,223.76597489023533,1487.094100200029,29543.016481934348,0.0,75093.97647569676
2,DE-029,87095.5147461876,3643.129838213508,16102.596458952266,79376.66223894118,110133.19807518156,33986.158956427505,4597.498472849865,16514.1377539049,72920.16762312053,...,66102.6767625961,718.3805838410658,102065.88527844098,95455.8081532026,75179.28574235355,7131.197491124653,14277.211940487115,59824.613320583376,0.0,111519.2158981305
3,DE-031,95868.59080758912,185.4740519809013,1445.6667441393508,101269.57122093816,130964.31204106688,24410.373227526903,297.4228001193948,5642.883596584262,73228.58973755263,...,73548.40437391664,484.5928931839396,114046.5505811318,104778.7606209109,81972.44971688415,146.77763829996474,5799.502595354073,52291.89672594335,20.96821574565228,119840.9833938164
4,DE-034,67249.82708187414,7705.641481573197,19024.389954852995,64159.45608814975,86332.64538316694,33304.75517947081,10517.845389310403,2530.272871565265,59666.43466720876,...,40853.86430631721,6633.384845013459,77622.51557920192,76052.44783402057,60207.68394415295,63.803384733773065,12331.386739700987,57794.85529760696,1202.9145356954589,86135.55807982513
5,DE-037,65139.67051477415,60.73170009372745,709.5503667393648,61555.25871306296,78960.5739579014,19476.78325802615,44.57436803139535,10655.96496168623,61288.3323045651,...,51833.85779736296,73.76899815050776,78948.9283939209,72446.36680705452,57800.80993162157,2869.4021249528405,6724.685099531341,40946.25657172197,26.959435403775636,84413.83120566026


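Due to class imbalance, we restrict the analysis to Cancer versus Normal patients only. We then merge the feature table with the metadata on `sample-id` so that the rows of the exported feature and metadata files refer to the same samples in the same order.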

In [66]:
# Keep only the two diagnosis groups used for prediction, then join features and
# metadata on the sample key so both share the same rows in the same order.
keep_diagnoses = ["Cancer", "Normal"]
meta = meta.loc[meta["diagnosis"].isin(keep_diagnoses)]
merged = pd.merge(feat, meta, on="sample-id")

In [82]:
# Metadata export: one diagnosis label per sample, indexed by sample id.
met_export = merged[["sample-id", "diagnosis"]].set_index("sample-id")
met_export.to_csv("../data/pred_picrust2_crc_metadata.csv")

In [81]:
# Feature export: drop the metadata columns brought in by the merge and keep
# everything else, indexed by sample id.
metadata_columns = [
    'diagnosis',
    'seq_sample_id',
    'age',
    'forward-absolute-filepath',
    'reverse-absolute-filepath',
]
is_feature_column = ~merged.columns.isin(metadata_columns)
feat_export = merged.loc[:, is_feature_column].set_index("sample-id")
feat_export.to_csv("../data/pred_picrust2_crc_feat.csv")

Let's do the same for the remaining (IBD) data set.

In [83]:
# Load the IBD PICRUSt2 pathway-abundance table and the matching QIIME2 metadata.
# As with the CRC table, header=None keeps the first row as data and index_col=0
# makes the first column the index, ready for the transpose in the next cell.
feat = pd.read_csv(
    "../output/picrust2/ibd_16s/pathways_out/path_abun_unstrat.tsv.gz",
    sep="\t",
    header=None,
    index_col=0,
)
meta = pd.read_csv("../metadata/ibd_qiime2_metadata.tsv", sep="\t")

In [86]:
# Transpose so each row is one sample; the former index label "pathway" becomes
# a column, which is renamed to "sample-id" to match the metadata key.
feat = feat.T.rename(columns={"pathway": "sample-id"})

In [88]:
# Distribution of disease subtype labels before filtering.
meta["diseasesubtype"].value_counts()

iCD    251
no     194
UC      72
cCD     70
IC      34
CD      18
Name: diseasesubtype, dtype: int64

In [91]:
# Keep the iCD / cCD / CD samples plus the "no" group (presumably non-IBD
# controls; confirm against the metadata dictionary), retaining only the sample
# key and the label column.
kept_subtypes = ["iCD", "no", "cCD", "CD"]
meta = meta.loc[
    meta["diseasesubtype"].isin(kept_subtypes),
    ["sample-id", "diseasesubtype"],
]
# Collapse the iCD and cCD labels into a single "CD" class.
meta = meta.replace({"iCD": "CD", "cCD": "CD"})

In [93]:
# Check the class balance after collapsing the subtype labels.
meta["diseasesubtype"].value_counts()

CD    339
no    194
Name: diseasesubtype, dtype: int64

In [101]:
# Join the IBD pathway table with the filtered metadata on the shared sample key.
# At this point `meta` holds only "sample-id" and "diseasesubtype", so naming the
# key explicitly matches the default join and mirrors the CRC merge.
merge = feat.merge(meta, on="sample-id")

# Metadata export: one subtype label per sample, indexed by sample id.
meta_export = merge[["sample-id", "diseasesubtype"]].set_index("sample-id")

# Feature export: every column except the label. The mask must be built from
# `merge` (this IBD frame), not from the CRC frame `merged`, whose column set
# differs and would give a mask of the wrong length or select the wrong columns.
feat_export = merge.loc[:, ~merge.columns.isin(["diseasesubtype"])]
feat_export = feat_export.set_index("sample-id")

In [103]:
# Write the IBD feature table first, then the metadata table, both as CSV.
ibd_exports = (
    (feat_export, "../data/pred_picrust2_ibd_feat.csv"),
    (meta_export, "../data/pred_picrust2_ibd_metadata.csv"),
)
for frame, out_path in ibd_exports:
    frame.to_csv(out_path)