In [None]:
# Install the pcpfm pipeline; the cells below drive it through its `pcpfm` command line tool.
!pip3 install pcpfm

# Patching: isocor is installed explicitly here.
# NOTE(review): presumably a dependency that installing pcpfm does not pull in on its own -- confirm.
# NOTE(review): `pip3` above and `pip` here may resolve to different Python environments -- confirm both
# install into the environment that provides the `pcpfm` command used below.
!pip install isocor

In [None]:
# Now let's make the metadata CSV.
# Here we use a slightly different dataset that includes an MS2 acquisition, a corrupted mzML file, and some blanks.
# We will do annotation and some QA/QC on this dataset.
import pandas as pd
import os

# Resolve the dataset directory once so that every recorded Filepath is absolute.
dataset_dir = os.path.abspath("../../Datasets/MT01")

metadata_dicts = []
for mzml_name in os.listdir(dataset_dir):
  if mzml_name.endswith(".mzML"):
    metadata_dicts.append({
        # os.path.splitext removes only the ".mzML" extension. str.rstrip(".mzML")
        # treats its argument as a set of characters and strips any trailing
        # '.', 'm', 'z', 'M' or 'L', so it can also eat the end of the sample
        # name itself (e.g. "QC_L.mzML" would become "QC_").
        "File Name": os.path.splitext(mzml_name)[0],
        # Files with "Blank" anywhere in their name are labelled as blanks; everything else is "Unknown".
        "Sample Type": "Blank" if "Blank" in mzml_name else "Unknown",
        "Filepath": os.path.join(dataset_dir, mzml_name),
        "Method": "Unknown"
    })
metadata_df = pd.DataFrame(metadata_dicts)
metadata_df.to_csv("adv_metadata.csv")
metadata_df.head()

In [None]:
# Now let's assemble the experiment object from the adv_metadata.csv file written earlier in this notebook.
# The following cells refer to the experiment as ./pcpfm_tutorial_advanced.

!pcpfm assemble -o . -j pcpfm_tutorial_advanced -s ./adv_metadata.csv

In [None]:
# And now run asari on the assembled experiment.
# Note that the corrupted mzML file is simply skipped during asari, effectively dropping it from the analysis.

!pcpfm asari -i ./pcpfm_tutorial_advanced

In [None]:
# Now let's examine the feature table as we did previously.
# Here we load the JSON file within the experiment directory to get the feature table paths.
import json

# Use a context manager so the file handle is closed as soon as the JSON has been parsed,
# instead of leaving an open handle behind in the kernel.
with open("./pcpfm_tutorial_advanced/experiment.json") as experiment_file:
  exp = json.load(experiment_file)
exp["feature_tables"]

# See, we have two feature tables: 'preferred' and 'full'.

In [None]:
# Load the 'preferred' feature table (tab-separated) and report its dimensions.
ft = pd.read_csv(exp["feature_tables"]["preferred"], sep="\t")
n_features, n_columns = ft.shape
# NOTE(review): the 11 subtracted here is presumably the number of non-sample columns in the table -- confirm.
print("Num Samples = ", n_columns - 11)
print("Num Features = ", n_features)

In [None]:
# Preview the first five rows of the feature table loaded above.
ft.head(n=5)

In [None]:
# Let's generate the PDF report to see the QA/QC plots. (This will take some time...)
# "Sample Type" and "File Name" are the metadata columns written to adv_metadata.csv earlier.

!pcpfm report -i ./pcpfm_tutorial_advanced/ --color_by='["Sample Type"]' --text_by='["File Name"]'

# You can open the report by clicking on the files to the left, then navigating to the path below:

In [None]:
# In the report, see the z-score count on page 10. Clearly the MSplate10 sample has many more features.
# This sample is not plasma, so we should drop it before normalizing all the other plasma samples.
# Similarly, we should not include blanks in the normalization.

# This command drops the samples whose "Sample Type" is "Blank" from the 'preferred' table
# and saves the result to a new table named 'preferred_no_blanks'.

!pcpfm drop_samples --table_moniker preferred --new_moniker preferred_no_blanks --drop_value Blank --drop_field "Sample Type" -i ./pcpfm_tutorial_advanced/


In [None]:
# Now let's drop the MSplate sample by name, reading from 'preferred_no_blanks' and saving to 'cleaned_preferred'.
# Note: this command prints no output confirming that a sample was dropped.

!pcpfm drop_samples --table_moniker preferred_no_blanks --new_moniker cleaned_preferred --drop_name MSplate10_pgpB_b1278_G5_platePN_rep2_HEAT_GEIII_0-829_1125 -i ./pcpfm_tutorial_advanced/

In [None]:
# Now we can normalize the samples based on the TIC of common features.
# This is now a reasonable normalization procedure because all remaining samples are similar matrices.
# The normalized table is saved under the new moniker 'normalized'.

!pcpfm normalize --table_moniker cleaned_preferred --new_moniker normalized -i ./pcpfm_tutorial_advanced/


In [None]:
# We could do more to process the feature table, but let's move on to empirical compounds (empCpds).
# asari builds a default list of empirical compounds when it is run, but we can build
# a new set in the pipeline. This is useful when you want to customize the rules for
# empCpd construction.

# By default, this will use the ionization mode determined by the pipeline, common adducts
# for that mode, and isotopes up to m+13C3.
# Here -tm selects the 'full' feature table and -em names the resulting empCpd set 'full'.
!pcpfm build_empCpds -i ./pcpfm_tutorial_advanced/ -tm full -em full

In [None]:
# Now that we have empirical compounds we can start to annotate them.
# Due to license restrictions, the HMDB, LIPID MAPS, and MoNA databases do not come pre-installed;
# you have to download them using the pipeline.

!pcpfm download_extras --accept_licenses True


In [None]:
# Now we have multiple empirical compounds:
# experiment.json is read again because `exp` was loaded before the commands above ran
# (presumably they update this file -- confirm).
# A context manager is used so the file handle is closed once the JSON has been parsed.
with open("./pcpfm_tutorial_advanced/experiment.json") as experiment_file:
  exp = json.load(experiment_file)
exp["empCpds"]



In [None]:
# Let's show an example of MS2 annotations.
# First we need to line up MS2 acquisitions from another experiment / acquisition with features in this experiment.
# By default this uses a 30 second RT tolerance and a 5 ppm m/z tolerance. The 5 ppm tolerance is effectively 10 ppm
# due to mass error in the features plus mass error in the precursor ions.
# The --ms2_dir argument can be a file or a directory of files.
# The mapped result is saved as a new empCpd set named 'MS2_mapped' (-nm), built from the 'full' set (-em).

!pcpfm map_ms2 -i ./pcpfm_tutorial_advanced/ -em full -nm MS2_mapped --ms2_dir=../../Datasets/ID_01.mzML

In [None]:
# Now let's annotate those mappings. This will use the MoNA Orbitrap LC-MS/MS database for
# the ionization mode determined by the pipeline.
# Ignore the warnings; they are due to an underlying library.

!pcpfm l2_annotate -i ./pcpfm_tutorial_advanced -em MS2_mapped -nm MoNA_annotated

In [None]:
# With the HMDB and LMSD downloaded, we can now add Level 4 (m/z only) annotations to the empCpds as follows:

!pcpfm l4_annotate -i ./pcpfm_tutorial_advanced/ -em MoNA_annotated -nm MoNA_HMDB_LMSD_annotated

In [None]:
# At this point, if you are comfortable with JSON, you have completed the processing.
# But if you would prefer to have tables for use with other tools, we can generate these with the following command:

!pcpfm generate_output -i ./pcpfm_tutorial_advanced/ -em MoNA_HMDB_LMSD_annotated -tm cleaned_preferred 

# This will map the MoNA_HMDB_LMSD_annotated empCpd annotations back to the cleaned_preferred feature table we created earlier.
# NOTE(review): the 'normalized' table produced by the normalize step is not the one exported here --
# confirm that exporting 'cleaned_preferred' is intended.

In [None]:
# The outputs are located in the output subdirectory:

output_files = os.listdir("./pcpfm_tutorial_advanced/output/")
print(output_files)

# This includes:
#   - the feature table you specified: 'cleaned_preferred_Feature_table.tsv'
#   - the empCpd file: 'MoNA_HMDB_LMSD_annotated_empCpds.json'
#   - the sample annotation table: 'sample_annot_table.tsv', which records sample metadata
#   - the feature annotation table: 'annotation_table.tsv', which is a record of the annotations
#   - the experiment file: 'experiment.json'

In [None]:
# Let's look at the annotations.
annotation_table = pd.read_csv(
    "./pcpfm_tutorial_advanced/output/annotation_table.tsv",
    sep="\t",
)
annotation_table.head()

In [None]:
# Let's look at the annotations again, keeping only the rows that have an MS2 score.
at = (
    pd.read_csv("./pcpfm_tutorial_advanced/output/annotation_table.tsv", sep="\t")
    .dropna(subset=["msms_score"])
)
at.head()

In [None]:
# Let's look at the exported feature table.
output_feature_table = pd.read_csv(
    "./pcpfm_tutorial_advanced/output/cleaned_preferred_Feature_table.tsv",
    sep="\t",
)
output_feature_table.head()

In [None]:
# Finally, preview the sample annotation table from the output directory.
sample_annot_table = pd.read_csv(
    "./pcpfm_tutorial_advanced/output/sample_annot_table.tsv",
    sep="\t",
)
sample_annot_table.head()

# Future versions of the pipeline will use this table for other purposes as well.