Updated run_abundance.py so that compressed and fastq files could be given as input; also see tipp tutorial
ekmolloy committed Feb 1, 2021
1 parent 6043a1e commit acb7735
Showing 3 changed files with 134 additions and 33 deletions.
6 changes: 3 additions & 3 deletions README.TIPP.md
@@ -79,9 +79,9 @@ To see options for running the script, use the command:

`run_tipp.py -h`

The main output of TIPP is a `_classification.txt` file that contains the annotation for each read. In addition, TIPP outputs a `.json` file with the placements, created according to pplacer format. Please refer to [pplacer website](http://matsen.github.com/pplacer/generated_rst/pplacer.html#json-format-specification) for more information on the format of the `.json` file. Also note that pplacer package provides a program called guppy that can read `.json` files and perform downstream steps such as visualization.

In addition to the `.json` file, TIPP outputs alignments of fragments to reference sets. There could be multiple alignment files created, each corresponding to a different placement subset.
The main output of TIPP is a `_classification.txt` file that contains the annotation for each read.
TIPP also outputs other information about the fragments, including the alignments (note that there could be multiple alignment files created, each corresponding to a different placement subset) as well as the phylogenetic placements.
Placements are given in the `.json` file, created according to the *pplacer* format. Please refer to the [pplacer website](http://matsen.github.com/pplacer/generated_rst/pplacer.html#json-format-specification) for more information on the format of the `.json` file. Also note that the *pplacer* package provides a program called *guppy* that can read `.json` files and perform downstream steps such as visualization.

By setting the `SEPP_DEBUG` environment variable to `True`, you can instruct SEPP to output more information that can be helpful for debugging.

116 changes: 103 additions & 13 deletions sepp/metagenomics.py
@@ -1,3 +1,4 @@
import gzip
import os
import sys
import tempfile
@@ -21,6 +22,45 @@
levels = ["species", "genus", "family", "order", "class", "phylum"]


def to_fasta(input, temp_dir):
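    """Return the path to a plain FASTA version of `input`.

    FASTA or FASTQ input is accepted, optionally gzip-compressed. Uncompressed
    FASTA input is returned unchanged; anything else is converted into a new
    decompressed FASTA file inside `temp_dir`, and the path to that file is
    returned.
    """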
iszipped = False
try:
fp = gzip.open(input, 'r')
line = fp.readline().decode('utf-8')
iszipped = True
except OSError:
fp = open(input, 'r')
line = fp.readline()
fp.close()

if line[0] == '>':
if iszipped:
print("Input is gzip'ed FASTA file, "
"creating a decompressed version")
fiter = fastagz_iter(input)
else:
print("Input is FASTA file, proceeding to run BLAST")
return input
elif line[0] == '@':
if iszipped:
print("Input is gzip'ed FASTQ file, "
"creating a decompressed FASTA-formatted version")
fiter = fastqgz_iter(input)
else:
print("Input is FASTQ file, "
"creating a FASTA-formatted version")
            fiter = fastq_iter(input)
    else:
        # Neither '>' nor '@' on the first line; fail with a clear error
        # instead of reaching the undefined 'fiter' below.
        raise ValueError("Unable to determine whether %s is a "
                         "FASTA or FASTQ file" % input)

# Create a new input file
    # Use the basename so the converted file is written directly inside temp_dir
    new_input = os.path.join(
        temp_dir, os.path.basename(input).split('.', 1)[0] + ".fasta")
with open(new_input, 'w') as fp:
for ff in fiter:
fp.write('>' + ff[0] + '\n')
fp.write(ff[1] + '\n')

return new_input


def load_reference_package():
global refpkg

@@ -94,6 +134,9 @@ def build_profile(input, output_directory):

temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))

# New option to allow fastq files as input
input = to_fasta(input, temp_dir)

if (options().bin == "hmmer"):
binned_fragments = hmmer_to_markers(input, temp_dir)
else:
@@ -540,22 +583,47 @@ def hmmer_to_markers(input, temp_dir):

def fasta_iter(fasta_name):
with open(fasta_name, 'r') as fp:
line = fp.readline()
if line[0] != '>':
raise("Unable to read %s" % fasta_name)
xline = fp.readline().strip()
# if line[0] != '>':
# raise("Unable to read %s" % fasta_name)
nextheader = xline
nextseq = []

nextheader = line.strip()
for line in fp:
xline = line.strip()
if xline[0] == '>':
header = nextheader[1:].split(' ')[0]
seq = "".join(nextseq)
nextheader = xline
nextseq = []
yield header, seq
else:
nextseq.append(xline)

        header = nextheader[1:].split(' ')[0]  # keep only the first token, as in the loop above
seq = "".join(nextseq)
yield header, seq


def fastagz_iter(fasta_name):
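    """Iterate over a gzip-compressed FASTA file, yielding (name, sequence) pairs."""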
with gzip.open(fasta_name, 'r') as fp:
xline = fp.readline().decode('utf-8').strip()
# if line[0] != '>':
# raise("Unable to read %s" % fasta_name)
nextheader = xline
nextseq = []

for line in fp:
if line[0] == '>':
header = nextheader[1:]
xline = line.decode('utf-8').strip()
if xline[0] == '>':
header = nextheader[1:].split(' ')[0]
seq = "".join(nextseq)
nextheader = line.strip()
nextheader = xline
nextseq = []
yield header, seq
else:
nextseq.append(line.strip())
nextseq.append(xline)

        header = nextheader[1:].split(' ')[0]  # keep only the first token, as in the loop above
seq = "".join(nextseq)
@@ -569,14 +637,36 @@ def fastq_iter(fastq_name):
"""
with open(fastq_name, 'r') as fp:
for i, line in enumerate(fp):
if i % 4 == 2:
xline = line.strip()
if i % 4 == 0:
# First line is name
header = xline[1:].split(' ')[0]
elif i % 4 == 1:
# Second line is sequence
seq = xline
elif i % 4 == 2:
# Third line is '+'
continue
elif i % 4 == 0:
header = line[1:]
else:
# Fourth line is quality
yield header, seq


def fastqgz_iter(fastq_name):
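    """Iterate over a gzip-compressed FASTQ file, yielding (name, sequence) pairs."""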
with gzip.open(fastq_name, 'r') as fp:
for i, line in enumerate(fp):
xline = line.decode('utf-8').strip()
if i % 4 == 0:
# First line is name
header = xline[1:].split(' ')[0]
elif i % 4 == 1:
seq = line.strip()
# Second line is sequence
seq = xline
elif i % 4 == 2:
# Third line is '+'
continue
else:
# qual = line.strip()
# Fourth line is quality
yield header, seq


45 changes: 28 additions & 17 deletions tutorial/tipp-tutorial.md
@@ -137,19 +137,25 @@ git pull
Running TIPP
============

The general command for running TIPP for a specific pre-computed marker is:
```
run_tipp.py -R <reference_marker> -f <fragment_file>
```

If your installation is successful, you should be able to run TIPP by running the following command from any location. Open up a terminal window and type:

```
run_tipp.py -h
```

Running TIPP with the `-h` option produces the list of options available in TIPP.

The general command for running TIPP for a specific pre-computed marker is:

```
run_tipp.py -R <reference_marker> -f <fragment_file>
```

and

```
run_abundance.py -h
```
Running these scripts with the `-h` option produces the list of options available in TIPP.
There are two options that are generally helpful to know about:
1. `-x NCPU` allows you to set the number of CPUs used by TIPP to `NCPU` (note that by default TIPP uses all CPUs on the machine)
2. `-p TMPDIR` allows you to specify the directory to which TIPP writes temporary files (note that by default TIPP writes temporary files to `/tmp/sepp/` and does *NOT* delete these temporary files)
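
If you are scripting TIPP as part of a larger pipeline, these flags can also be passed programmatically. Below is a minimal Python sketch; the input and output paths are placeholders, and we assume `run_abundance.py` is on your `PATH` and accepts the `-x`/`-p` options described above:
```
import subprocess

# Placeholder paths; replace with your own fragment file, output directory,
# and scratch location. "-x 4" limits TIPP to 4 CPUs and "-p" points the
# temporary files at a scratch directory (they are not deleted automatically).
subprocess.run(
    ["run_abundance.py",
     "-f", "reads.fasta",
     "-d", "profile_output",
     "-x", "4",
     "-p", "/scratch/tipp_tmp"],
    check=True)
```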


Task 1: Performing read classification.
---------------------------------------
@@ -168,7 +174,7 @@ NOTE: The TIPP2 reference package includes three reference data sets:
- markers-v2 = additional markers distributed with the 2014 paper
- markers-v3 = reference dataset created and benchmarked in the 2020 TIPP paper

This command runs TIPP on the fragmentary sequences that have been binned to the pyrg gene from the reference dataset benchmarked in the 2014 paper. In other words, the TIPP classification algorithm will use the pre-computed alignment and tree that has been estimated on the known bacterial pyrg genes.
This command runs TIPP on the fragmentary sequences that have *already been binned* to the pyrg gene from the reference dataset benchmarked in the 2014 paper. In other words, the TIPP classification algorithm will use the pre-computed alignment and tree that have been estimated on the known bacterial pyrg genes.

The main output of TIPP is an `output_classification.txt` file that contains the classification of each read. The classification consists of the name of the read, the NCBI taxonomic id of the classification, the rank of the classification, the name of the classification, and the support of the classification.
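
For a quick look at the well-supported classifications, the file can be parsed with a few lines of Python. This is only a sketch: the column order follows the description above, and we assume comma-separated columns (switch the delimiter if your copy of the file is tab-separated):
```
import csv

threshold = 0.95  # keep only classifications with support >= 95%
with open("output_classification.txt") as handle:
    for row in csv.reader(handle):  # use delimiter='\t' if tab-separated
        if len(row) < 5:
            continue  # skip blank or malformed lines
        read_name, tax_id, rank, tax_name, support = row[:5]
        if float(support) >= threshold:
            print(read_name, rank, tax_name, support, sep="\t")
```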

Expand All @@ -195,19 +201,18 @@ run_tipp.py -R markers-v1/pyrg \
-pt 0.0
```

In addition, TIPP outputs a `.json` file with the placements, created according to pplacer format. Please refer to [pplacer website](http://matsen.github.com/pplacer/generated_rst/pplacer.html#json-format-specification) for more information on the format of the `.json` file. Also note that pplacer package provides a program called guppy that can read `.json` files and perform downstream steps such as visualization.

In addition to the `.json` file, TIPP outputs alignments of fragments to reference sets. There could be multiple alignment files created, each corresponding to a different placement subset.
Besides the classification file, TIPP outputs other information about the fragments, including the alignments (note that there could be multiple alignment files created, each corresponding to a different placement subset) as well as the phylogenetic placements.
Placements are given in the `.json` file, created according to the *pplacer* format. Please refer to the [pplacer website](http://matsen.github.com/pplacer/generated_rst/pplacer.html#json-format-specification) for more information on the format of the `.json` file. Also note that the *pplacer* package provides a program called *guppy* that can read `.json` files and perform downstream steps such as visualization.
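
If you prefer to inspect the placements without *guppy*, the `.json` file can be read with any JSON parser. The sketch below assumes the standard *pplacer*/jplace layout (a `fields` list naming the placement columns and a `placements` list of records); the filename is a placeholder for whatever your run produced:
```
import json

with open("output_placement.json") as handle:  # placeholder filename
    jplace = json.load(handle)

fields = jplace["fields"]  # column names for each placement row
for record in jplace["placements"]:
    # A record lists one or more read names ("n" or "nm") and candidate placements ("p").
    names = record.get("n") or [nm[0] for nm in record.get("nm", [])]
    for placement in record["p"]:
        print(names, dict(zip(fields, placement)))
```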

Task 2: Converting the result into an abundance profile.
--------------------------------------------------------
The classification file lists all possible classifications for a fragment, even if it has very low support. In some situations, we only want the most highly supported classifications. We can use a helper tool to convert the classification file into a profile.

```
run_tipp_tool.py -g markers-v1/pyrg \
-d output_profile \
-o markers-v1-pyrg \
-i output_classification.txt \
-i output_lower_threshold_classification.txt \
-t 0.95
```

@@ -228,7 +233,7 @@ EAS25_26_1_16_1241_1747_0_2 NA NA NA NA NA 28890

This file lists the classification (shown as NCBI taxonomic ids) of each fragment at each of the taxonomic rankings. If a fragment does not meet the support threshold (95% in this case), it will be left as unclassified (NA).

Let's look at `abundance.species.csv`. The file shows the abundance profiles for the species level. The file shows that 65% of the reads belong to the species Methanobrevibacter smithii and 35% of the fragments were unclassified at the species level.
Let's look at `output_profile/abundance.species.csv`. This file shows the abundance profile at the species level: 65% of the reads belong to the species Methanobrevibacter smithii, and 35% of the fragments were unclassified at the species level.
```
taxa abundance
Methanobrevibacter smithii 0.6456
@@ -248,10 +253,15 @@ The general command for `run_abundance.py` is:
run_abundance.py -f <fragment file> -d <output directory>
```

The input fragment files must be in a format accepted by BLAST (i.e. a decompressed FASTA file with no spaces in the read names).
The output are tab delimited files that estimate the abundance at a given taxonomic level.
Each input fragment file must be a decompressed FASTA file with no spaces in the read names and must meet any other naming restrictions required by BLAST.
The outputs are tab-delimited files that estimate the abundance at each taxonomic level.
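
As a sketch of how these files can be consumed downstream, the snippet below loads one profile into a dictionary. It assumes the same two-column layout as the `abundance.species.csv` example in Task 2 (a header line followed by taxon name and abundance, separated by tabs or spaces); the directory name is a placeholder:
```
def read_profile(path):
    """Load one abundance profile into a {taxon: fraction} dictionary."""
    profile = {}
    with open(path) as handle:
        next(handle)  # skip the "taxa   abundance" header line
        for line in handle:
            line = line.strip()
            if not line:
                continue
            # Taxon names may contain spaces, so split off only the last field.
            taxon, abundance = line.rsplit(None, 1)
            profile[taxon] = float(abundance)
    return profile

species = read_profile("profile_output/abundance.species.csv")  # placeholder path
for taxon, fraction in sorted(species.items(), key=lambda kv: -kv[1]):
    print(taxon, fraction, sep="\t")
```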

Recently, we have tried to improve the user experience by allowing a (gzip'ed or decompressed) FASTQ file to be given as input.
Note that this option
+ has not been widely tested yet, and
+ requires a lot of storage, because it operates by creating a temporary file (i.e. a decompressed FASTA file) to be given to BLAST as input (note that this implementation is the result of BLAST not accepting compressed files and the `blastall` tool no longer being supported).

By default, this command will use the reference dataset benchmarked in the 2020 paper. This is equivalent to specifying the option `-g markers-v3`; you can use the markers benchmarked in the 2014 paper by adding the `-G markers-v1` option.
By default, `run_abundance.py` will use the reference dataset benchmarked in the 2020 paper. This is equivalent to specifying the option `-g markers-v3`; you can use the markers benchmarked in the 2014 paper by adding the `-G markers-v1` option.

For example, go to the `test/unittest/data/mock/mixed` directory
```
@@ -330,7 +340,8 @@ Thus, specialized marker datasets can be generated for any organisms, not just b
Task 5: Running a 16S amplicon analysis.
----------------------------------------

Finally, we have included a 16S reference marker gene that can be used to analyze 16S amplicon data. Below is an example of running TIPP on 16S amplicon data.
The 16S reference package has not been updated since 2014, so we caution against using this reference package. For completeness, we include the commands from the original TIPP tutorial.
Below is an example of running TIPP on 16S amplicon data.
```
run_tipp.py -R 16S-bacteria-v1/16S_bacteria \
-f test/unittest/data/mock/16S_bacteria/human_gut_16S.fas \
