Add .fai index creation support #842

Merged
merged 6 commits into from Apr 25, 2017
@@ -31,6 +31,9 @@
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.LinkedHashMap;
@@ -39,7 +42,7 @@
import java.util.regex.MatchResult;
/**
- * Reads a fasta index file (.fai), as generated by `samtools faidx`.
+ * Reads/writes a fasta index file (.fai), as generated by `samtools faidx`.
*/
public class FastaSequenceIndex implements Iterable<FastaSequenceIndexEntry> {
/**
@@ -159,6 +162,27 @@ private void parseIndexFile(Path indexFile) {
}
/**
+ * Writes this index to the specified path.
@tfenne

tfenne Apr 5, 2017

Owner

Might also want to update the class level javadoc now that writing is supported!

@magicDGS

magicDGS Apr 5, 2017

Contributor

Done.

+ *
+ * @param indexFile index file to output the index in the .fai format
+ *
+ * @throws IOException if an IO error occurs.
+ */
+ public void write(final Path indexFile) throws IOException {
+ try (final PrintStream writer = new PrintStream(Files.newOutputStream(indexFile))) {
+ sequenceEntries.values().forEach(se ->
+ writer.println(String.join("\t",
+ se.getContig(),
+ String.valueOf(se.getSize()),
+ String.valueOf(se.getLocation()),
+ String.valueOf(se.getBasesPerLine()),
+ String.valueOf(se.getBytesPerLine()))
+ )
+ );
+ }
+ }
+
+ /**
* Does the given contig name have a corresponding entry?
* @param contigName The contig name for which to search.
* @return True if contig name is present; false otherwise.
@@ -0,0 +1,180 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2017 Daniel Gomez-Sanchez
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package htsjdk.samtools.reference;
+
+import htsjdk.samtools.SAMException;
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.util.IOUtil;
+import htsjdk.tribble.readers.AsciiLineReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Static methods to create an {@link FastaSequenceIndex}.
+ *
+ * @author Daniel Gomez-Sanchez (magicDGS)
+ */
+public final class FastaSequenceIndexCreator {
+
+ // cannot be instantiated because it is an utility class
+ private FastaSequenceIndexCreator() {}
+
+ /**
+ * Creates a FASTA .fai index for the provided FASTA.
+ *
+ * @param fastaFile the file to build the index from.
+ * @param overwrite if the .fai index already exists override it if {@code true}; otherwise, throws a {@link SAMException}.
+ *
+ * @throws SAMException if the fai file already exists or the file is malformed.
+ * @throws IOException if an IO error occurs.
+ */
+ public static void create(final Path fastaFile, final boolean overwrite) throws IOException {
+ // get the index to write the file in
+ final Path indexFile = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile);
+ if (!overwrite && Files.exists(indexFile)) {
+ // throw an exception if the file already exists
+ throw new SAMException("Index file " + indexFile + " already exists for " + fastaFile);
+ }
+ // build the index
+ final FastaSequenceIndex index = buildFromFasta(fastaFile);
+ index.write(indexFile);
+ }
+
+ /**
+ * Builds a FastaSequenceIndex on the fly from a FASTA file.
+ *
+ * Note: this also alows to create an index for a compressed file, but does not generate the
@tfenne

tfenne Apr 5, 2017

Owner

I tend to think that if we're not going to create the .gzi file too, that this method should probably fail with an exception on a compressed fasta file. I can easily imagine scenarios where code uses this to build a .fai, then turns around to try and query the fasta file, only to find that it cannot because the fasta file is compressed.

@magicDGS

magicDGS Apr 5, 2017

Contributor

I added this note because it is a difference with respect to the samtools faidx command.

I can also imagine scenarios where where a compressed fasta file is already indexed with bgzip (.gzi index), and the only requirement is to create the .fai index. Anyway, if there is a way to create with htsjdk a .gzi index I can integrate it here in the case of block-compressed input.

+ * .gzi index required for use it with samtools.
+ *
+ * @param fastaFile the FASTA file.
+ *
+ * @return a fai index.
+ *
+ * @throws SAMException for formatting errors.
+ * @throws IOException if an IO error occurs.
+ */
+ public static FastaSequenceIndex buildFromFasta(final Path fastaFile) throws IOException {
+ try(final AsciiLineReader in = new AsciiLineReader(IOUtil.openFileForReading(fastaFile))) {
+
+ // sanity check reference format:
+ // 1. Non-empty file
+ // 2. Header name starts with >
+ String previous = in.readLine();
+ if (previous == null) {
+ throw new SAMException("Cannot index empty file: " + fastaFile);
+ } else if (previous.charAt(0) != '>') {
+ throw new SAMException("Wrong sequence header: " + previous);
+ }
+
+ // initialize the sequence index
+ int sequenceIndex = -1;
+ // the location should be kept before iterating over the rest of the lines
+ long location = in.getPosition();
+
+ // initialize an empty index and the entry builder to null
+ final FastaSequenceIndex index = new FastaSequenceIndex();
+ FaiEntryBuilder entry = null;
+
+ // read the lines two by two
+ for (String line = in.readLine(); previous != null; line = in.readLine()) {
+ // in this case, the previous line contains a header and the current line the first sequence
+ if (previous.charAt(0) == '>') {
+ // first entry should be skipped; otherwise it should be added to the index
+ if (entry != null) index.add(entry.build());
+ // creates a new entry (and update sequence index)
+ entry = new FaiEntryBuilder(sequenceIndex++, previous, line, in.getLineTerminatorLength(), location);
+ } else if (line != null && line.charAt(0) == '>') {
+ // update the location, next iteration the sequence will be handled
+ location = in.getPosition();
+ } else if (line != null && !line.isEmpty()) {
+ // update in case it is not a blank-line
+ entry.updateWithSequence(line, in.getLineTerminatorLength());
+ }
+ // set the previous to the current line
+ previous = line;
+ }
+ // add the last entry
+ index.add(entry.build());
+
+ // and return the index
+ return index;
+ }
+ }
+
+ // utility class for building the FastaSequenceIndexEntry
+ private static class FaiEntryBuilder {
+ private final int index;
+ private final String contig;
+ private final long location;
+ // the bytes per line is the bases per line plus the length of the end of the line
+ private final int basesPerLine;
+ private final int endOfLineLength;
+
+ // the size is updated for each line in the input using updateWithSequence
+ private long size;
+ // flag to check if the supposedly last line was already reached
+ private boolean lessBasesFound;
+
+ private FaiEntryBuilder(final int index, final String header, final String firstSequenceLine, final int endOfLineLength, final long location) {
+ if (header == null || header.charAt(0) != '>') {
+ throw new SAMException("Wrong sequence header: " + header);
+ } else if (firstSequenceLine == null) {
+ throw new SAMException("Empty sequences could not be indexed");
+ }
+ this.index = index;
+ // parse the contig name (without the starting '>' and truncating white-spaces)
+ this.contig = SAMSequenceRecord.truncateSequenceName(header.substring(1).trim());
+ this.location = location;
+ this.basesPerLine = firstSequenceLine.length();
+ this.endOfLineLength = endOfLineLength;
+ this.size = firstSequenceLine.length();
+ this.lessBasesFound = false;
+ }
+
+ private void updateWithSequence(final String sequence, final int endOfLineLength) {
+ if (this.endOfLineLength != endOfLineLength) {
+ throw new SAMException(String.format("Different end of line for the same sequence was found."));
+ }
+ if (sequence.length() > basesPerLine) {
+ throw new SAMException(String.format("Sequence line for {} was longer than the expected length ({}): {}",
+ contig, basesPerLine, sequence));
+ } else if (sequence.length() < basesPerLine) {
+ if (lessBasesFound) {
+ throw new SAMException(String.format("Only last line could have less than {} bases for '{}' sequence, but at least two are different. Last sequence line: {}",
+ basesPerLine, contig, sequence));
+ }
+ lessBasesFound = true;
+ }
+ // update size
+ this.size += sequence.length();
+ }
+
+ private FastaSequenceIndexEntry build() {
+ return new FastaSequenceIndexEntry(contig, location, size, basesPerLine, basesPerLine + endOfLineLength, index);
+ }
+ }
+}
@@ -136,18 +136,14 @@ public static boolean canCreateIndexedFastaReader(final File fastaFile) {
}
private static Path findFastaIndex(Path fastaFile) {
- Path indexFile = getFastaIndexFileName(fastaFile);
+ Path indexFile = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile);
if (!Files.exists(indexFile)) return null;
return indexFile;
}
- private static Path getFastaIndexFileName(Path fastaFile) {
- return fastaFile.resolveSibling(fastaFile.getFileName() + ".fai");
- }
-
private static Path findRequiredFastaIndexFile(Path fastaFile) throws FileNotFoundException {
Path ret = findFastaIndex(fastaFile);
- if (ret == null) throw new FileNotFoundException(getFastaIndexFileName(fastaFile) + " not found.");
+ if (ret == null) throw new FileNotFoundException(ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile) + " not found.");
return ret;
}
@@ -163,4 +163,13 @@ public static String getFastaExtension(final Path path) {
.orElseGet(() -> {throw new IllegalArgumentException("File is not a supported reference file type: " + path.toAbsolutePath());});
}
+ /**
+ * Returns the index name for a FASTA file.
+ *
+ * @param fastaFile the reference sequence file path.
+ */
+ public static Path getFastaIndexFileName(Path fastaFile) {
+ return fastaFile.resolveSibling(fastaFile.getFileName() + ".fai");
+ }
+
}
@@ -0,0 +1,90 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2017 Daniel Gomez-Sanchez
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package htsjdk.samtools.reference;
+
+import htsjdk.HtsjdkTest;
+import htsjdk.samtools.util.IOUtil;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * @author Daniel Gomez-Sanchez (magicDGS)
+ */
+public class FastaSequenceIndexCreatorTest extends HtsjdkTest {
+ private static File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/reference");
+
+
+ @DataProvider(name = "indexedSequences")
+ public Object[][] getIndexedSequences() {
+ return new Object[][]{
+ {new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta")},
+ {new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta.gz")},
+ {new File(TEST_DATA_DIR, "header_with_white_space.fasta")},
+ {new File(TEST_DATA_DIR, "crlf.fasta")}
+ };
+ }
+
+ @Test(dataProvider = "indexedSequences")
+ public void testBuildFromFasta(final File indexedFile) throws Exception {
+ final FastaSequenceIndex original = new FastaSequenceIndex(new File(indexedFile.getAbsolutePath() + ".fai"));
+ final FastaSequenceIndex build = FastaSequenceIndexCreator.buildFromFasta(indexedFile.toPath());
+ Assert.assertEquals(original, build);
+ }
+
+ @Test(dataProvider = "indexedSequences")
+ public void testCreate(final File indexedFile) throws Exception {
+ // copy the file to index
+ final File tempDir = IOUtil.createTempDir("FastaSequenceIndexCreatorTest", "testCreate");
+ final File copied = new File(tempDir, indexedFile.getName());
+ copied.deleteOnExit();
+ Files.copy(indexedFile.toPath(), copied.toPath());
+
+ // create the index for the copied file
+ FastaSequenceIndexCreator.create(copied.toPath(), false);
+
+ // test if the expected .fai and the created one are the same
+ final File expectedFai = new File(indexedFile.getAbsolutePath() + ".fai");
+ final File createdFai = new File(copied.getAbsolutePath() + ".fai");
+
+ // read all the files and compare line by line
+ try(final Stream<String> expected = Files.lines(expectedFai.toPath());
+ final Stream<String> created = Files.lines(createdFai.toPath())) {
+ final List<String> expectedLines = expected.filter(String::isEmpty).collect(Collectors.toList());
+ final List<String> createdLines = created.filter(String::isEmpty).collect(Collectors.toList());
+ Assert.assertEquals(expectedLines, createdLines);
+ }
+
+ // load the tmp index and check that both are the same
+ Assert.assertEquals(new FastaSequenceIndex(createdFai), new FastaSequenceIndex(expectedFai));
+ }
+
+}
Oops, something went wrong.