Methods for default sequence dictionary name #774

Merged
merged 3 commits into from Dec 28, 2016
Jump to file or symbol
Failed to load files and symbols.
+71 −36
Split
@@ -93,29 +93,16 @@ protected static Path findSequenceDictionary(final Path path) {
if (path == null) {
return null;
}
- // Try and locate the dictionary
- Path dictionary = path.toAbsolutePath();
- Path dictionaryExt = path.toAbsolutePath();
- boolean fileTypeSupported = false;
- for (final String extension : ReferenceSequenceFileFactory.FASTA_EXTENSIONS) {
- String filename = dictionary.getFileName().toString();
- if (filename.endsWith(extension)) {
- dictionaryExt = dictionary.resolveSibling(filename + IOUtil
- .DICT_FILE_EXTENSION);
- String filenameNoExt = filename.substring(0, filename.lastIndexOf(extension));
- dictionary = dictionary.resolveSibling(filenameNoExt+ IOUtil.DICT_FILE_EXTENSION);
- fileTypeSupported = true;
- break;
- }
- }
- if (!fileTypeSupported)
- throw new IllegalArgumentException("File is not a supported reference file type: " + path.toAbsolutePath());
-
- if (Files.exists(dictionary))
+ // First try the default dictionary name (FASTA extension replaced by the dictionary extension)
+ final Path dictionary = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(path);
+ if (Files.exists(dictionary)) {
return dictionary;
+ }
// try without removing the file extension
- if (Files.exists(dictionaryExt))
+ final Path dictionaryExt = path.resolveSibling(path.getFileName().toString() + IOUtil.DICT_FILE_EXTENSION);
+ if (Files.exists(dictionaryExt)) {
return dictionaryExt;
+ }
else return null;
}
@@ -24,6 +24,8 @@
package htsjdk.samtools.reference;
+import htsjdk.samtools.util.IOUtil;
+
import java.io.File;
import java.io.FileNotFoundException;
import java.nio.file.Path;
@@ -113,24 +115,52 @@ public static ReferenceSequenceFile getReferenceSequenceFile(final Path path, fi
* @param preferIndexed if true attempt to return an indexed reader that supports non-linear traversal, else return the non-indexed reader
*/
public static ReferenceSequenceFile getReferenceSequenceFile(final Path path, final boolean truncateNamesAtWhitespace, final boolean preferIndexed) {
- final String name = path.getFileName().toString();
- for (final String ext : FASTA_EXTENSIONS) {
- if (name.endsWith(ext)) {
- // Using faidx requires truncateNamesAtWhitespace
- if (truncateNamesAtWhitespace && preferIndexed && IndexedFastaSequenceFile.canCreateIndexedFastaReader(path)) {
- try {
- return new IndexedFastaSequenceFile(path);
- }
- catch (final FileNotFoundException e) {
- throw new IllegalStateException("Should never happen, because existence of files has been checked.", e);
- }
- }
- else {
- return new FastaSequenceFile(path, truncateNamesAtWhitespace);
- }
+ // this should throw an exception if the fasta file is not supported
+ getFastaExtension(path);
+ // Using faidx requires truncateNamesAtWhitespace
+ if (truncateNamesAtWhitespace && preferIndexed && IndexedFastaSequenceFile.canCreateIndexedFastaReader(path)) {
+ try {
+ return new IndexedFastaSequenceFile(path);
+ }
+ catch (final FileNotFoundException e) {
+ throw new IllegalStateException("Should never happen, because existence of files has been checked.", e);
}
+ } else {
+ return new FastaSequenceFile(path, truncateNamesAtWhitespace);
}
+ }
- throw new IllegalArgumentException("File is not a supported reference file type: " + path.toAbsolutePath());
+ /**
+ * Returns the default dictionary name for a FASTA file.
+ *
+ * @param file the reference sequence file on disk.
+ */
+ public static File getDefaultDictionaryForReferenceSequence(final File file) {
+ return getDefaultDictionaryForReferenceSequence(file.toPath()).toFile();
}
+
+ /**
+ * Returns the default dictionary name for a FASTA file.
+ *
+ * @param path the reference sequence file path.
+ */
+ public static Path getDefaultDictionaryForReferenceSequence(final Path path) {
+ final String name = path.getFileName().toString();
+ final int extensionIndex = name.length() - getFastaExtension(path).length();
+ return path.resolveSibling(name.substring(0, extensionIndex) + IOUtil.DICT_FILE_EXTENSION);
+ }
+
+ /**
+ * Returns the FASTA extension for the path.
+ *
+ * @param path the reference sequence file path.
+ * @return the first entry of FASTA_EXTENSIONS that the file name ends with.
+ * @throws IllegalArgumentException if the file is not a supported reference file.
+ */
+ public static String getFastaExtension(final Path path) {
@yfarjoun

yfarjoun Dec 16, 2016

Contributor

Path is supposed to allow URLs right? signed URLs have keys after the name of the file....so something like

http://my.web.site/file.fasta?key=abcd12345

should we start thinking about these kinds of URLs, or is it hopeless? we have similar problems with bam/vcfs and their index files...

@magicDGS

magicDGS Dec 27, 2016

Contributor

I don't know about this, maybe it is better to discuss it in #724...

+ final String name = path.getFileName().toString();
+ return FASTA_EXTENSIONS.stream().filter(name::endsWith).findFirst()
+ .orElseThrow(() -> new IllegalArgumentException("File is not a supported reference file type: " + path.toAbsolutePath()));
+ }
+
}
@@ -1,6 +1,7 @@
package htsjdk.samtools.reference;
import org.testng.Assert;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
@@ -36,4 +37,21 @@
Assert.assertTrue(f instanceof IndexedFastaSequenceFile, "Got non-indexed reader by default.");
}
+
+ @DataProvider
+ public Object[][] fastaNames() {
+ return new Object[][] {
+ {"break.fa", "break.dict"},
+ {"break.txt.txt", "break.txt.dict"},
+ {"break.fasta.fasta", "break.fasta.dict"},
+ {"break.fa.gz", "break.dict"},
+ {"break.txt.gz.txt.gz", "break.txt.gz.dict"},
+ {"break.fasta.gz.fasta.gz", "break.fasta.gz.dict"}
+ };
+ }
+
+ @Test(dataProvider = "fastaNames")
+ public void testGetDefaultDictionaryForReferenceSequence(final String fastaFile, final String expectedDict) throws Exception {
+ Assert.assertEquals(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(new File(fastaFile)), new File(expectedDict));
+ }
}