Support use of Path in ParsingUtils, SeekableStreamFactory, AbstractVCFCodec #724

Merged
merged 1 commit into from Jan 20, 2017
Jump to file or symbol
Failed to load files and symbols.
+125 −5
Split
View
@@ -42,6 +42,7 @@ dependencies {
compile "gov.nih.nlm.ncbi:ngs-java:1.2.4"
testCompile "org.testng:testng:6.9.9"
+ testCompile "com.google.jimfs:jimfs:1.1"
}
sourceCompatibility = 1.8
@@ -29,6 +29,7 @@
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.sra.SRAAccession;
+import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Lazy;
import htsjdk.samtools.util.RuntimeIOException;
@@ -366,8 +367,8 @@ public File asFile() {
@Override
public Path asPath() {
try {
- return Paths.get(urlResource.toURI());
- } catch (URISyntaxException | IllegalArgumentException |
+ return IOUtil.getPath(urlResource.toExternalForm());
+ } catch (IOException | IllegalArgumentException |
FileSystemNotFoundException | SecurityException e) {
return null;
}
@@ -23,6 +23,7 @@
*/
package htsjdk.samtools.seekablestream;
+import htsjdk.samtools.util.IOUtil;
import java.io.File;
import java.io.IOException;
import java.net.URL;
@@ -78,6 +79,8 @@ public SeekableStream getStreamFor(final String path) throws IOException {
return new SeekableFTPStream(new URL(path));
} else if (path.startsWith("file:")) {
return new SeekableFileStream(new File(new URL(path).getPath()));
+ } else if (IOUtil.hasScheme(path)) {
+ return new SeekablePathStream(IOUtil.getPath(path));
} else {
return new SeekableFileStream(new File(path));
}
@@ -48,14 +48,20 @@
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
+import java.nio.file.FileSystemNotFoundException;
+import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
@@ -943,4 +949,42 @@ public static String slurp(final InputStream is, final Charset charSet) {
return output;
}
+
+ /**
+ * Check if the given URI has a scheme.
+ *
+ * @param uriString the URI to check
+ * @return <code>true</code> if the given URI has a scheme, <code>false</code> if
+ * not, or if the URI is malformed.
+ */
+ public static boolean hasScheme(String uriString) {
+ try {
+ return new URI(uriString).getScheme() != null;
+ } catch (URISyntaxException e) {
+ return false;
+ }
+ }
+
+ /**
+ * Converts the given URI to a {@link Path} object. If the filesystem cannot be found in the usual way, then attempt
+ * to load the filesystem provider using the thread context classloader. This is needed when the filesystem
+ * provider is loaded using a URL classloader (e.g. in spark-submit).
+ *
+ * @param uriString the URI to convert
+ * @return the resulting {@code Path}
+ * @throws IOException an I/O error occurs creating the file system
+ */
+ public static Path getPath(String uriString) throws IOException {
@lbergelson

lbergelson Dec 16, 2016

Contributor

could you replace the use of Paths.get in SamInputResource.UrlInputResource with a call to this instead? Just in case...

@tomwhite

tomwhite Jan 17, 2017

Contributor

Done

+ URI uri = URI.create(uriString);
+ try {
+ // if the URI has no scheme, then treat as a local file, otherwise use the scheme to determine the filesystem to use
+ return uri.getScheme() == null ? Paths.get(uriString) : Paths.get(uri);
+ } catch (FileSystemNotFoundException e) {
+ ClassLoader cl = Thread.currentThread().getContextClassLoader();
+ if (cl == null) {
+ throw e;
+ }
+ return FileSystems.newFileSystem(uri, new HashMap<>(), cl).provider().getPath(uri);
@lbergelson

lbergelson Dec 16, 2016

Contributor

I think there's a bug here. I tested this code explicitly by including the gcs provider in the hellbender path and running the following tests:

    @Test
    public void testCompletePath() throws IOException {
        newFilesystem(URI.create("gs://hellbender/somefile.txt"));
    }

    @Test
    public void testBucketOnly() throws IOException {
        newFilesystem(URI.create("gs://hellbender"));
    }

    private void newFilesystem(URI uri) throws IOException {
        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        final FileSystem fileSystem = FileSystems.newFileSystem(uri, new HashMap<>(), cl);
    }

the first test fails with

java.lang.IllegalArgumentException: GCS FileSystem URIs mustn't have: port, userinfo, path, query, or fragment: gs://hellbender/somefile.txt

	at shaded.cloud-nio.com.google.common.base.Preconditions.checkArgument(Preconditions.java:146)
	at com.google.cloud.storage.contrib.nio.CloudStorageFileSystemProvider.newFileSystem(CloudStorageFileSystemProvider.java:192)
	at com.google.cloud.storage.contrib.nio.CloudStorageFileSystemProvider.newFileSystem(CloudStorageFileSystemProvider.java:83)
	at java.nio.file.FileSystems.newFileSystem(FileSystems.java:326)
	at htsjdk.tribble.util.ParsingUtilsTest.newFilesystem(ParsingUtilsTest.java:179)
	at htsjdk.tribble.util.ParsingUtilsTest.testCompletePath(ParsingUtilsTest.java:169)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.testng.internal.MethodInvocationHelper.invokeMethod(MethodInvocationHelper.java:85)
	at org.testng.internal.Invoker.invokeMethod(Invoker.java:639)
	at org.testng.internal.Invoker.invokeTestMethod(Invoker.java:816)
	at org.testng.internal.Invoker.invokeTestMethods(Invoker.java:1124)
	at org.testng.internal.TestMethodWorker.invokeTestMethods(TestMethodWorker.java:125)
	at org.testng.internal.TestMethodWorker.run(TestMethodWorker.java:108)
	at org.testng.TestRunner.privateRun(TestRunner.java:774)
	at org.testng.TestRunner.run(TestRunner.java:624)
	at org.testng.SuiteRunner.runTest(SuiteRunner.java:359)
	at org.testng.SuiteRunner.runSequentially(SuiteRunner.java:354)
	at org.testng.SuiteRunner.privateRun(SuiteRunner.java:312)
	at org.testng.SuiteRunner.run(SuiteRunner.java:261)
	at org.testng.SuiteRunnerWorker.runSuite(SuiteRunnerWorker.java:52)
	at org.testng.SuiteRunnerWorker.run(SuiteRunnerWorker.java:86)
	at org.testng.TestNG.runSuitesSequentially(TestNG.java:1215)
	at org.testng.TestNG.runSuitesLocally(TestNG.java:1140)
	at org.testng.TestNG.run(TestNG.java:1048)
	at org.testng.IDEARemoteTestNG.run(IDEARemoteTestNG.java:72)
	at org.testng.RemoteTestNGStarter.main(RemoteTestNGStarter.java:127)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)

The second passes. So we can't just use the URI that's passed in to open the new file system. I'm not sure if this is a bug in the implementation of the GCS fileSystemProvider or if it's going to be a consistent issue with multiple providers. The other providers that I know of don't offer consistent behavior here that I can tell. UnixFileSystem provider always throws because it doesn't want to be reinstantiated. The HDFS provider from hdfs.jsr203 seems to be more robust and just picks out the part of the URI that it needs.

@jean-philippe-martin Can you comment on this? It looks like a simple patch to the gcs provider would make it so that this approach would work.

@tomwhite Have you tested this running in spark? I don't see how it can work, but maybe I'm missing something important. I do see that our implementation of this method in gatk has a special case for gcs still.

@tomwhite

tomwhite Dec 19, 2016

Contributor

I did try running this on HDFS with Spark a while ago and it worked. There's a fix for the GCS issue from @jean-philippe-martin here: GoogleCloudPlatform/google-cloud-java#1470

@jean-philippe-martin

jean-philippe-martin Jan 18, 2017

Contributor

@lbergerson: yes this was a bug, it's fixed now.

@tomwhite: Does this mean that on Spark after someone calls getPath on a gs:// URI, the fs provider will be loaded and Path.get will work correctly? Because that would be sweet.

+ }
+ }
}
@@ -23,6 +23,7 @@
*/
package htsjdk.tribble.util;
+import htsjdk.samtools.util.IOUtil;
import java.awt.Color;
import java.io.File;
import java.io.FileInputStream;
@@ -31,6 +32,7 @@
import java.lang.reflect.Constructor;
import java.net.MalformedURLException;
import java.net.URL;
+import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -83,6 +85,8 @@ public static InputStream openInputStream(String path)
if (path.startsWith("http:") || path.startsWith("https:") || path.startsWith("ftp:")) {
inputStream = getURLHelper(new URL(path)).openInputStream();
+ } else if (IOUtil.hasScheme(path)) {
+ inputStream = Files.newInputStream(IOUtil.getPath(path));
} else {
File file = new File(path);
inputStream = new FileInputStream(file);
@@ -400,6 +404,8 @@ public static boolean resourceExists(String resource) throws IOException{
}
URLHelper helper = getURLHelper(url);
return helper.exists();
+ } else if (IOUtil.hasScheme(resource)) {
+ return Files.exists(IOUtil.getPath(resource));
} else {
return (new File(resource)).exists();
}
@@ -26,6 +26,7 @@
package htsjdk.variant.vcf;
import htsjdk.samtools.util.BlockCompressedInputStream;
+import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.NameAwareCodec;
@@ -45,6 +46,8 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -616,10 +619,11 @@ private static void parseSingleAltAllele(List<Allele> alleles, String alt, int l
public static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) {
try {
+ Path path = IOUtil.getPath(potentialInput);
//isVCFStream closes the stream that's passed in
- return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
- isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) ||
- isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
+ return isVCFStream(Files.newInputStream(path), MAGIC_HEADER_LINE) ||
+ isVCFStream(new GZIPInputStream(Files.newInputStream(path)), MAGIC_HEADER_LINE) ||
+ isVCFStream(new BlockCompressedInputStream(Files.newInputStream(path)), MAGIC_HEADER_LINE);
} catch ( FileNotFoundException e ) {
return false;
} catch ( IOException e ) {
@@ -1,6 +1,16 @@
package htsjdk.tribble.util;
+import com.google.common.jimfs.Configuration;
+import com.google.common.jimfs.Jimfs;
+import htsjdk.samtools.util.IOUtil;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.file.FileSystem;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.testng.Assert;
import org.testng.annotations.Test;
@@ -118,6 +128,37 @@ public void testSplitJoinEmptyFirst() {
}
@Test
+ public void testFileDoesExist() throws IOException{
+ File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp");
+ tempFile.deleteOnExit();
+ tstExists(tempFile.getAbsolutePath(), true);
+ tstExists(tempFile.toURI().toString(), true);
+ }
+
+ @Test
+ public void testFileDoesNotExist() throws IOException{
+ File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp");
+ tempFile.delete();
+ tstExists(tempFile.getAbsolutePath(), false);
+ tstExists(tempFile.toURI().toString(), false);
+ }
+
+ @Test
+ public void testInMemoryNioFileDoesExist() throws IOException{
+ FileSystem fs = Jimfs.newFileSystem(Configuration.unix());
+ Path file = fs.getPath("/file");
+ Files.createFile(file);
+ tstExists(file.toUri().toString(), true);
+ }
+
+ @Test
+ public void testInMemoryNioFileDoesNotExist() throws IOException{
+ FileSystem fs = Jimfs.newFileSystem(Configuration.unix());
+ Path file = fs.getPath("/file");
+ tstExists(file.toUri().toString(), false);
+ }
+
+ @Test
public void testFTPDoesExist() throws IOException{
tstExists(AVAILABLE_FTP_URL, true);
}
@@ -143,6 +184,26 @@ private void tstExists(String path, boolean expectExists) throws IOException{
}
@Test
@lbergelson

lbergelson Dec 16, 2016

Contributor

I think we need a way of testing non-file URI's. I'm not sure the best way to go about it. One possibility would be to include one of the alternative filesystem providers as a test dependency. Either that or maybe we could mock one. There is a [google in memory filesystem] that might make a good test for the non-file based file systems.

@lbergelson

lbergelson Dec 16, 2016

Contributor

@droazen What do you think about including a new test dependency for providing alternative paths. It might help us avoid problems like the index one we saw before.

@tomwhite

tomwhite Jan 17, 2017

Contributor

Good idea. I've added a test dependency on https://github.com/google/jimfs, and a few tests that use non-file URIs that exercise the NIO file path.

+ public void testFileOpenInputStream() throws IOException{
+ File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp");
+ tempFile.deleteOnExit();
+ OutputStream os = IOUtil.openFileForWriting(tempFile);
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(os));
+ writer.write("hello");
+ writer.close();
+ tstStream(tempFile.getAbsolutePath());
+ tstStream(tempFile.toURI().toString());
+ }
+
+ @Test
+ public void testInMemoryNioFileOpenInputStream() throws IOException{
+ FileSystem fs = Jimfs.newFileSystem(Configuration.unix());
+ Path file = fs.getPath("/file");
+ Files.write(file, "hello".getBytes("UTF-8"));
+ tstStream(file.toUri().toString());
+ }
+
+ @Test
public void testFTPOpenInputStream() throws IOException{
tstStream(AVAILABLE_FTP_URL);
}