-
Notifications
You must be signed in to change notification settings - Fork 161
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add HDFS support to torch_tb_profiler (#793)
Summary: This is to complete #766 **what** Extend torch_tb_profiler with Hadoop file system, to allow tensorboard to read pytorch profiling result stored on HDFS. The implementation leverages `fsspec` (and pyarrow under the hood) to interact with HDFS. It works with various hdfs and hadoop setup as long as HADOOP_HOME and hadoop lib & bin are correctly configured. **testing done** tested with HDFS installed in my local linux box and also a deployed remote hadoop cluster. Pull Request resolved: #793 Reviewed By: chaekit Differential Revision: D48039682 Pulled By: aaronenyeshi fbshipit-source-id: 8eb80f85c887934bd023d5dce96cb80358254a98
- Loading branch information
1 parent
170d45a
commit 8d4234c
Showing
4 changed files
with
86 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import os | ||
|
||
import fsspec | ||
from fsspec.implementations import arrow | ||
|
||
from .. import utils | ||
from .base import BaseFileSystem, RemotePath, StatData | ||
from .utils import as_bytes, as_text, parse_blob_url | ||
|
||
# Module-level logger shared by this file, obtained via the package's logging helper.
logger = utils.get_logger()
|
||
class HadoopFileSystem(RemotePath, BaseFileSystem):
    """HDFS backend built on fsspec's ``hdfs`` protocol (pyarrow under the hood).

    Requires a working Hadoop client configuration (HADOOP_HOME, hadoop
    lib/bin on the path) for pyarrow to reach the cluster.
    """

    def __init__(self) -> None:
        super().__init__()

    def get_fs(self) -> arrow.HadoopFileSystem:
        # fsspec caches filesystem instances per protocol/options, so calling
        # this on every operation is cheap.
        return fsspec.filesystem("hdfs")

    def exists(self, filename):
        """Return True if *filename* exists on HDFS."""
        return self.get_fs().exists(filename)

    def read(self, filename, binary_mode=False, size=None, continue_from=None):
        """Read up to *size* bytes/chars from *filename*.

        *continue_from* may carry an ``opaque_offset`` from a previous call's
        continuation token; reading resumes from that position. Returns a
        ``(data, continuation_token)`` pair.
        """
        if binary_mode:
            mode, encoding = "rb", None
        else:
            mode, encoding = "r", "utf8"
        offset = None if continue_from is None else continue_from.get("opaque_offset", None)
        filesystem = self.get_fs()
        with filesystem.open(path=filename, mode=mode, encoding=encoding) as stream:
            if offset is not None:
                stream.seek(offset)
            data = stream.read(size)
            # tell() after the read gives the resume point for the next call.
            token = {"opaque_offset": stream.tell()}
            return (data, token)

    def write(self, filename, file_content, binary_mode=False):
        """Write *file_content* to *filename*, replacing existing content."""
        filesystem = self.get_fs()
        if binary_mode:
            filesystem.write_bytes(filename, as_bytes(file_content))
        else:
            filesystem.write_text(filename, as_text(file_content), encoding="utf8")

    def glob(self, filename):
        """Expand the glob pattern *filename* against HDFS."""
        return self.get_fs().glob(filename)

    def isdir(self, dirname):
        """Return True if *dirname* is a directory on HDFS."""
        return self.get_fs().isdir(dirname)

    def listdir(self, dirname):
        """Return the entries of *dirname* as names relative to it."""
        filesystem = self.get_fs()
        entries = filesystem.listdir(dirname, detail=False)
        # strip the protocol from the root path because the path returned by
        # pyarrow listdir is not prefixed with the protocol.
        root = filesystem._strip_protocol(dirname)
        return [os.path.relpath(entry, root) for entry in entries]

    def makedirs(self, path):
        """Create *path* and any missing parents; no error if it exists."""
        return self.get_fs().makedirs(path, exist_ok=True)

    def stat(self, filename):
        """Return a StatData carrying the file's size in bytes."""
        info = self.get_fs().stat(filename)
        return StatData(info['size'])

    def support_append(self):
        # Appending to existing files is not supported by this backend.
        return False

    def download_file(self, file_to_download, file_to_save):
        """Recursively download *file_to_download* to local *file_to_save*."""
        return self.get_fs().download(file_to_download, file_to_save, recursive=True)