forked from intake/intake
/
utils.py
286 lines (217 loc) · 9.07 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# -----------------------------------------------------------------------------
# Copyright (c) 2012 - 2018, Anaconda, Inc. and Intake contributors
# All rights reserved.
#
# The full license is in the LICENSE file, distributed with this software.
# -----------------------------------------------------------------------------
import collections
import collections.abc
import datetime
import os
import sys
import warnings
from collections import OrderedDict
from contextlib import contextmanager
import yaml
def make_path_posix(path):
    """Make path generic"""
    # URLs (anything containing a protocol separator) pass through untouched.
    if "://" in path:
        return path
    # Normalise Windows separators, then collapse doubled slashes.
    forward = path.replace("\\", "/")
    return forward.replace("//", "/")
def no_duplicates_constructor(loader, node, deep=False):
    """Construct a YAML mapping, raising on duplicate keys.

    https://gist.github.com/pypt/94d747fe5180851196eb
    """
    seen = {}
    for key_node, value_node in node.value:
        key = loader.construct_object(key_node, deep=deep)
        value = loader.construct_object(value_node, deep=deep)
        if key in seen:
            # Imported lazily to avoid a hard dependency at module import time.
            from intake.catalog.exceptions import DuplicateKeyError

            raise DuplicateKeyError(
                "while constructing a mapping",
                node.start_mark,
                "found duplicate key (%s)" % key,
                key_node.start_mark,
            )
        seen[key] = value
    # Delegate the actual mapping construction to the loader once validated.
    return loader.construct_mapping(node, deep)
def tuple_constructor(loader, node, deep=False):
    """Construct a Python tuple from a YAML sequence node."""
    # Build each element via the loader so nested tags resolve normally.
    items = [loader.construct_object(child, deep=deep) for child in node.value]
    return tuple(items)
def represent_dictionary_order(self, dict_data):
    """Represent an OrderedDict as a plain YAML mapping, preserving key order."""
    pairs = dict_data.items()
    return self.represent_mapping("tag:yaml.org,2002:map", pairs)


# Emit OrderedDict instances as ordinary mappings when dumping YAML.
yaml.add_representer(OrderedDict, represent_dictionary_order)
@contextmanager
def no_duplicate_yaml():
    """Context in which yaml.SafeLoader rejects duplicate mapping keys.

    Also installs a constructor for the ``python/tuple`` tag.  On exit only
    the default mapping constructor is restored; the tuple constructor is
    left registered, matching the original behaviour.
    """
    add = yaml.SafeLoader.add_constructor
    add(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, no_duplicates_constructor)
    add("tag:yaml.org,2002:python/tuple", tuple_constructor)
    try:
        yield
    finally:
        # Restore stock mapping behaviour so other yaml users are unaffected.
        add(
            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
            yaml.constructor.SafeConstructor.construct_yaml_map,
        )
def yaml_load(stream):
    """Parse YAML in a context where duplicate keys raise exception"""
    with no_duplicate_yaml():
        result = yaml.safe_load(stream)
    return result
def classname(ob):
    """Get the object's class's name as package.module.Class"""
    import inspect

    # Accept either a class or an instance; resolve to the class itself.
    cls = ob if inspect.isclass(ob) else type(ob)
    return f"{cls.__module__}.{cls.__name__}"
class DictSerialiseMixin(object):
    """Mixin that captures constructor arguments so instances can be
    serialised to a plain dict (``__getstate__``) and rebuilt later
    (``__setstate__``).
    """

    # Cached dask token; computed lazily on first __dask_tokenize__ call.
    __tok_cache = None

    def __new__(cls, *args, **kwargs):
        """Capture creation args when instantiating"""
        obj = object.__new__(cls)
        obj._captured_init_args = args
        obj._captured_init_kwargs = kwargs
        return obj

    @property
    def classname(self):
        """Fully-qualified class name of this instance."""
        return classname(self)

    def __dask_tokenize__(self):
        if self.__tok_cache is None:
            from dask.base import tokenize

            self.__tok_cache = tokenize(self.__getstate__())
        return self.__tok_cache

    def __getstate__(self):
        def _state(value):
            # Recurse into nested serialisable objects.
            if isinstance(value, DictSerialiseMixin):
                return value.__getstate__()
            return value

        args = [_state(a) for a in self._captured_init_args]
        # OrderedDict is used deliberately: dask tokenizes a plain dict by
        # sorting its keys, and that sort dominates tokenize time even for
        # very small dicts; OrderedDict steers dask to a faster path.
        kwargs = collections.OrderedDict((k, _state(v)) for k, v in self._captured_init_kwargs.items())
        return collections.OrderedDict(cls=self.classname, args=args, kwargs=kwargs)

    def __setstate__(self, state):
        # reconstitute instances here
        self._captured_init_kwargs = state["kwargs"]
        self._captured_init_args = state["args"]
        state.pop("cls", None)
        self.__init__(*state["args"], **state["kwargs"])

    def __hash__(self):
        from dask.base import tokenize

        return int(tokenize(self), 16)

    def __eq__(self, other):
        # Equality is token equality: same class and captured arguments.
        return hash(self) == hash(other)
def remake_instance(data):
    """Rebuild an instance from a serialised spec.

    ``data`` is either a fully-qualified class name, or a dict with key
    "cls" and optional "args"/"kwargs" entries.
    """
    import importlib

    # Copy so the caller's dict is not mutated by the pop below.
    spec = {"cls": data} if isinstance(data, str) else data.copy()
    modname, clsname = spec.pop("cls").rsplit(".", 1)
    cls = getattr(importlib.import_module(modname), clsname)
    return cls(*spec.get("args", ()), **spec.get("kwargs", {}))
def pretty_describe(object, nestedness=0, indent=2):
    """Maintain dict ordering - but make string version prettier

    Parameters
    ----------
    object : any
        Value to render; dicts are expanded recursively, anything else is
        rendered with ``str``.
    nestedness : int
        Current depth, used to compute the leading indentation.
    indent : int
        Number of spaces per nesting level.

    Returns
    -------
    str
        Multi-line, indented description of ``object``.
    """
    if not isinstance(object, dict):
        return str(object)
    sep = f'\n{" " * nestedness * indent}'
    # Bug fix: propagate ``indent`` into the recursive call so a custom
    # indent width applies at every level, not just the outermost one.
    out = sep.join(f"{k}: {pretty_describe(v, nestedness + 1, indent)}" for k, v in object.items())
    if nestedness > 0 and out:
        return f"{sep}{out}"
    return out
def decode_datetime(obj):
    """Inverse of encode_datetime: turn a ``{"__datetime__": ...}`` mapping
    back into a ``datetime.datetime``.

    Any value that is not such a mapping is returned unchanged.
    """
    # Bug fix: only mappings can carry the marker.  Checking the type first
    # avoids TypeError on non-container values (ints, None) and accidental
    # matches on strings/lists that merely contain "__datetime__"; it also
    # subsumes the old numpy.ndarray special case, since an ndarray is not
    # a Mapping.
    if isinstance(obj, collections.abc.Mapping) and "__datetime__" in obj:
        try:
            obj = datetime.datetime.strptime(
                obj["as_str"],
                "%Y%m%dT%H:%M:%S.%f%z",
            )
        except ValueError:  # Perhaps lacking tz info
            obj = datetime.datetime.strptime(
                obj["as_str"],
                "%Y%m%dT%H:%M:%S.%f",
            )
    return obj
def encode_datetime(obj):
    """Encode a ``datetime.datetime`` as a serialisation-friendly marker dict.

    Non-datetime values pass through unchanged.
    """
    if not isinstance(obj, datetime.datetime):
        return obj
    return {"__datetime__": True, "as_str": obj.strftime("%Y%m%dT%H:%M:%S.%f%z")}
class RegistryView(collections.abc.Mapping):
    """
    Wrap registry dict in a read-only dict view.

    Subclasses define attributes filled into warning and error messages:

      - self._registry_name
      - self._register_func_name
      - self._unregister_func_name
    """

    def __init__(self, registry):
        self._registry = registry

    def __repr__(self):
        return f"{self.__class__.__name__}({self._registry!r})"

    def __getitem__(self, key):
        return self._registry[key]

    def __iter__(self):
        return iter(self._registry)

    def __len__(self):
        return len(self._registry)

    def _warn_mutation(self, func_name):
        # Single deprecation message shared by all mutating operations.
        warnings.warn(
            f"In a future release of intake, the {self._registry_name} will "
            f"not be directly mutable. Use {func_name}.",
            DeprecationWarning,
        )

    # Support the common mutation methods for now, but warn.
    def update(self, *args, **kwargs):
        self._warn_mutation(self._register_func_name)
        self._registry.update(*args, **kwargs)

    def __setitem__(self, key, value):
        self._warn_mutation(self._register_func_name)
        self._registry[key] = value

    def __delitem__(self, key):
        self._warn_mutation(self._unregister_func_name)
        del self._registry[key]
class DriverRegistryView(RegistryView):
    """Read-only view of the intake driver registry."""

    # These attributes are used by the base class
    # to fill in warning and error messages.
    _registry_name = "intake.registry"
    _register_func_name = "intake.register_driver"
    _unregister_func_name = "intake.unregister_driver"
class ContainerRegistryView(RegistryView):
    """Read-only view of the intake container registry."""

    # These attributes are used by the base class
    # to fill in warning and error messages.
    _registry_name = "intake.container_map"
    _register_func_name = "intake.register_container"
    _unregister_func_name = "intake.unregister_container"
class ModuleImporter:
    """Lazy module proxy: defers importing ``destination`` until an
    attribute is first requested, then forwards attribute access to the
    imported module.
    """

    def __init__(self, destination):
        # Dotted module path to import on first attribute access.
        self.destination = destination
        # The imported module object, or None until first access.
        self.module = None

    def __getattribute__(self, item):
        # Serve this instance's own attributes (destination/module) directly,
        # bypassing the proxying logic below.
        d = object.__getattribute__(self, "__dict__")
        if item in d:
            return d[item]
        if self.module is None:
            # NOTE(review): debug prints on every access look like leftovers;
            # consider the logging module instead.
            print("Importing module: ", self.destination)
            # NOTE(review): __import__ on a dotted name returns the TOP-LEVEL
            # package, not the leaf module -- confirm destination is never
            # dotted, or the getattr below may look up the wrong object.
            self.module = __import__(self.destination)
        else:
            print("Referencing module: ", self.destination)
            # Re-register the real module under its name on later accesses --
            # presumably to replace this proxy in sys.modules; verify against
            # the caller that installs the proxy.
            sys.modules[self.destination] = self.module
        return getattr(self.module, item)
def is_notebook() -> bool:
    """Check if code is running in a notebook

    Copied from tqdm.autonotebook

    Returns
    -------
    bool
        True if inside a notebook. False if not inside a notebook.
    """
    try:
        # Only consult IPython if it has already been imported.
        ipython = sys.modules["IPython"].get_ipython()
        # A plain IPython console has no kernel app in its config.
        if "IPKernelApp" not in ipython.config:  # pragma: no cover
            raise ImportError("console")
        # VS Code's interactive window is kernel-backed but not a notebook.
        if "VSCODE_PID" in os.environ:  # pragma: no cover
            raise ImportError("vscode")
    except Exception:
        return False
    return True