Skip to content

Commit

Permalink
Merge pull request #358 from python-adaptive/as-pandas-dataframe
Browse files Browse the repository at this point in the history
Add getting learner's data as pandas.DataFrame; add learner.to_dataframe method
  • Loading branch information
basnijholt committed Sep 19, 2022
2 parents c59405a + e091772 commit 21fb3b6
Show file tree
Hide file tree
Showing 16 changed files with 1,034 additions and 18 deletions.
92 changes: 89 additions & 3 deletions adaptive/learner/average_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,20 @@

from adaptive.learner.base_learner import BaseLearner
from adaptive.notebook_integration import ensure_holoviews
from adaptive.types import Float, Real
from adaptive.utils import cache_latest
from adaptive.types import Float, Int, Real
from adaptive.utils import (
assign_defaults,
cache_latest,
partial_function_from_dataframe,
)

try:
import pandas

with_pandas = True

except ModuleNotFoundError:
with_pandas = False


class AverageLearner(BaseLearner):
Expand Down Expand Up @@ -70,6 +82,80 @@ def to_numpy(self):
"""Data as NumPy array of size (npoints, 2) with seeds and values."""
return np.array(sorted(self.data.items()))

def to_dataframe(
    self,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    y_name: str = "y",
) -> pandas.DataFrame:
    """Export the learner's ``(seed, value)`` pairs as a `pandas.DataFrame`.

    Rows are ordered by seed. The names of the input and output columns
    are recorded in ``df.attrs["inputs"]`` and ``df.attrs["output"]``.

    Parameters
    ----------
    with_default_function_args : bool, optional
        Include the ``learner.function``'s default arguments as a
        column, by default True
    function_prefix : str, optional
        Prefix to the ``learner.function``'s default arguments' names,
        by default "function."
    seed_name : str, optional
        Name of the ``seed`` parameter, by default "seed"
    y_name : str, optional
        Name of the output value, by default "y"

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ImportError
        If `pandas` is not installed.
    """
    if not with_pandas:
        raise ImportError("pandas is not installed.")

    rows = sorted(self.data.items())
    frame = pandas.DataFrame(rows, columns=[seed_name, y_name])
    frame.attrs.update(inputs=[seed_name], output=y_name)

    if not with_default_function_args:
        return frame
    assign_defaults(self.function, frame, function_prefix)
    return frame

def load_dataframe(
    self,
    df: pandas.DataFrame,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    y_name: str = "y",
):
    """Feed the seeds and values stored in a `pandas.DataFrame` to the learner.

    The seed and output columns are passed to ``self.tell_many``. If
    ``with_default_function_args`` is True, then ``learner.function``'s
    default arguments are set (using `functools.partial`) from the values
    in the `pandas.DataFrame`.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to load.
    with_default_function_args : bool, optional
        The ``with_default_function_args`` used in ``to_dataframe()``,
        by default True
    function_prefix : str, optional
        The ``function_prefix`` used in ``to_dataframe``, by default "function."
    seed_name : str, optional
        The ``seed_name`` used in ``to_dataframe``, by default "seed"
    y_name : str, optional
        The ``y_name`` used in ``to_dataframe``, by default "y"
    """
    seeds = df[seed_name].values
    values = df[y_name].values
    self.tell_many(seeds, values)

    if not with_default_function_args:
        return
    self.function = partial_function_from_dataframe(
        self.function, df, function_prefix
    )

def ask(self, n: int, tell_pending: bool = True) -> tuple[list[int], list[Float]]:
points = list(range(self.n_requested, self.n_requested + n))

Expand All @@ -87,7 +173,7 @@ def ask(self, n: int, tell_pending: bool = True) -> tuple[list[int], list[Float]
self.tell_pending(p)
return points, loss_improvements

def tell(self, n: int, value: Real) -> None:
def tell(self, n: Int, value: Real) -> None:
if n in self.data:
# The point has already been added before.
return
Expand Down
123 changes: 120 additions & 3 deletions adaptive/learner/average_learner1D.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@

from adaptive.learner.learner1D import Learner1D, _get_intervals
from adaptive.notebook_integration import ensure_holoviews
from adaptive.types import Real
from adaptive.types import Int, Real
from adaptive.utils import assign_defaults, partial_function_from_dataframe

try:
import pandas

with_pandas = True

except ModuleNotFoundError:
with_pandas = False

Point = Tuple[int, Real]
Points = List[Point]
Expand Down Expand Up @@ -127,6 +136,112 @@ def min_samples_per_point(self) -> int:
return 0
return min(self._number_samples.values())

def to_numpy(self, mean: bool = False) -> np.ndarray:
    """Return the learner's data as a NumPy array.

    Parameters
    ----------
    mean : bool, optional
        If True, defer to the parent class's ``to_numpy``. Otherwise
        return one row per sample in ``self._data_samples``, laid out as
        ``(seed, x, *y)`` where ``y`` is flattened with `numpy.atleast_1d`.
        By default False.

    Returns
    -------
    numpy.ndarray
    """
    if mean:
        return super().to_numpy()

    rows = []
    for x, samples in self._data_samples.items():
        for seed, y in samples.items():
            rows.append((seed, x, *np.atleast_1d(y)))
    return np.array(rows)

def to_dataframe(
    self,
    mean: bool = False,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    x_name: str = "x",
    y_name: str = "y",
) -> pandas.DataFrame:
    """Return the data as a `pandas.DataFrame`.

    Parameters
    ----------
    mean : bool, optional
        If True, export one row per ``x`` using the values stored in
        ``self.data`` (columns ``[x_name, y_name]``). If False, export
        one row per sample from ``self._data_samples`` (columns
        ``[seed_name, x_name, y_name]``). By default False.
    with_default_function_args : bool, optional
        Include the ``learner.function``'s default arguments as a
        column, by default True
    function_prefix : str, optional
        Prefix to the ``learner.function``'s default arguments' names,
        by default "function."
    seed_name : str, optional
        Name of the ``seed`` parameter, by default "seed"
    x_name : str, optional
        Name of the ``x`` parameter, by default "x"
    y_name : str, optional
        Name of the output value, by default "y"

    Returns
    -------
    pandas.DataFrame
        ``df.attrs["inputs"]`` lists the input columns that are actually
        present in the frame and ``df.attrs["output"]`` is ``y_name``.

    Raises
    ------
    ImportError
        If `pandas` is not installed.
    """
    if not with_pandas:
        raise ImportError("pandas is not installed.")
    if mean:
        data = sorted(self.data.items())
        # There is no seed column in this layout, so it must not be
        # advertised as an input.
        inputs = [x_name]
    else:
        data = [
            (seed, x, y)
            for x, seed_y in sorted(self._data_samples.items())
            for seed, y in sorted(seed_y.items())
        ]
        inputs = [seed_name, x_name]
    df = pandas.DataFrame(data, columns=[*inputs, y_name])
    df.attrs["inputs"] = inputs
    df.attrs["output"] = y_name
    if with_default_function_args:
        assign_defaults(self.function, df, function_prefix)
    return df

def load_dataframe(
    self,
    df: pandas.DataFrame,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    x_name: str = "x",
    y_name: str = "y",
):
    """Feed the samples stored in a `pandas.DataFrame` to the learner.

    Each row is passed to ``self.tell_many`` as a ``(seed, x)`` point
    with its output value. If ``with_default_function_args`` is True, then
    ``learner.function``'s default arguments are set (using
    `functools.partial`) from the values in the `pandas.DataFrame`.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to load.
    with_default_function_args : bool, optional
        The ``with_default_function_args`` used in ``to_dataframe()``,
        by default True
    function_prefix : str, optional
        The ``function_prefix`` used in ``to_dataframe``, by default "function."
    seed_name : str, optional
        The ``seed_name`` used in ``to_dataframe``, by default "seed"
    x_name : str, optional
        The ``x_name`` used in ``to_dataframe``, by default "x"
    y_name : str, optional
        The ``y_name`` used in ``to_dataframe``, by default "y"
    """
    # We're pairing per-column Python lists instead of slicing
    # df[[seed_name, x_name]].values because the latter would turn the
    # seeds into floats.
    seeds = df[seed_name].values.tolist()
    xs = df[x_name].values.tolist()
    points = list(zip(seeds, xs))
    self.tell_many(points, df[y_name].values)

    if not with_default_function_args:
        return
    self.function = partial_function_from_dataframe(
        self.function, df, function_prefix
    )

def ask(self, n: int, tell_pending: bool = True) -> tuple[Points, list[float]]:
"""Return 'n' points that are expected to maximally reduce the loss."""
# If some point is undersampled, resample it
Expand Down Expand Up @@ -362,7 +477,9 @@ def _calc_error_in_mean(self, ys: Iterable[Real], y_avg: Real, n: int) -> float:
t_student = scipy.stats.t.ppf(1 - self.alpha, df=n - 1)
return t_student * (variance_in_mean / n) ** 0.5

def tell_many(self, xs: Points, ys: Sequence[Real]) -> None:
def tell_many(
self, xs: Points | np.ndarray, ys: Sequence[Real] | np.ndarray
) -> None:
# Check that all x are within the bounds
# TODO: remove this requirement, all other learners add the data
# but ignore it going forward.
Expand All @@ -373,7 +490,7 @@ def tell_many(self, xs: Points, ys: Sequence[Real]) -> None:
)

# Create a mapping of points to a list of samples
mapping: DefaultDict[Real, DefaultDict[int, Real]] = defaultdict(
mapping: DefaultDict[Real, DefaultDict[Int, Real]] = defaultdict(
lambda: defaultdict(dict)
)
for (seed, x), y in zip(xs, ys):
Expand Down
57 changes: 57 additions & 0 deletions adaptive/learner/balancing_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
from adaptive.notebook_integration import ensure_holoviews
from adaptive.utils import cache_latest, named_product, restore

try:
import pandas

with_pandas = True

except ModuleNotFoundError:
with_pandas = False


def dispatch(child_functions, arg):
index, x = arg
Expand Down Expand Up @@ -381,6 +389,55 @@ def from_product(cls, f, learner_type, learner_kwargs, combos):
learners.append(learner)
return cls(learners, cdims=arguments)

def to_dataframe(self, index_name: str = "learner_index", **kwargs):
    """Concatenate the child learners' data into one `pandas.DataFrame`.

    Every child's frame gets a leading column holding the position of
    that child in ``self.learners``; the frames are then stacked
    row-wise with a fresh index.

    Parameters
    ----------
    index_name : str, optional
        The name of the index column indicating the learner index,
        by default "learner_index".
    **kwargs : dict
        Keyword arguments passed to each ``child_learner.to_dataframe(**kwargs)``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ImportError
        If `pandas` is not installed.
    """
    if not with_pandas:
        raise ImportError("pandas is not installed.")

    frames = []
    for position, child in enumerate(self.learners):
        child_df = child.to_dataframe(**kwargs)
        # Capture the column order before the index column is added so
        # that it can be placed first.
        ordered = [index_name, *child_df.columns]
        child_df[index_name] = position
        frames.append(child_df[ordered])
    return pandas.concat(frames, axis=0, ignore_index=True)

def load_dataframe(
    self, df: pandas.DataFrame, index_name: str = "learner_index", **kwargs
):
    """Distribute the rows of a `pandas.DataFrame` over the child learners.

    The frame is grouped by ``index_name``; each group is handed to the
    child learner at that position in ``self.learners``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with the data to load.
    index_name : str, optional
        The ``index_name`` used in `to_dataframe`, by default "learner_index".
    **kwargs : dict
        Keyword arguments passed to each ``child_learner.load_dataframe(**kwargs)``.
    """
    grouped = df.groupby(index_name)
    for learner_idx, group in grouped:
        child = self.learners[learner_idx]
        child.load_dataframe(group, **kwargs)

def save(self, fname, compress=True):
"""Save the data of the child learners into pickle files
in a directory.
Expand Down

0 comments on commit 21fb3b6

Please sign in to comment.