diff --git a/patsy/desc.py b/patsy/desc.py index 86f7bca..f711124 100644 --- a/patsy/desc.py +++ b/patsy/desc.py @@ -7,10 +7,10 @@ # a formula parse tree (from patsy.parse_formula) into a ModelDesc. from patsy import PatsyError -from patsy.parse_formula import ParseNode, Token, parse_formula +from patsy.parse_formula import ParseNode, parse_formula from patsy.eval import EvalEnvironment, EvalFactor -from patsy.util import uniqueify_list -from patsy.util import repr_pretty_delegate, repr_pretty_impl +from patsy.util import (is_valid_python_varname, uniqueify_list, + repr_pretty_delegate, repr_pretty_impl) # These are made available in the patsy.* namespace __all__ = ["Term", "ModelDesc", "INTERCEPT"] @@ -149,7 +149,8 @@ def term_code(term): return result @classmethod - def from_formula(cls, tree_or_string, factor_eval_env): + def from_formula(cls, tree_or_string, factor_eval_env, + data_iter_maker=(lambda: [{}])): """Construct a :class:`ModelDesc` from a formula string. :arg tree_or_string: A formula string. (Or an unevaluated formula @@ -158,6 +159,12 @@ def from_formula(cls, tree_or_string, factor_eval_env): :arg factor_eval_env: A :class:`EvalEnvironment`, to be used for constructing :class:`EvalFactor` objects while parsing this formula. + :arg data_iter_maker: A zero-argument callable which returns an iterator + over dict-like data objects. This must be a callable rather than a + simple iterator because sufficiently complex formulas may require + multiple passes over the data (e.g. if there are nested stateful + transforms). Only required if ``.`` is used in your formula string + to indicate "all unused variables." :returns: A new :class:`ModelDesc`. """ if isinstance(tree_or_string, ParseNode): @@ -165,7 +172,8 @@ def from_formula(cls, tree_or_string, factor_eval_env): else: tree = parse_formula(tree_or_string) factor_eval_env.add_outer_namespace(_builtins_dict) - value = Evaluator(factor_eval_env).eval(tree, require_evalexpr=False) + value = Evaluator(factor_eval_env, data_iter_maker).eval( + tree, require_evalexpr=False) assert isinstance(value, cls) return value @@ -242,7 +250,6 @@ def _eval_binary_plus(evaluator, tree): left_expr.intercept_removed, left_expr.terms + right_expr.terms) - def _eval_binary_minus(evaluator, tree): left_expr = evaluator.eval(tree.args[0]) if tree.args[1].type == "ZERO": @@ -355,12 +362,46 @@ def _eval_number(evaluator, tree): def _eval_python_expr(evaluator, tree): factor = EvalFactor(tree.token.extra, evaluator._factor_eval_env, origin=tree.origin) + evaluator._evaluated_exprs.add(tree.token.extra) return IntermediateExpr(False, None, False, [Term([factor])]) +def _generate_data_codes(data_iter_maker): + for data in data_iter_maker(): + for name in data: + if is_valid_python_varname(name): + code = name + elif isinstance(name, str): + code = "Q('%s')" % name.encode("string_escape") + else: + # possibly should silently ignore non-string names instead + code = "Q(%s)" % name + yield code + +def test__generate_data_codes(): + assert (list(_generate_data_codes(lambda: [['a', 'b.c', 1]])) + == ["a", "Q('b.c')", "Q(1)"]) + +def _eval_dot(evaluator, tree): + codes = list(_generate_data_codes(evaluator.data_iter_maker)) + if not codes: + raise PatsyError("Formulas only support '.' if supplied with data", + tree) + terms = [] + for code in codes: + if code not in evaluator._evaluated_exprs: + factor = EvalFactor(code, evaluator._factor_eval_env, + origin=tree.origin) + terms.append(Term([factor])) + return IntermediateExpr(False, None, False, terms) + class Evaluator(object): - def __init__(self, factor_eval_env): + def __init__(self, factor_eval_env, data_iter_maker=(lambda: [{}])): self._evaluators = {} self._factor_eval_env = factor_eval_env + + self.data_iter_maker = data_iter_maker + self._evaluated_exprs = set() + self.add_op("~", 2, _eval_any_tilde) self.add_op("~", 1, _eval_any_tilde) @@ -378,6 +419,7 @@ def __init__(self, factor_eval_env): self.add_op("ONE", 0, _eval_one) self.add_op("NUMBER", 0, _eval_number) self.add_op("PYTHON_EXPR", 0, _eval_python_expr) + self.add_op("DOT", 0, _eval_dot) # Not used by Patsy -- provided for the convenience of eventual # user-defined operators. @@ -447,6 +489,19 @@ def eval(self, tree, require_evalexpr=True): "a + np.log(a, base=10)": (True, ["a", "np.log(a, base=10)"]), # Note different spacing: "a + np.log(a, base=10) - np . log(a , base = 10)": (True, ["a"]), + + ".": (True, ["a", "b", "c"]), + "a + .": (True, ["a", "b", "c"]), + "a + b + c + .": (True, ["a", "b", "c"]), + "I(a) + .": (True, ["I(a)", "a", "b", "c"]), + "a:.": (True, [("a", "b"), ("a", "c")]), + "a*.": (True, ["a", "b", "c", ("a", "b"), ("a", "c")]), + ". - c": (True, ["a", "b"]), + ". - (. - a)": (True, ["a"]), + ". + a:.": (True, ["a", "b", "c", ("a", "b"), ("a", "c")]), + "a:. + .": (True, [("a", "b"), ("a", "c"), "b", "c"]), + "a ~ .": (False, ["a"], True, ["b", "c"]), + ". ~ b + c": (False, ["a", "b", "c"], True, ["b", "c"]), "a + (I(b) + c)": (True, ["a", "I(b)", "c"]), "a + I(b + c)": (True, ["a", "I(b + c)"]), @@ -580,6 +635,8 @@ def eval(self, tree, require_evalexpr=True): "<- a>", "a + <-a**2>", + + "a + <.>", ] def _assert_terms_match(terms, expected_intercept, expecteds, eval_env): # pragma: no cover @@ -600,7 +657,9 @@ def _do_eval_formula_tests(tests): # pragma: no cover if len(result) == 2: result = (False, []) + result eval_env = EvalEnvironment.capture(0) - model_desc = ModelDesc.from_formula(code, eval_env) + def data_iter_maker(): + return [['a', 'b', 'c']] + model_desc = ModelDesc.from_formula(code, eval_env, data_iter_maker) print repr(code) print result print model_desc diff --git a/patsy/highlevel.py b/patsy/highlevel.py index 2d70179..b5e7dfa 100644 --- a/patsy/highlevel.py +++ b/patsy/highlevel.py @@ -48,7 +48,8 @@ def _try_incr_builders(formula_like, data_iter_maker, eval_env, # fallthrough if isinstance(formula_like, basestring): assert isinstance(eval_env, EvalEnvironment) - formula_like = ModelDesc.from_formula(formula_like, eval_env) + formula_like = ModelDesc.from_formula(formula_like, eval_env, + data_iter_maker) # fallthrough if isinstance(formula_like, ModelDesc): return design_matrix_builders([formula_like.lhs_termlist, diff --git a/patsy/parse_formula.py b/patsy/parse_formula.py index 8806ad2..35aee83 100644 --- a/patsy/parse_formula.py +++ b/patsy/parse_formula.py @@ -19,7 +19,7 @@ from patsy.tokens import python_tokenize, pretty_untokenize from patsy.util import PushbackAdapter -_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"] +_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "DOT", "NUMBER"] def _is_a(f, v): try: @@ -58,6 +58,8 @@ def _read_python_expr(it, end_tokens): token_type = "ZERO" elif expr_text == "1": token_type = "ONE" + elif expr_text == ".": + token_type = "DOT" elif _is_a(int, expr_text) or _is_a(float, expr_text): token_type = "NUMBER" else: diff --git a/patsy/test_highlevel.py b/patsy/test_highlevel.py index cb4a1e5..ce01e27 100644 --- a/patsy/test_highlevel.py +++ b/patsy/test_highlevel.py @@ -257,6 +257,12 @@ def __patsy_get_model_desc__(self, data): True, [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"]) + # . in formulas + t("y + .", {"y": [1, 2], "x": [3, 4]}, 0, + True, + [[1, 1, 3], [1, 2, 4]], ["Intercept", "y", "x"]) + t_invalid('~ .', {}, 0) + # ModelDesc desc = ModelDesc([], [Term([LookupFactor("x")])]) t(desc, {"x": [1.5, 2.5, 3.5]}, 0, diff --git a/patsy/util.py b/patsy/util.py index 8b4e82e..f6bc635 100644 --- a/patsy/util.py +++ b/patsy/util.py @@ -4,13 +4,13 @@ # Some generic utilities. -__all__ = ["atleast_2d_column_default", "uniqueify_list", - "widest_float", "widest_complex", "wide_dtype_for", "widen", - "repr_pretty_delegate", "repr_pretty_impl", - "SortAnythingKey", "safe_scalar_isnan", "safe_isnan", - "iterable", +__all__ = ["atleast_2d_column_default", "is_valid_python_varname", + "uniqueify_list", "widest_float", "widest_complex", "wide_dtype_for", + "widen", "repr_pretty_delegate", "repr_pretty_impl", + "SortAnythingKey", "safe_scalar_isnan", "safe_isnan", "iterable", ] +import re import sys import numpy as np from cStringIO import StringIO @@ -185,6 +185,21 @@ def test_atleast_2d_column_default(): finally: have_pandas = had_pandas +def is_valid_python_varname(name): + # see: http://stackoverflow.com/a/10134719/809705 + return (isinstance(name, str) and + re.match(r"^[^\d\W]\w*\Z", name) is not None) + +def test_is_valid_python_varname(): + tests = {"a": True, + "a1": True, + "_a1": True, + "1a": False, + "a.b": False, + 1: False} + for k, v in tests.iteritems(): + assert is_valid_python_varname(k) == v + # A version of .reshape() that knows how to down-convert a 1-column # pandas.DataFrame into a pandas.Series. Useful for code that wants to be # agnostic between 1d and 2d data, with the pattern: