Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: parse . in formulas #28

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
75 changes: 67 additions & 8 deletions patsy/desc.py
Expand Up @@ -7,10 +7,10 @@
# a formula parse tree (from patsy.parse_formula) into a ModelDesc.

from patsy import PatsyError
from patsy.parse_formula import ParseNode, Token, parse_formula
from patsy.parse_formula import ParseNode, parse_formula
from patsy.eval import EvalEnvironment, EvalFactor
from patsy.util import uniqueify_list
from patsy.util import repr_pretty_delegate, repr_pretty_impl
from patsy.util import (is_valid_python_varname, uniqueify_list,
repr_pretty_delegate, repr_pretty_impl)

# These are made available in the patsy.* namespace
__all__ = ["Term", "ModelDesc", "INTERCEPT"]
Expand Down Expand Up @@ -149,7 +149,8 @@ def term_code(term):
return result

@classmethod
def from_formula(cls, tree_or_string, factor_eval_env):
def from_formula(cls, tree_or_string, factor_eval_env,
data_iter_maker=(lambda: [{}])):
"""Construct a :class:`ModelDesc` from a formula string.

:arg tree_or_string: A formula string. (Or an unevaluated formula
Expand All @@ -158,14 +159,21 @@ def from_formula(cls, tree_or_string, factor_eval_env):
:arg factor_eval_env: A :class:`EvalEnvironment`, to be used for
constructing :class:`EvalFactor` objects while parsing this
formula.
:arg data_iter_maker: A zero-argument callable which returns an iterator
over dict-like data objects. This must be a callable rather than a
simple iterator because sufficiently complex formulas may require
multiple passes over the data (e.g. if there are nested stateful
transforms). Only required if ``.`` is used in your formula string
to indicate "all unused variables."
:returns: A new :class:`ModelDesc`.
"""
if isinstance(tree_or_string, ParseNode):
tree = tree_or_string
else:
tree = parse_formula(tree_or_string)
factor_eval_env.add_outer_namespace(_builtins_dict)
value = Evaluator(factor_eval_env).eval(tree, require_evalexpr=False)
value = Evaluator(factor_eval_env, data_iter_maker).eval(
tree, require_evalexpr=False)
assert isinstance(value, cls)
return value

Expand Down Expand Up @@ -242,7 +250,6 @@ def _eval_binary_plus(evaluator, tree):
left_expr.intercept_removed,
left_expr.terms + right_expr.terms)


def _eval_binary_minus(evaluator, tree):
left_expr = evaluator.eval(tree.args[0])
if tree.args[1].type == "ZERO":
Expand Down Expand Up @@ -355,12 +362,46 @@ def _eval_number(evaluator, tree):
def _eval_python_expr(evaluator, tree):
    """Turn a PYTHON_EXPR node into an expression holding one single-factor term.

    The expression's source text is also recorded on the evaluator, so that
    ``_eval_dot`` can leave out variables that were already named explicitly.
    """
    expr_code = tree.token.extra
    new_factor = EvalFactor(expr_code, evaluator._factor_eval_env,
                            origin=tree.origin)
    # Remember the code only after the factor was built successfully.
    evaluator._evaluated_exprs.add(expr_code)
    single_term = Term([new_factor])
    return IntermediateExpr(False, None, False, [single_term])

def _generate_data_codes(data_iter_maker):
    """Yield a formula code string for every variable name found in the data.

    :arg data_iter_maker: A zero-argument callable returning an iterable of
      data chunks. Iterating over a chunk must produce its variable names
      (as iterating over a dict does).
    :returns: A generator of code strings. A name that is a valid Python
      variable name is yielded unchanged; any other name is wrapped as
      ``Q(<repr of name>)``. Each distinct code is yielded only once, in
      order of first appearance, even if it shows up in several chunks.
    """
    seen = set()
    for data in data_iter_maker():
        for name in data:
            if is_valid_python_varname(name):
                code = name
            else:
                # repr() gives a correctly quoted and escaped literal for
                # any kind of string, and wrapping the name in a 1-tuple
                # keeps tuple-valued names from being unpacked by ``%``.
                # NOTE(review): possibly should silently ignore non-string
                # names instead -- Q is presumably patsy's name-quoting
                # helper; confirm it accepts non-string keys.
                code = "Q(%r)" % (name,)
            if code not in seen:
                seen.add(code)
                yield code

def test__generate_data_codes():
    # A single data chunk holding a plain identifier, a dotted name and a
    # non-string key.
    def data_iter_maker():
        return [['a', 'b.c', 1]]

    observed = list(_generate_data_codes(data_iter_maker))
    expected = ["a", "Q('b.c')", "Q(1)"]
    assert observed == expected

def _eval_dot(evaluator, tree):
    """Expand a ``.`` token into one single-factor term per data variable.

    Variables whose code has already been recorded in
    ``evaluator._evaluated_exprs`` are left out.

    :raises PatsyError: if the evaluator's data yields no variable names.
    """
    all_codes = list(_generate_data_codes(evaluator.data_iter_maker))
    if len(all_codes) == 0:
        raise PatsyError("Formulas only support '.' if supplied with data",
                         tree)
    already_used = evaluator._evaluated_exprs
    dot_terms = [
        Term([EvalFactor(code, evaluator._factor_eval_env,
                         origin=tree.origin)])
        for code in all_codes
        if code not in already_used
    ]
    return IntermediateExpr(False, None, False, dot_terms)

class Evaluator(object):
def __init__(self, factor_eval_env):
def __init__(self, factor_eval_env, data_iter_maker=(lambda: [{}])):
self._evaluators = {}
self._factor_eval_env = factor_eval_env

self.data_iter_maker = data_iter_maker
self._evaluated_exprs = set()

self.add_op("~", 2, _eval_any_tilde)
self.add_op("~", 1, _eval_any_tilde)

Expand All @@ -378,6 +419,7 @@ def __init__(self, factor_eval_env):
self.add_op("ONE", 0, _eval_one)
self.add_op("NUMBER", 0, _eval_number)
self.add_op("PYTHON_EXPR", 0, _eval_python_expr)
self.add_op("DOT", 0, _eval_dot)

# Not used by Patsy -- provided for the convenience of eventual
# user-defined operators.
Expand Down Expand Up @@ -447,6 +489,19 @@ def eval(self, tree, require_evalexpr=True):
"a + np.log(a, base=10)": (True, ["a", "np.log(a, base=10)"]),
# Note different spacing:
"a + np.log(a, base=10) - np . log(a , base = 10)": (True, ["a"]),

".": (True, ["a", "b", "c"]),
"a + .": (True, ["a", "b", "c"]),
"a + b + c + .": (True, ["a", "b", "c"]),
"I(a) + .": (True, ["I(a)", "a", "b", "c"]),
"a:.": (True, [("a", "b"), ("a", "c")]),
"a*.": (True, ["a", "b", "c", ("a", "b"), ("a", "c")]),
". - c": (True, ["a", "b"]),
". - (. - a)": (True, ["a"]),
". + a:.": (True, ["a", "b", "c", ("a", "b"), ("a", "c")]),
"a:. + .": (True, [("a", "b"), ("a", "c"), "b", "c"]),
"a ~ .": (False, ["a"], True, ["b", "c"]),
". ~ b + c": (False, ["a", "b", "c"], True, ["b", "c"]),

"a + (I(b) + c)": (True, ["a", "I(b)", "c"]),
"a + I(b + c)": (True, ["a", "I(b + c)"]),
Expand Down Expand Up @@ -580,6 +635,8 @@ def eval(self, tree, require_evalexpr=True):

"<- a>",
"a + <-a**2>",

"a + <.>",
]

def _assert_terms_match(terms, expected_intercept, expecteds, eval_env): # pragma: no cover
Expand All @@ -600,7 +657,9 @@ def _do_eval_formula_tests(tests): # pragma: no cover
if len(result) == 2:
result = (False, []) + result
eval_env = EvalEnvironment.capture(0)
model_desc = ModelDesc.from_formula(code, eval_env)
def data_iter_maker():
return [['a', 'b', 'c']]
model_desc = ModelDesc.from_formula(code, eval_env, data_iter_maker)
print repr(code)
print result
print model_desc
Expand Down
3 changes: 2 additions & 1 deletion patsy/highlevel.py
Expand Up @@ -48,7 +48,8 @@ def _try_incr_builders(formula_like, data_iter_maker, eval_env,
# fallthrough
if isinstance(formula_like, basestring):
assert isinstance(eval_env, EvalEnvironment)
formula_like = ModelDesc.from_formula(formula_like, eval_env)
formula_like = ModelDesc.from_formula(formula_like, eval_env,
data_iter_maker)
# fallthrough
if isinstance(formula_like, ModelDesc):
return design_matrix_builders([formula_like.lhs_termlist,
Expand Down
4 changes: 3 additions & 1 deletion patsy/parse_formula.py
Expand Up @@ -19,7 +19,7 @@
from patsy.tokens import python_tokenize, pretty_untokenize
from patsy.util import PushbackAdapter

_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"]
_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "DOT", "NUMBER"]

def _is_a(f, v):
try:
Expand Down Expand Up @@ -58,6 +58,8 @@ def _read_python_expr(it, end_tokens):
token_type = "ZERO"
elif expr_text == "1":
token_type = "ONE"
elif expr_text == ".":
token_type = "DOT"
elif _is_a(int, expr_text) or _is_a(float, expr_text):
token_type = "NUMBER"
else:
Expand Down
6 changes: 6 additions & 0 deletions patsy/test_highlevel.py
Expand Up @@ -257,6 +257,12 @@ def __patsy_get_model_desc__(self, data):
True,
[[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

# . in formulas
t("y + .", {"y": [1, 2], "x": [3, 4]}, 0,
True,
[[1, 1, 3], [1, 2, 4]], ["Intercept", "y", "x"])
t_invalid('~ .', {}, 0)

# ModelDesc
desc = ModelDesc([], [Term([LookupFactor("x")])])
t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
Expand Down
25 changes: 20 additions & 5 deletions patsy/util.py
Expand Up @@ -4,13 +4,13 @@

# Some generic utilities.

__all__ = ["atleast_2d_column_default", "uniqueify_list",
"widest_float", "widest_complex", "wide_dtype_for", "widen",
"repr_pretty_delegate", "repr_pretty_impl",
"SortAnythingKey", "safe_scalar_isnan", "safe_isnan",
"iterable",
__all__ = ["atleast_2d_column_default", "is_valid_python_varname",
"uniqueify_list", "widest_float", "widest_complex", "wide_dtype_for",
"widen", "repr_pretty_delegate", "repr_pretty_impl",
"SortAnythingKey", "safe_scalar_isnan", "safe_isnan", "iterable",
]

import re
import sys
import numpy as np
from cStringIO import StringIO
Expand Down Expand Up @@ -185,6 +185,21 @@ def test_atleast_2d_column_default():
finally:
have_pandas = had_pandas

def is_valid_python_varname(name):
    """Return True if ``name`` is a string usable as a Python variable name.

    :arg name: Any object. Anything that is not a ``str`` is rejected.
    :returns: ``True`` when ``name`` starts with a letter or underscore,
      continues with only word characters, and is not a reserved keyword;
      ``False`` otherwise.
    """
    # Imported locally because only this helper needs the keyword table.
    import keyword

    if not isinstance(name, str):
        return False
    # Reserved words such as "for" or "class" match the identifier pattern
    # but cannot be used as variable names.
    if keyword.iskeyword(name):
        return False
    # see: http://stackoverflow.com/a/10134719/809705
    return re.match(r"^[^\d\W]\w*\Z", name) is not None

def test_is_valid_python_varname():
    # (candidate name, expected verdict) pairs.
    cases = [
        ("a", True),
        ("a1", True),
        ("_a1", True),
        ("1a", False),
        ("a.b", False),
        (1, False),
    ]
    for candidate, expected in cases:
        assert is_valid_python_varname(candidate) == expected

# A version of .reshape() that knows how to down-convert a 1-column
# pandas.DataFrame into a pandas.Series. Useful for code that wants to be
# agnostic between 1d and 2d data, with the pattern:
Expand Down