Skip to content

Commit

Permalink
ENH: treat objects as factors. sympy 0.7.0 fixes, etc
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jul 16, 2011
1 parent 8384879 commit acd7fa8
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 34 deletions.
46 changes: 23 additions & 23 deletions formula/formula.py
Expand Up @@ -60,7 +60,7 @@
30 1 82 59 4838
attr(,"assign")
[1] 0 1 2 3
>
>
With the Formula, it looks like this:
Expand Down Expand Up @@ -135,7 +135,7 @@ class Formula(object):
The expressions may depend on additional Symbol instances,
giving a non-linear regression model.
"""
# This flag is defined to avoid using isinstance
# This flag is defined to avoid using isinstance
_formula_flag = True

def __init__(self, seq, char = 'b'):
Expand Down Expand Up @@ -175,7 +175,7 @@ def _getterms(self):

def _getmean(self):
""" Expression for mean
Expression for the mean, expressed as a linear combination of
terms, each with dummy variables in front.
"""
Expand All @@ -200,9 +200,9 @@ def params(self):
return getparams(self.mean)

def _getdiff(self):
p = list(set(getparams(self.mean)))
p.sort()
return [s.doit() for s in sympy.diff(self.mean, p)]
params = list(set(getparams(self.mean)))
params.sort()
return [sympy.diff(self.mean, p).doit() for p in params]
design_expr = property(_getdiff)

def _getdtype(self):
Expand Down Expand Up @@ -240,7 +240,7 @@ def subs(self, old, new):
The expression to be changed
new : sympy.Basic
The value to change it to.
Returns
-------
newf : Formula
Expand All @@ -267,14 +267,14 @@ def delete_terms(self, other):
if t in l1:
l1.remove(t)
return Formula(l1)

def __repr__(self):
return """Formula(%s)""" % `list(self.terms)`

def __add__(self, other):
"""
Create a new Formula by combining terms
of other with those of self.
of other with those of self.
>>> x, y, z = [Term(l) for l in 'xyz']
>>> f1 = Formula([x,y,z])
Expand All @@ -286,7 +286,7 @@ def __add__(self, other):
[1, y]
>>> sorted(f3.terms)
[1, x, y, z]
>>>
>>>
"""

if hasattr(other, 'formula'):
Expand All @@ -302,7 +302,7 @@ def __add__(self, other):
def __mul__(self, other):
"""
Create a new Formula by combining terms
of other with those of self.
of other with those of self.
>>> x, y, z = [Term(l) for l in 'xyz']
>>> f1 = Formula([x,y,z])
Expand All @@ -314,7 +314,7 @@ def __mul__(self, other):
[1, y]
>>> sorted(f3.terms)
[1, x, y, z]
>>>
>>>
"""

if hasattr(other, 'formula'):
Expand Down Expand Up @@ -357,8 +357,8 @@ def _setup_design(self):
# Using the random offset will minimize the possibility
# of this happening.

# This renaming is here principally because of the
# intercept.
# This renaming is here principally because of the
# intercept.

random_offset = np.random.random_integers(low=0, high=2**30)

Expand Down Expand Up @@ -391,16 +391,16 @@ def _setup_design(self):
# the natural splines, etc. You can represent natural splines
# with sympy but the expression is pretty awful.

_namespace = {};
_namespace = {};
_add_aliases_to_namespace(_namespace, *d)

self._f = sympy.lambdify(newparams + newterms, d, (_namespace, "numpy"))

# The input to self.design will be a recarray that must
# The input to self.design will be a recarray that must
# have field names that the Formula will expect to see.
# However, if any of self.terms are FactorTerms, then the field
# in the recarray will not actually be in the Term.
#
#
# For example, if there is a Factor 'f' with levels ['a','b'],
# there will be terms 'f_a' and 'f_b', though the input to
# design will have a field named 'f'. In this sense,
Expand All @@ -418,7 +418,7 @@ def _setup_design(self):
preterm = list(set(preterm))

# There is also an argument for parameters that are not
# Terms.
# Terms.

self._dtypes = {'param':np.dtype([(str(p), np.float) for p in params]),
'term':np.dtype([(str(t), np.float) for t in terms]),
Expand Down Expand Up @@ -480,7 +480,7 @@ def design(self,
# The term_recarray is essentially the same as preterm_recarray,
# except that all factors in self are expanded
# into their respective binary columns.
term_recarray = np.zeros(preterm_recarray.shape[0],
term_recarray = np.zeros(preterm_recarray.shape[0],
dtype=self._dtypes['term'])
for t in self.__terms:
if not is_factor_term(t):
Expand Down Expand Up @@ -508,7 +508,7 @@ def design(self,
# I think it is because the lambda evaluates sympy.Number(1) to 1
# and not an array.
D_tuple = [np.asarray(w) for w in D]

need_to_modify_shape = []
OK_row_shapes = []
for i, row in enumerate(D_tuple):
Expand Down Expand Up @@ -554,7 +554,7 @@ def design(self,
for key, cf in contrasts.items():
if not is_formula(cf):
cf = Formula([cf])
L = cf.design(input, param=param_recarray,
L = cf.design(input, param=param_recarray,
return_float=True)
cmatrices[key] = contrast_from_cols_or_rows(L, _D, pseudo=pinvD)
return D, cmatrices
Expand Down Expand Up @@ -584,7 +584,7 @@ def is_formula(obj):


def getparams(expression):
""" Return the parameters of an expression that are not Term
""" Return the parameters of an expression that are not Term
instances but are instances of sympy.Symbol.
Examples
Expand All @@ -597,7 +597,7 @@ def getparams(expression):
_b0*x + _b1*y + _b2*z
>>> getparams(f.mean)
[_b0, _b1, _b2]
>>>
>>>
>>> th = sympy.Symbol('theta')
>>> f.mean*sympy.exp(th)
(_b0*x + _b1*y + _b2*z)*exp(theta)
Expand Down
23 changes: 12 additions & 11 deletions formula/terms.py
Expand Up @@ -3,7 +3,7 @@

class Term(sympy.Symbol):
"""A sympy.Symbol type to represent a term in a regression model
Terms can be added to other sympy expressions with the single
convention that a term plus itself returns itself.
Expand Down Expand Up @@ -59,7 +59,7 @@ def __mul__(self, other):

class Factor(object):
""" A qualitative variable in a regression model
A Factor is similar to R's factor. The levels of the Factor can be
either strings or ints.
"""
Expand All @@ -86,14 +86,15 @@ def __init__(self, name, levels, char='b',
# Check whether they can all be cast to strings or ints without
# loss.
levelsarr = np.asarray(levels)
if levelsarr.ndim == 0 and levelsarr.dtype.kind == 'S':
if levelsarr.ndim == 0 and levelsarr.dtype.kind in ('S', 'O'):
levelsarr = np.asarray(list(levels))

if levelsarr.dtype.kind != 'S': # the levels are not strings

if levelsarr.dtype.kind not in ('S', 'O'):
# the levels are not strings/objects
if not np.alltrue(np.equal(levelsarr, np.round(levelsarr))):
raise ValueError('levels must be strings or ints')
levelsarr = levelsarr.astype(np.int)

self.levels = list(levelsarr)
self.name = name
self._char = char
Expand All @@ -105,15 +106,15 @@ def __init__(self, name, levels, char='b',
`['drop_reference',
'main_effect',
'indicator']`)

self.coding = coding
if reference is None:
self.reference = self.levels[0]
else:
if reference not in self.levels:
raise ValueError('reference should an element of levels')
self.reference = reference

def __getitem__(self, level):
"""
self.get_term(level)
Expand Down Expand Up @@ -158,7 +159,7 @@ def indicator(self):
of the factor.
"""
if not hasattr(self, "_indicator"):
self._indicator = Formula([FactorTerm(self.name, l) for l in
self._indicator = Formula([FactorTerm(self.name, l) for l in
self.levels], char=self._char)
return self._indicator

Expand All @@ -168,7 +169,7 @@ def formula(self):
Return the formula of the Factor = getattr(self, self.coding)
"""
return getattr(self, self.coding)

@staticmethod
def fromcol(col, name):
""" Create a Factor from a column array.
Expand Down Expand Up @@ -233,7 +234,7 @@ def fromrec(recarray):

result = {}
for n, d in recarray.dtype.descr:
if d[1] == 'S':
if d[1] in ('S', 'O'):
result[n] = Factor(n, np.unique(recarray[n]))
else:
result[n] = Term(n)
Expand Down

0 comments on commit acd7fa8

Please sign in to comment.