Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
* dev: (29 commits)
  add an option to control whether to generate datapackage.json
  more tests
  fix build problem
  properly close the progress bar
  update the last_update time after loading other optional info
  add serving section in output of to_recipe
  update hy funcs
  add modeline for syntax
  remove gitattribute
  more examples
  fix name
  try to add syntax highlight for hy file
  add example recipes
  serve entities not in any entity set
  add hy format support
  add ddf_dir option in run_recipe
  update tests
  new api function
  don't remove datapackage.json in cleanup
  comments and logging
  ...
  • Loading branch information
semio committed Nov 29, 2017
2 parents 7223523 + 01a7027 commit 2597f92
Show file tree
Hide file tree
Showing 160 changed files with 1,233 additions and 395 deletions.
15 changes: 15 additions & 0 deletions ddf_utils/chef/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,18 @@

from ddf_utils.chef.cook import Chef


def run_recipe(fn, ddf_dir, out_dir):
    """Run the recipe file and serve the result into ``out_dir``.

    :param fn: recipe file, passed to ``Chef.from_recipe``.
    :param ddf_dir: value for the chef's ``ddf_dir`` config. When ``None``,
        ``add_config`` is not called and the chef's config is left as loaded.
    :param out_dir: output directory. If it already exists it is handed to
        ``ddf_utils.io.cleanup`` first; otherwise it is created, including
        any missing parent directories.
    """
    import os

    from ddf_utils.io import cleanup

    if os.path.exists(out_dir):
        cleanup(out_dir)
    else:
        # makedirs instead of mkdir: a nested output path whose parent
        # directories do not exist yet must not fail with FileNotFoundError.
        os.makedirs(out_dir)

    chef = Chef.from_recipe(fn)
    if ddf_dir is not None:
        chef.add_config(ddf_dir=ddf_dir)
    chef.run(serve=True, outpath=out_dir)
46 changes: 34 additions & 12 deletions ddf_utils/chef/cook.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
# -*- coding: utf-8 -*-
"""recipe cooking"""

import os
import sys
import json
import logging
import os
import re
from time import time
from ddf_utils.chef.dag import DAG, ProcedureNode, IngredientNode
from ddf_utils.chef.ingredient import Ingredient
from ddf_utils.chef.helpers import get_procedure
from ddf_utils.chef.exceptions import ChefRuntimeError
import sys
from collections import Mapping
from copy import deepcopy
import json
from time import time

import ruamel.yaml as yaml
from collections import Mapping
from graphviz import Digraph

from ddf_utils.chef.dag import DAG, IngredientNode, ProcedureNode
from ddf_utils.chef.exceptions import ChefRuntimeError
from ddf_utils.chef.helpers import get_procedure
from ddf_utils.chef.ingredient import Ingredient

logger = logging.getLogger('Chef')

Expand Down Expand Up @@ -107,7 +108,20 @@ def from_recipe(cls, recipe_file, **config):
def ingredients(self):
return [x.evaluate() for x in self.dag.nodes if isinstance(x, IngredientNode)]

def copy(self):
    """Return a new Chef built from copies of this chef's parts.

    The DAG is duplicated through its own ``copy()`` method; metadata,
    config, cooking, serving and recipe are deep-copied.
    """
    dag_copy = self.dag.copy()
    metadata_copy = deepcopy(self.metadata)
    config_copy = deepcopy(self._config)
    cooking_copy = deepcopy(self.cooking)
    serving_copy = deepcopy(self._serving)
    recipe_copy = deepcopy(self._recipe)
    return Chef(dag=dag_copy, metadata=metadata_copy, config=config_copy,
                cooking=cooking_copy, serving=serving_copy, recipe=recipe_copy)

def validate(self):
"""validate if the chef is good to run.
The following will be tested:
1. check if datasets required by ingredients are available
2. check if procedures are available
3. check if the DAG is valid. i.e no dependency cycle, no missing dependency.
"""
# 1. check dataset availability
ddf_dir = self.config['ddf_dir']
datasets = set()
Expand Down Expand Up @@ -139,25 +153,33 @@ def validate(self):
self.dag.get_node(ing['id']).detect_missing_dependency()

def add_config(self, **config):
    """Merge the keyword arguments into the config dictionary.

    Keys that already exist are overwritten, new keys are added.
    Returns the chef itself.
    """
    self._config.update(config)
    return self

def add_metadata(self, **metadata):
    """Merge the keyword arguments into the metadata dictionary.

    Keys that already exist are overwritten, new keys are added.
    Returns the chef itself.
    """
    self.metadata.update(metadata)
    return self

def add_ingredient(self, **kwargs):
    """Create an ingredient from keyword arguments and add it to the DAG.

    The keyword arguments are passed, as one dictionary, to the ``dictionary``
    keyword of :py:meth:`ddf_utils.chef.ingredient.Ingredient.from_dict`.
    See the ``from_dict()`` documentation for the available keywords.
    Returns the chef itself.
    """
    new_ingredient = Ingredient.from_dict(chef=self, dictionary=kwargs)
    node = IngredientNode(new_ingredient.ingred_id, new_ingredient, self)
    self.dag.add_node(node)
    return self

def add_procedure(self, collection, procedure, ingredients, result=None, options=None):

if procedure == 'serve':
[self.serving.append({'id': x,
'options': options}) for x in ingredients]
[self._serving.append({'id': x,
'options': options}) for x in ingredients]
return self

# check if procedure is supported
Expand Down Expand Up @@ -236,6 +258,7 @@ def to_recipe(self, fp=None):
recipe['config'] = self.config
recipe['ingredients'] = list()
recipe['cooking'] = dict()
recipe['serving'] = self.serving

for ingredient in self.ingredients:
info = {'id': ingredient.ingred_id,
Expand Down Expand Up @@ -446,4 +469,3 @@ def _get_dishes(recipe):
dishes.append({'id': p['result'], 'options': dict()})

return dishes

5 changes: 5 additions & 0 deletions ddf_utils/chef/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ def __init__(self, node_dict=None):
else:
self._node_dict = node_dict

def copy(self):
    """Return a new DAG built from a deep copy of this DAG's node dictionary."""
    from copy import deepcopy
    # TODO: I think we should add copy() for Nodes
    cloned_nodes = deepcopy(self._node_dict)
    return DAG(node_dict=cloned_nodes)

@property
def roots(self):
"""return the roots of the DAG"""
Expand Down
29 changes: 0 additions & 29 deletions ddf_utils/chef/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,35 +174,6 @@ def query(df, conditions, available_scopes=None):
return df
return df.query(q)


# below functions are not used in ddf_utils yet, but may be useful.
def log_shape(func):
    """Decorator that logs the ``shape`` of the decorated function's result.

    After each call an INFO record ``"<function name>,<result.shape>"`` is
    emitted through the root logger; the result is returned unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        # Pass the values as logging arguments so the message is only
        # formatted when an INFO record is actually emitted.
        logging.info("%s,%s", func.__name__, result.shape)
        return result
    return wrapper


def log_dtypes(func):
    """Decorator that logs the ``dtypes`` of the decorated function's result.

    After each call an INFO record ``"<function name>,<result.dtypes>"`` is
    emitted through the root logger; the result is returned unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        # Pass the values as logging arguments so the (possibly large)
        # dtypes object is only rendered when INFO records are emitted.
        logging.info("%s,%s", func.__name__, result.dtypes)
        return result
    return wrapper


def log_procedure(func):
    """Decorator that logs ``"running <function name>"`` before each call.

    The INFO record is emitted through the root logger before the decorated
    function runs; the function's result is returned unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Lazy logging argument: formatting happens only if the record is emitted.
        logging.info("running %s", func.__name__)
        return func(*args, **kwargs)
    return wrapper


def debuggable(func):
@wraps(func)
def wrapper(*args, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions ddf_utils/chef/hy_mod/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import hy

from .funcs import *
53 changes: 53 additions & 0 deletions ddf_utils/chef/hy_mod/funcs.hy
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
(import [ddf_utils.chef.api [Chef]])

;; (defn init []
;; (global *chef*)
;; (setv *chef* (Chef)))

;; Pass all keyword arguments on to chef.add_metadata and return its result.
(defn info [chef &kwargs kwargs]
(apply (. chef add_metadata) [] kwargs))

;; Pass all keyword arguments on to chef.add_config and return its result.
(defn config [chef &kwargs kwargs]
(apply (. chef add_config) [] kwargs))

;; (defn ingredients [ingreds]
;; (for [i ingreds] (do
;; (print i)
;; (apply (. *chef* add_ingredient) [] i))))

;; (defn show [&optional how]
;; (cond [(= how "recipe") (.to_recipe *chef*)]
;; [True (print (. *chef* metadata))]))

;; Add a procedure to `chef`.
;; `result` and `collection` are stored in kwargs under the keys "result" and
;; "collection", the keys of kwargs are then normalised with convert_keyword,
;; and everything is passed as keyword arguments to chef.add_procedure.
(defn procedure [chef result collection &kwargs kwargs]
(do
(setv (. kwargs ["collection"]) collection)
(setv (. kwargs ["result"]) result)
(setv kwargs (convert_keyword kwargs))
; (pprint.pprint kwargs)))
(apply (. chef add_procedure) [] kwargs)))

(defn get_name [k]
;;; Convert a keyword to a string. The default `name` function will replace
;;; underscores (presumably with dashes -- confirm for the Hy version in use),
;;; so this function replaces every "-" in its result with "_".
(.replace (name k) "-" "_"))

;; Return a new dict holding the entries of `d` with keyword keys turned
;; into strings via get_name; keys that are not keywords are kept as they are.
;; Values that are dicts are converted recursively. Any other value is stored
;; unchanged, so dicts nested inside e.g. a list are not converted.
(defn convert_keyword [d]
(setv new_dict (dict))
(for [(, k v) (.items d)]
(if (instance? dict v)
(setv v_new (convert_keyword v))
(setv v_new v))
(if (keyword? k)
(setv (. new_dict [(get_name k)]) v_new)
(setv (. new_dict [k]) v_new)))
new_dict)

;; Pass all keyword arguments on to chef.add_ingredient and return its result.
(defn ingredient [chef &kwargs kwargs]
(apply (. chef add_ingredient) [] kwargs))

;; Add a serving entry to `chef`: kwargs["collection"] is set to "" and
;; kwargs["procedure"] to "serve", then kwargs is passed as keyword arguments
;; to chef.add_procedure. Unlike `procedure`, the keys are not run through
;; convert_keyword here.
(defn serve [chef &kwargs kwargs]
(setv (. kwargs ["collection"]) "")
(setv (. kwargs ["procedure"]) "serve")
(apply (. chef add_procedure) [] kwargs))

24 changes: 24 additions & 0 deletions ddf_utils/chef/hy_mod/macros.hy
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

;; Expands to the set-up code the other macros in this file rely on: it imports
;; Chef, imports ddf_utils.chef.hy_mod.funcs as `_f`, and binds `*chef*` to a
;; new Chef instance.
(defmacro init []
`(do
(import [ddf_utils.chef.api [Chef]])
(import [ddf_utils.chef.hy_mod.funcs :as _f])
(setv *chef* (Chef))))

;; Expands to a call of _f.info with `*chef*` as first argument, followed by
;; the macro's own arguments. `_f` and `*chef*` are the names bound by `init`.
(defmacro info [&rest code]
`(_f.info *chef* ~@code))

;; Expands to a call of _f.config with `*chef*` as first argument, followed by
;; the macro's own arguments. `_f` and `*chef*` are the names bound by `init`.
(defmacro config [&rest code]
`(_f.config *chef* ~@code))

;; Expands to a call of _f.ingredient with `*chef*` as first argument, followed
;; by the macro's own arguments. `_f` and `*chef*` are the names bound by `init`.
(defmacro ingredient [&rest code]
`(_f.ingredient *chef* ~@code))

;; Expands to a call of _f.procedure with `*chef*` as first argument, followed
;; by the macro's own arguments. `_f` and `*chef*` are the names bound by `init`.
(defmacro procedure [&rest code]
`(_f.procedure *chef* ~@code))

;; Expands to a call of _f.serve with `*chef*` as first argument, followed by
;; the macro's own arguments. `_f` and `*chef*` are the names bound by `init`.
(defmacro serve [&rest code]
`(_f.serve *chef* ~@code))

;; Expands to code that calls the `run` method of `*chef*` with the macro's
;; arguments and binds the return value to `res` in the expanding scope.
(defmacro run [&rest code]
`(setv res (*chef*.run ~@code)))
32 changes: 24 additions & 8 deletions ddf_utils/chef/ingredient.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@

"""main ingredient class"""

import os
import logging
import fnmatch
import logging
import os
from collections import Mapping, Sequence

import numpy as np
import pandas as pd
from ..str import format_float_digits
from .helpers import read_opt, gen_sym, query
from collections import Sequence, Mapping

from ddf_utils.model.package import Datapackage
from ddf_utils.model.repo import is_url, Repo
from ddf_utils.model.repo import Repo, is_url

from ..str import format_float_digits
from .exceptions import IngredientError
from .helpers import gen_sym, query, read_opt


class BaseIngredient(object):
Expand Down Expand Up @@ -80,7 +82,7 @@ def _serve_entities(self, outpath, **options):
assert isinstance(data, dict)
assert len(data) == 1
sets = []
no_keep_sets = options.get('no_keep_sets', False)
no_keep_sets = options.get('no_keep_sets', False) # serve as entity domain
for k, df in data.items():
# change boolean into string
for c in df.columns:
Expand All @@ -101,7 +103,21 @@ def _serve_entities(self, outpath, **options):
for s in sets:
path = os.path.join(outpath, 'ddf--entities--{}--{}.csv'.format(k, s))
col = 'is--'+s
df[df[col]=='TRUE'].dropna(axis=1, how='all').to_csv(path, index=False, encoding='utf8')
df_ = df[df[col]=='TRUE'].dropna(axis=1, how='all')
df_ = df_.loc[:, lambda x: ~x.columns.str.startswith('is--')].copy()
df_[col] = 'TRUE'
df_.to_csv(path, index=False, encoding='utf8')
# serve entities not in any sets
is_headers = list(map(lambda x: 'is--'+x, sets))
noset = []
for i, row in df.iterrows():
# import pdb; pdb.set_trace()
if (row[is_headers].fillna('FALSE') == 'FALSE').all():
noset.append(i)
if len(noset) > 0:
df_noset = df.loc[noset].drop(is_headers, axis=1).dropna(axis=1, how='all')
path = os.path.join(outpath, 'ddf--entities--{}.csv'.format(k))
df_noset.to_csv(path, index=False)
else:
path = os.path.join(outpath, 'ddf--entities--{}--{}.csv'.format(domain, k))
df.to_csv(path, index=False, encoding='utf8')
Expand Down
36 changes: 22 additions & 14 deletions ddf_utils/chef/procedure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,22 @@

"""all procedures for recipes"""

import fnmatch
import logging
import time
import warnings
import pandas as pd
from collections import Mapping, Sequence
from typing import Dict, List, Optional, Union

import numpy as np
from . dag import DAG
import pandas as pd

from ddf_utils.chef.cook import Chef
from .ingredient import BaseIngredient, ProcedureResult
from .helpers import read_opt, mkfunc, debuggable
from .exceptions import ProcedureError
import time
from typing import List, Union, Dict, Optional
from collections import Sequence, Mapping
import fnmatch
from .helpers import query

import logging
from .dag import DAG
from .exceptions import ProcedureError
from .helpers import debuggable, mkfunc, query, read_opt
from .ingredient import BaseIngredient, ProcedureResult

logger = logging.getLogger('Chef')

Expand Down Expand Up @@ -1006,13 +1007,20 @@ def extract_concepts(chef: Chef, ingredients: List[str], result,
concepts.loc[col, 'concept_type'] = 'measure'
else:
concepts.loc[col, 'concept_type'] = 'string'
# add name column if there isn't one
if 'name' not in concepts.columns:
concepts['name'] = np.nan

if join_type == 'ingredients_outer':
# ingredients_outer join: only keep concepts appears in ingredients
concepts = concepts.ix[new_concepts]

# add name column if there isn't one
if 'name' not in concepts.columns:
concepts['name'] = np.nan
if 'name' not in concepts.index.values:
concepts.loc['name', 'concept_type'] = 'string'
concepts.loc['name', 'name'] = 'Name'
concepts['name'] = concepts['name'].fillna(
concepts.index.to_series().map(lambda x: str(x).replace('_', ' ').title()))

# overwrite some of the types
if overwrite:
for k, v in overwrite.items():
Expand Down

0 comments on commit 2597f92

Please sign in to comment.