In [29]:
# Print the current date/time via the shell to record when this notebook was run.
!date

Mon Mar 27 11:05:01 CST 2017


# Running recipes in Jupyter notebook

In this notebook we will demonstrate how to use the chef module to create a dataset.

In [30]:
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
import yaml

from ddf_utils import chef

import logging
# Log at DEBUG level so that chef's INFO and DEBUG messages are shown
# in the notebook output while the recipe runs.
logging.basicConfig(level=logging.DEBUG)

## loading recipes

Recipes can be either YAML or JSON files. Because a recipe may include other recipes and translation dictionaries, the chef module provides a `build_recipe()` function that reads a recipe and expands it with the other included files.

In [31]:
# Show the raw recipe file before it is expanded by chef.
# Note: you should change the ddf_dir to the correct path.
# A context manager is used so the file handle is closed as soon as the
# text has been printed, instead of being left open for the kernel's lifetime.
with open("../tests/recipes_pass/test_translate_column.yaml") as recipe_file:
    print(recipe_file.read())

info:
  id: test_translate_column

ingredients:
  - id: bp-geo
    dataset: ddf--bp--energy
    key: geo
    value: '*'
  - id: gw-countries
    dataset: ddf--gapminder--geo_entity_domain
    key: country
    value: '*'
  - id: bp-datapoints
    dataset: ddf--bp--energy
    key: 'geo,year'
    value: '*'

cooking:
  entities:
    - procedure: translate_column
      ingredients:
        - bp-geo
      options:
        column: name
        target_column: geo_new
        dictionary:
          base: gw-countries
          key: ['alternative_1', 'alternative_2', 'alternative_3',
                'alternative_4_cdiac', 'pandg', 'god_id', 'alt_5', 'upper_case_name',
                'arb1', 'arb2', 'arb3', 'arb4',
                'arb5', 'arb6', 'name']
          value: country
          not_found: drop
      result: geo-aligned
  datapoints:
    - procedure: translate_column
      ingredients:
        - bp-datapoints
      options:
        column: geo
        target_column: geo
        diction

In [32]:
# Read the recipe file and let chef expand it with any included files.
recipe_path = "../tests/recipes_pass/test_translate_column.yaml"
recipe = chef.build_recipe(recipe_path)

In [33]:
# Pretty-print the expanded recipe to inspect what build_recipe() produced.
pprint(recipe)

{'cooking': {'datapoints': [{'ingredients': ['bp-datapoints'],
                             'options': {'column': 'geo',
                                         'dictionary': {'base': 'geo-aligned',
                                                        'key': 'geo',
                                                        'value': 'geo_new'},
                                         'not_found': 'drop',
                                         'target_column': 'geo'},
                             'procedure': 'translate_column',
                             'result': 'bp-datapoints-aligned'}],
             'entities': [{'ingredients': ['bp-geo'],
                           'options': {'column': 'name',
                                       'dictionary': {'base': 'gw-countries',
                                                      'key': ['alternative_1',
                                                              'alternative_2',
                                                  

## running the recipe

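The `run_recipe()` function from the chef module actually runs the recipe and returns the resulting DDF ingredients. As the cells below show, the return value is a sequence of `ProcedureResult` objects, and each one's `get_data()` returns its data as a dictionary.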

In [34]:
# Directory in which chef searches for DDF datasets; it must be set
# before the recipe is run.
chef.config.DDF_SEARCH_PATH = '../tests/datasets'

# Execute the recipe; `res` holds the results of the cooking procedures.
res = chef.run_recipe(recipe)

INFO:root:path for searching DDF: ../tests/datasets
INFO:Chef:translate_column: bp-geo
DEBUG:Chef:running on: geo
INFO:Chef:translate_column: bp-datapoints
DEBUG:Chef:running on: biofuels_production_kboed
DEBUG:Chef:running on: biofuels_production_ktoe


In [35]:
# List every result returned by run_recipe(), one per line.
for result in res:
    print(result)

<ProcedureResult: geo-aligned>
<ProcedureResult: bp-datapoints-aligned>


### check results

In [36]:
# The first result is `geo-aligned`, produced by the entities procedure.
geo_aligned = res[0]
entities = geo_aligned.get_data()

In [37]:
entities.keys()  # In the recipe we include all geo entities from GW, so here they are.

dict_keys(['geo'])

In [38]:
# Preview the translated entities: `geo_new` is the target column that the
# translate_column procedure was asked to fill in.
# NOTE(review): the index skips 3 in the output, presumably a row removed by
# `not_found: drop` — confirm.
entities['geo'].head()

Unnamed: 0,geo,name,geo_new
0,us,US,usa
1,canada,Canada,can
2,mexico,Mexico,mex
4,argentina,Argentina,arg
5,brazil,Brazil,bra


In [39]:
# The second result is `bp-datapoints-aligned`, produced by the datapoints procedure.
datapoints_aligned = res[1]
dps = datapoints_aligned.get_data()

In [40]:
# One entry per datapoint measure that the translate_column procedure ran on.
dps.keys()

dict_keys(['biofuels_production_kboed', 'biofuels_production_ktoe'])

In [41]:
# Preview one measure: the recipe writes the translation back into the `geo`
# column, so it now holds the translated codes (e.g. `arg`).
dps['biofuels_production_kboed'].head()

Unnamed: 0,geo,year,biofuels_production_kboed
0,arg,1990,0.0
1,arg,1991,0.0
2,arg,1992,0.0
3,arg,1993,0.0
4,arg,1994,0.0
