<a href="https://colab.research.google.com/github/pradh/api-python/blob/svg/notebooks/Topic_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use this notebook to generate Topic nodes for Data Commons.

### Input Format

```
- Demographics
--- Count_Person
-- Race: Population by Race
--- Count_Person_Asian
--- Count_Person_Hispanic
```

* Topics (e.g., `Demographics`) are prefixed with `-`.

* Topics optionally have statistical variable (SV) peer groups (e.g., `Race`), which are containers for comparable SVs and are prefixed with `--`.

* SVs are identified by their DCIDs, and are prefixed with `---`. They may either be directly under a topic (e.g., `Count_Person`) or an SV peer group (e.g., `Count_Person_Asian`).

* The DCIDs of Topics and SV peer groups are auto-generated from their names.
 Each can also have an optional display title, separated from the name by `:` (e.g., `Race: Population by Race` above).

In [5]:
# @title First, enter a URL or copy/paste the topic definition

# URL of a text file holding the topic definition. When this is non-empty it
# is used and CONTENT below is ignored.

CONTENT_URL = ""

# Otherwise, paste the topic definition inline here, using the input format
# described at the top of this notebook.

CONTENT = """

- Agriculture
--- Area_Farm
--- Count_Farm
--- Income_Farm
-- Amount of Farm Inventory by Type
--- AmountFarmInventory_WinterWheatForGrain
--- Amount_FarmInventory_BarleyForGrain
--- Amount_FarmInventory_CornForSilageOrGreenchop
--- Amount_FarmInventory_Cotton

"""

In [6]:
# @title Next, run this cell to generate MCF

import urllib.request

# After running this cell, copy the generated MCF printed below it.

def get_id(name):
  """Return `name` reduced to the characters used in an auto-generated DCID.

  Spaces, '-', '+', '=', ':' and all round/square/curly brackets are dropped;
  '%' is spelled out as 'Pct' and '&' as 'And'. Every other character is kept
  as-is.
  """
  # One translation table instead of a chain of str.replace() calls. The two
  # multi-character substitutions contain no character that is itself
  # translated, so a single pass gives the same result as sequential replaces.
  substitutions = dict.fromkeys(' -+()[]{}=:', '')
  substitutions['%'] = 'Pct'
  substitutions['&'] = 'And'
  return name.translate(str.maketrans(substitutions))


def get_id_name(name):
  """Split a `Name[: Display title]` entry into an (id, display name) pair.

  The id is the text before the first ':' (stripped) passed through get_id().
  The display name is the stripped text after the first ':' when a ':' is
  present; otherwise it is the stripped text of the whole entry.
  """
  head, colon, tail = name.partition(':')
  head = head.strip()
  display = tail.strip() if colon else head
  return get_id(head), display


def map2mcf(topic):
  """Render a parsed topic map as a list of MCF node blocks.

  Args:
    topic: dict keyed by topic DCID. Each value is a dict with a 'name' and,
      optionally, 'svs' (a list of SV DCIDs) and 'svpgs' (a dict keyed by SVPG
      DCID whose values hold a 'name' and, optionally, their own 'svs' list).

  Returns:
    A list of strings, one MCF node per string. Topics are emitted in sorted
    DCID order, each immediately followed by the nodes of its SV peer groups
    (also in sorted DCID order). SVs within a node are sorted as well.
    Returns [] when `topic` is empty or None.
  """
  if not topic:
    return []
  mcf_blocks = []
  for tid in sorted(topic):
    entry = topic[tid]
    topic_name = entry['name']
    topic_parts = [
        f'Node: dcid:{tid}',
        'typeOf: dcs:Topic',
        f'name: "{topic_name}"',
    ]
    # Fall back to empty containers: dict.get() without a default yields None
    # for a missing key, and sorted(None) raises TypeError.
    topic_parts.extend(
        f'relevantVariable: dcid:{sv}' for sv in sorted(entry.get('svs') or []))
    svpgs = entry.get('svpgs') or {}
    svpg_blocks = []
    for svpg_id in sorted(svpgs):
      # The topic references each peer group; the group gets its own node.
      topic_parts.append(f'relevantVariable: dcid:{svpg_id}')
      svpg = svpgs[svpg_id]
      svpg_name = svpg['name']
      svpg_parts = [
          f'Node: dcid:{svpg_id}',
          'typeOf: dcs:StatVarPeerGroup',
          f'name: "{svpg_name}"',
      ]
      svpg_parts.extend(
          f'member: dcid:{sv}' for sv in sorted(svpg.get('svs') or []))
      svpg_blocks.append('\n'.join(svpg_parts))
    mcf_blocks.append('\n'.join(topic_parts))
    mcf_blocks += svpg_blocks

  return mcf_blocks

def get_mcf(lines):
  """Parse a topic definition and return it as a list of MCF node blocks.

  Each non-blank line is `<marker> <name>`, where the marker is one to three
  `-` characters: `-` starts a topic, `--` starts an SV peer group inside the
  current topic, and `---` adds an SV to the current peer group if one is open,
  or to the current topic otherwise. Starting a new topic closes any open peer
  group. Lines starting with `#` or `//` are comments.

  Malformed lines are reported on stdout with a `# ERROR:` prefix and skipped;
  they never abort parsing.

  Args:
    lines: the whole topic definition as a single string.

  Returns:
    The list of MCF node strings produced by map2mcf() for the parsed topics.
  """
  topic = {}
  svpg_id = None
  topic_id = None
  for line in lines.splitlines():
    line = line.strip()
    if not line:
      continue

    # Comments are recognised before tokenising so that `#note` (no space)
    # is skipped rather than treated as a malformed entry.
    if line.startswith(('#', '//')):
      continue

    # Split on any whitespace run; a line with no separator yields an empty
    # name instead of raising on tuple unpacking.
    marker, *rest = line.split(None, 1)
    name = rest[0].strip() if rest else ''

    # The marker must consist solely of dashes. Checking only the first
    # character would let e.g. `--x Foo` pass as a three-character SV marker.
    if marker.strip('-'):
      print('# ERROR: Bogus line:', line)
      continue

    level = len(marker)
    if level > 3:
      print('# ERROR: Too many `-`:', line)
      continue

    if not name:
      print('# ERROR: Missing name:', line)
      continue

    if level == 1:
      # Topic
      topic_id, name = get_id_name(name)
      topic_id = f'dc/topic/{topic_id}'
      topic[topic_id] = {
          'name': name,
          'svpgs': {},
          'svs': [],
      }
      svpg_id = None
      continue

    # Peer groups and SVs must belong to a topic. Report the problem like any
    # other bad line; an `assert` would be stripped under `python -O`.
    if topic_id is None:
      print('# ERROR: No topic defined before:', line)
      continue

    if level == 2:
      # SVPG
      svpg_id, name = get_id_name(name)
      # NOTE(review): topic_id already carries the 'dc/topic/' prefix, so the
      # resulting DCID looks like 'dc/svpg/dc/topic/<Topic>_<Group>'. Confirm
      # this nesting is intended before changing it; existing nodes may
      # depend on these IDs.
      svpg_id = f'dc/svpg/{topic_id}_{svpg_id}'
      topic[topic_id]['svpgs'][svpg_id] = {
          'name': name,
          'svs': []
      }
    elif svpg_id:
      # SV inside the currently open peer group.
      topic[topic_id]['svpgs'][svpg_id]['svs'].append(name)
    else:
      # SV directly under the topic.
      topic[topic_id]['svs'].append(name)

  return map2mcf(topic)


def run(text):
  """Convert the topic definition in `text` to MCF and print it to stdout.

  The output is a header comment followed by the MCF nodes, separated from
  one another by blank lines.
  """
  # Parse first: get_mcf() may print `# ERROR` lines, which should appear
  # ahead of the header just as they always have.
  blocks = get_mcf(text)
  header = '# These are auto-generated \n'
  body = '\n\n'.join(blocks)
  print(header)
  print(body)

# Use the inline CONTENT unless a URL was supplied, in which case the topic
# definition is downloaded and decoded as UTF-8 first.
if not CONTENT_URL:
  run(CONTENT)
else:
  with urllib.request.urlopen(CONTENT_URL) as response:
    topic_text = response.read().decode('utf-8')
  run(topic_text)

# These are auto-generated 

Node: dcid:dc/topic/Agriculture
typeOf: dcs:Topic
name: "Agriculture"
relevantVariable: dcid:Area_Farm
relevantVariable: dcid:Count_Farm
relevantVariable: dcid:Income_Farm
relevantVariable: dcid:dc/svpg/dc/topic/Agriculture_AmountofFarmInventorybyType

Node: dcid:dc/svpg/dc/topic/Agriculture_AmountofFarmInventorybyType
typeOf: dcs:StatVarPeerGroup
name: "Amount of Farm Inventory by Type"
member: dcid:AmountFarmInventory_WinterWheatForGrain
member: dcid:Amount_FarmInventory_BarleyForGrain
member: dcid:Amount_FarmInventory_CornForSilageOrGreenchop
member: dcid:Amount_FarmInventory_Cotton
