## Generate n-level hierarchical JSON from a pandas DataFrame

* hierarchical data
* mapping pandas columns
* Pretty-print JSON and the DataFrame `split` orientation
* generate n-level hierarchical JSON

In [1]:
import pandas as pd
import json
# Each tuple is one (parent_id, child_id) edge of the hierarchy.
edge_pairs = [
    (3111, 4321), (2010, 3102), (3000, 4023), (1000, 2010),
    (4023, 5321), (3011, 4200), (3033, 4113), (5010, 6525),
    (3011, 4010), (3102, 4001), (2010, 3011), (4023, 5010),
    (2110, 3000), (2100, 3033), (1000, 2110), (5010, 6100),
    (2110, 3111), (1000, 2100), (5010, 6016), (3033, 4311),
]
df = pd.DataFrame(edge_pairs, columns=['parent_id', 'child_id'])

# Preview the first five edges.
df.head()

Unnamed: 0,parent_id,child_id
0,3111,4321
1,2010,3102
2,3000,4023
3,1000,2010
4,4023,5321


In [2]:
# Edge list as [parent_id, child_id] rows of plain Python ints.
# Selecting the two columns by name means the cell does not depend on df
# having exactly two columns, and .values.tolist() avoids serialising the
# frame to a JSON string only to parse it straight back.
lst = df[['parent_id', 'child_id']].values.tolist()

# Build a directed graph (node -> set of direct children) and, for every node,
# a flag telling whether it appears as the child of some edge.
graph = {name: set() for tup in lst for name in tup}
has_parent = {name: False for tup in lst for name in tup}
for parent, child in lst:
    graph[parent].add(child)
    has_parent[child] = True

# Roots are the nodes that never appear as a child.
roots = [name for name, is_child in has_parent.items() if not is_child]

# Depth-first traversal of the graph. A node reachable through several parents
# is expanded once under each of them; a cycle raises ValueError instead of
# recursing without bound.
def traverse(hierarchy, graph, names, _ancestors=()):
    """Nest every name in ``names`` (and its descendants) into ``hierarchy``.

    Args:
        hierarchy: dict that is filled in place, one key per entry of ``names``.
        graph: mapping from a node to the collection of its direct children.
        names: iterable of nodes to place at this level.
        _ancestors: internal; tuple of the nodes on the path from the top-level
            call down to this level, used only to detect cycles.

    Returns:
        The same ``hierarchy`` object, with each name mapped to the nested dict
        of its children (``{}`` for a node without children).

    Raises:
        ValueError: if a node turns out to be its own ancestor (cyclic graph).
        KeyError: if a name is not a key of ``graph``.
    """
    for name in names:
        if name in _ancestors:
            raise ValueError("cycle detected in hierarchy at {!r}".format(name))
        # Extend the path only for this branch, so siblings that share a
        # descendant are not mistaken for a cycle.
        hierarchy[name] = traverse({}, graph, graph[name], _ancestors + (name,))
    return hierarchy

# Expand every root into its nested {child: {...}} dictionary.
result = traverse({}, graph, roots)

In [3]:
import json
# Pretty-print the nested hierarchy; json.dumps writes the integer ids as string keys.
print(json.dumps(result, indent=2))

{
  "1000": {
    "2010": {
      "3011": {
        "4200": {},
        "4010": {}
      },
      "3102": {
        "4001": {}
      }
    },
    "2100": {
      "3033": {
        "4113": {},
        "4311": {}
      }
    },
    "2110": {
      "3000": {
        "4023": {
          "5321": {},
          "5010": {
            "6016": {},
            "6100": {},
            "6525": {}
          }
        }
      },
      "3111": {
        "4321": {}
      }
    }
  }
}


## Column Mapping

In [4]:
# Store parent_id as a categorical column.
df['parent_id'] = df['parent_id'].astype('category')

In [5]:
# Store child_id as a categorical column.
df['child_id'] = df['child_id'].astype('category')

In [6]:
# List the categories (distinct values) of the child_id column.
df['child_id'].cat.categories

Int64Index([2010, 2100, 2110, 3000, 3011, 3033, 3102, 3111, 4001, 4010, 4023,
            4113, 4200, 4311, 4321, 5010, 5321, 6016, 6100, 6525],
           dtype='int64')

In [7]:
# List the categories (distinct values) of the parent_id column.
df['parent_id'].cat.categories

Int64Index([1000, 2010, 2100, 2110, 3000, 3011, 3033, 3102, 3111, 4023, 5010], dtype='int64')

In [8]:
# Letter code for every id that occurs as a parent.
parent_letter_by_id = {
    1000: "A",
    2010: "B",
    2100: "C",
    2110: "D",
    3000: "E",
    3011: "F",
    3033: "G",
    3102: "H",
    3111: "I",
    4023: "K",
    5010: "L",
}
df['parent_id_new'] = df['parent_id'].map(parent_letter_by_id)

In [9]:
# Letter code for every id that can occur in the child column.
child_letter_by_id = {
    1000: "A",
    2010: "B",
    2100: "C",
    2110: "D",
    3000: "E",
    3011: "F",
    3033: "G",
    3102: "H",
    3111: "I",
    4023: "K",
    5010: "L",
    4001: "M",
    4010: "N",
    4113: "O",
    4200: "P",
    4311: "Q",
    4321: "R",
    6016: "S",
    6525: "T",
    6100: "U",
    5321: "V",
}
df['child_id_new'] = df['child_id'].map(child_letter_by_id)

In [10]:
# Display the frame: the numeric ids next to their letter-coded *_new columns.
df

Unnamed: 0,parent_id,child_id,parent_id_new,child_id_new
0,3111,4321,I,R
1,2010,3102,B,H
2,3000,4023,E,K
3,1000,2010,A,B
4,4023,5321,K,V
5,3011,4200,F,P
6,3033,4113,G,O
7,5010,6525,L,T
8,3011,4010,F,N
9,3102,4001,H,M


## Pretty-print JSON and the DataFrame `split` orientation

In [11]:
import json

# 'split' orientation: a dict with separate "index", "columns" and "data" lists.
res = df.to_dict(orient='split')
res_json = json.dumps(res, indent=2)
print(res_json)

{
  "index": [
    0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19
  ],
  "columns": [
    "parent_id",
    "child_id",
    "parent_id_new",
    "child_id_new"
  ],
  "data": [
    [
      3111,
      4321,
      "I",
      "R"
    ],
    [
      2010,
      3102,
      "B",
      "H"
    ],
    [
      3000,
      4023,
      "E",
      "K"
    ],
    [
      1000,
      2010,
      "A",
      "B"
    ],
    [
      4023,
      5321,
      "K",
      "V"
    ],
    [
      3011,
      4200,
      "F",
      "P"
    ],
    [
      3033,
      4113,
      "G",
      "O"
    ],
    [
      5010,
      6525,
      "L",
      "T"
    ],
    [
      3011,
      4010,
      "F",
      "N"
    ],
    [
      3102,
      4001,
      "H",
      "M"
    ],
    [
      2010,
      3011,
      "B",
      "F"
    ],
    [
      4023,
      5010,
      "K",
      "L"
    ],
    [
      2110,
      3000,


In [12]:
import json

# Default orientation: {column name -> {row index -> value}}.
res = df.to_dict()
res_json = json.dumps(res, indent=2)
print(res_json)

{
  "parent_id": {
    "0": 3111,
    "1": 2010,
    "2": 3000,
    "3": 1000,
    "4": 4023,
    "5": 3011,
    "6": 3033,
    "7": 5010,
    "8": 3011,
    "9": 3102,
    "10": 2010,
    "11": 4023,
    "12": 2110,
    "13": 2100,
    "14": 1000,
    "15": 5010,
    "16": 2110,
    "17": 1000,
    "18": 5010,
    "19": 3033
  },
  "child_id": {
    "0": 4321,
    "1": 3102,
    "2": 4023,
    "3": 2010,
    "4": 5321,
    "5": 4200,
    "6": 4113,
    "7": 6525,
    "8": 4010,
    "9": 4001,
    "10": 3011,
    "11": 5010,
    "12": 3000,
    "13": 3033,
    "14": 2110,
    "15": 6100,
    "16": 3111,
    "17": 2100,
    "18": 6016,
    "19": 4311
  },
  "parent_id_new": {
    "0": "I",
    "1": "B",
    "2": "E",
    "3": "A",
    "4": "K",
    "5": "F",
    "6": "G",
    "7": "L",
    "8": "F",
    "9": "H",
    "10": "B",
    "11": "K",
    "12": "D",
    "13": "C",
    "14": "A",
    "15": "L",
    "16": "D",
    "17": "A",
    "18": "L",
    "19": "G"
  },
  "child_id_new": {
  

## Traverse a graph

In [13]:
# (parent, child) edges of a small distribution family tree.
lst = [
    ('Linux', 'Debian'),
    ('Linux', 'Red Hat'),
    ('Debian', 'Ubuntu'),
    ('Debian', 'Knoppix'),
    ('Ubuntu', 'Linux Mint'),
    ('Red Hat', 'CentOS'),
    ('Red Hat', 'Mandrake'),
]

# Walk the edges once: register both endpoints in the adjacency map
# (node -> set of children) and flag the child end as having a parent.
graph = {}
has_parent = {}
for parent, child in lst:
    graph.setdefault(parent, set()).add(child)
    graph.setdefault(child, set())
    has_parent.setdefault(parent, False)
    has_parent[child] = True

# Roots are the nodes whose flag was never set.
roots = [node for node, is_child in has_parent.items() if not is_child]

# Depth-first traversal of the graph. A node reachable through several parents
# is expanded once under each of them; a cycle raises ValueError instead of
# recursing without bound.
def traverse(hierarchy, graph, names, _ancestors=()):
    """Nest every name in ``names`` (and its descendants) into ``hierarchy``.

    Args:
        hierarchy: dict that is filled in place, one key per entry of ``names``.
        graph: mapping from a node to the collection of its direct children.
        names: iterable of nodes to place at this level.
        _ancestors: internal; tuple of the nodes on the path from the top-level
            call down to this level, used only to detect cycles.

    Returns:
        The same ``hierarchy`` object, with each name mapped to the nested dict
        of its children (``{}`` for a node without children).

    Raises:
        ValueError: if a node turns out to be its own ancestor (cyclic graph).
        KeyError: if a name is not a key of ``graph``.
    """
    for name in names:
        if name in _ancestors:
            raise ValueError("cycle detected in hierarchy at {!r}".format(name))
        # Extend the path only for this branch, so siblings that share a
        # descendant are not mistaken for a cycle.
        hierarchy[name] = traverse({}, graph, graph[name], _ancestors + (name,))
    return hierarchy

# Expand every root into its nested {child: {...}} dictionary.
nested_json = traverse({}, graph, roots)

In [14]:
import json
# Pretty-print the nested hierarchy as indented JSON.
print(json.dumps(nested_json, indent=2))

{
  "Linux": {
    "Debian": {
      "Ubuntu": {
        "Linux Mint": {}
      },
      "Knoppix": {}
    },
    "Red Hat": {
      "Mandrake": {},
      "CentOS": {}
    }
  }
}


In [15]:
# Rebuild the adjacency map (node -> set of children) and the has_parent flags
# from the edge list, one edge at a time.
graph = {}
has_parent = {}
for parent, child in lst:
    graph.setdefault(parent, set()).add(child)
    graph.setdefault(child, set())
    has_parent.setdefault(parent, False)
    has_parent[child] = True

graph

{'Linux': {'Debian', 'Red Hat'},
 'Debian': {'Knoppix', 'Ubuntu'},
 'Red Hat': {'CentOS', 'Mandrake'},
 'Ubuntu': {'Linux Mint'},
 'Knoppix': set(),
 'Linux Mint': set(),
 'CentOS': set(),
 'Mandrake': set()}

In [16]:
# Inspect the flags: True for each node that appears as a child in some edge.
has_parent

{'Linux': False,
 'Debian': True,
 'Red Hat': True,
 'Ubuntu': True,
 'Knoppix': True,
 'Linux Mint': True,
 'CentOS': True,
 'Mandrake': True}

In [17]:
# Roots: every node whose has_parent flag is still False.
roots = [node for node, is_child in has_parent.items() if not is_child]
roots

['Linux']

In [18]:
import pandas as pd
import json

# Each tuple is one (parent_id, child_id) edge of the hierarchy.
edge_pairs = [
    (3111, 4321), (2010, 3102), (3000, 4023), (1000, 2010),
    (4023, 5321), (3011, 4200), (3033, 4113), (5010, 6525),
    (3011, 4010), (3102, 4001), (2010, 3011), (4023, 5010),
    (2110, 3000), (2100, 3033), (1000, 2110), (5010, 6100),
    (2110, 3111), (1000, 2100), (5010, 6016), (3033, 4311),
]
df = pd.DataFrame(edge_pairs, columns=['parent_id', 'child_id'])

# Preview the first five edges.
df.head()

Unnamed: 0,parent_id,child_id
0,3111,4321
1,2010,3102
2,3000,4023
3,1000,2010
4,4023,5321


In [19]:
# Edge list as [parent_id, child_id] rows of plain Python ints.
# Selecting the two columns by name means the cell does not depend on df
# having exactly two columns, and .values.tolist() avoids serialising the
# frame to a JSON string only to parse it straight back.
lst = df[['parent_id', 'child_id']].values.tolist()

# Build a directed graph (node -> set of direct children) and, for every node,
# a flag telling whether it appears as the child of some edge.
graph = {name: set() for tup in lst for name in tup}
has_parent = {name: False for tup in lst for name in tup}
for parent, child in lst:
    graph[parent].add(child)
    has_parent[child] = True

# Roots are the nodes that never appear as a child.
roots = [name for name, is_child in has_parent.items() if not is_child]

# Depth-first traversal of the graph. A node reachable through several parents
# is expanded once under each of them; a cycle raises ValueError instead of
# recursing without bound.
def traverse(hierarchy, graph, names, _ancestors=()):
    """Nest every name in ``names`` (and its descendants) into ``hierarchy``.

    Args:
        hierarchy: dict that is filled in place, one key per entry of ``names``.
        graph: mapping from a node to the collection of its direct children.
        names: iterable of nodes to place at this level.
        _ancestors: internal; tuple of the nodes on the path from the top-level
            call down to this level, used only to detect cycles.

    Returns:
        The same ``hierarchy`` object, with each name mapped to the nested dict
        of its children (``{}`` for a node without children).

    Raises:
        ValueError: if a node turns out to be its own ancestor (cyclic graph).
        KeyError: if a name is not a key of ``graph``.
    """
    for name in names:
        if name in _ancestors:
            raise ValueError("cycle detected in hierarchy at {!r}".format(name))
        # Extend the path only for this branch, so siblings that share a
        # descendant are not mistaken for a cycle.
        hierarchy[name] = traverse({}, graph, graph[name], _ancestors + (name,))
    return hierarchy

# Expand every root into its nested {child: {...}} dictionary.
result = traverse({}, graph, roots)

In [20]:
import json
# Pretty-print the nested hierarchy; json.dumps writes the integer ids as string keys.
print(json.dumps(result, indent=2))

{
  "1000": {
    "2010": {
      "3011": {
        "4200": {},
        "4010": {}
      },
      "3102": {
        "4001": {}
      }
    },
    "2100": {
      "3033": {
        "4113": {},
        "4311": {}
      }
    },
    "2110": {
      "3000": {
        "4023": {
          "5321": {},
          "5010": {
            "6016": {},
            "6100": {},
            "6525": {}
          }
        }
      },
      "3111": {
        "4321": {}
      }
    }
  }
}
