# Generate hif format for example datasets

Here we illustrate examples of HIF formatted json objects using datasets available from HNX and XGI.

In [1]:
import json
import random
import warnings

import fastjsonschema
import hypernetx as hnx
import numpy as np
import pandas as pd
from hypernetx.utils import toys

In [2]:
warnings.simplefilter('ignore')

## Load schema and validator

In [3]:
# Read the HIF JSON schema; the context manager closes the file handle as soon
# as parsing is done instead of leaving it to the garbage collector.
with open('../schemas/hif_schema_v0.1.0.json', 'r') as schema_file:
    schema = json.load(schema_file)

# Compile the schema once into a callable that is reused to validate each HIF object below.
validator = fastjsonschema.compile(schema)

In [4]:
# Display the loaded schema dict to inspect the structure HIF objects are validated against.
schema

{'$schema': 'http://json-schema.org/draft-07/schema#',
 '$id': 'https://raw.githubusercontent.com/pszufe/HIF_validators/main/schemas/hif_schema_v0.1.0.json',
 'title': 'Hypergraph Interchange Format v0.1.0',
 'type': 'object',
 'properties': {'network-type': {'enum': ['undirected', 'directed', 'asc']},
  'metadata': {'type': 'object'},
  'incidences': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'edge': {'type': ['string', 'integer']},
     'node': {'type': ['string', 'integer']},
     'weight': {'type': 'number'},
     'direction': {'enum': ['head', 'tail']},
     'attrs': {'type': 'object'}},
    'additionalProperties': {'type': ['string',
      'integer',
      'number',
      'boolean']},
    'unevaluatedProperties': False,
    'required': ['edge', 'node']}},
  'nodes': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'node': {'type': ['string', 'integer']},
     'weight': {'type': 'number'},
     'attrs': {'type': 'object'}},
    'addition

## Example from HyperNetX Toys

The LesMis data was developed out of the [Stanford GraphBase]( https://www-cs-faculty.stanford.edu/~knuth/sgb.html).

The hypergraph relates characters to the scenes they participate in. As scenes are indexed relative to a hierarchy, we index the hyperedges by a string of numeric indices referencing the Volume, Book, Chapter, and Scene.  

Characters are indexed by a two-letter Symbol. 

Additional metadata is associated with each character, including the character's full name and description.  

Since no metadata is associated to the hyperedges, the hif format will only include the incidences and nodes.  

In [5]:
# `toys` is imported together with the other dependencies in the notebook's import cell.
lesmis = toys.LesMis()

# Character lookup table (one row per character symbol).
characters = lesmis.df_names
# One row per (scene, character) appearance.
scenes = lesmis.df_scenes
scenes

Unnamed: 0,Volume,Book,Chapter,Scene,Step,Characters
0,1,1,1,0,0,MY
1,1,1,1,0,0,NP
2,1,1,1,1,1,MY
3,1,1,1,1,1,MB
4,1,1,2,0,2,MY
...,...,...,...,...,...,...
857,5,9,4,1,400,MA
858,5,9,4,1,400,CO
859,5,9,5,0,401,JV
860,5,9,5,0,401,CO


In [6]:
# Display the character table (Symbol, FullName and Description per character).
characters

Unnamed: 0,Symbol,FullName,Description
0,AZ,Anzelma,daughter of TH and TM
1,BA,Bahorel,`Friends of the ABC' cutup
2,BB,Babet,tooth-pulling bandit of Paris
3,BJ,Brujon,notorious criminal
4,BL,Blacheville,Parisian student from Montauban
...,...,...,...
75,TS,Toussaint,servant of JV at Rue Plumet
76,VI,Madame Victurnien,snoop in M-- sur M--
77,XA,Child 1,son of TH sold to MN
78,XB,Child 2,son of TH sold to MN


In [7]:
# Columns that, joined with ".", identify a scene (hyperedge).
# They are named explicitly rather than sliced by position: this cell appends
# columns to `scenes`, so a positional slice would pick different columns on re-run.
EDGE_ID_COLS = ["Volume", "Book", "Chapter", "Scene"]

scenes["edge"] = scenes[EDGE_ID_COLS].astype(str).apply(".".join, axis=1)
scenes["node"] = scenes["Characters"]

# Work on an independent copy so that adding "weight" does not write into a
# slice of `scenes` (chained assignment).
incidences_df = scenes[["edge", "node"]].copy()

# Number of incidence rows per edge; each incidence is weighted by 1 / that count.
cell_weights = incidences_df.groupby("edge")["node"].count().to_dict()
incidences_df["weight"] = (1 / incidences_df["edge"].map(cell_weights)).round(2)

# HIF expects lists of records; nodes are keyed by "node" instead of "Symbol".
incidences = incidences_df.to_dict(orient="records")
nodes = characters.rename(columns={"Symbol": "node"}).to_dict(orient="records")


In [8]:
# Assemble the HIF object; LesMis has no edge metadata, so "edges" is omitted.
lesmis_hif = dict(incidences=incidences, nodes=nodes)

# Validate against the schema and preview the first three records of each section;
# any failure is printed instead of interrupting the notebook.
try:
    output = validator(lesmis_hif)
    for section in ('incidences', 'nodes', 'edges'):
        preview = output.get(section, [])[:3]
        print(preview, '\n')
except Exception as err:
    print(err)
    

[{'edge': '1.1.1.0', 'node': 'MY', 'weight': 0.5}, {'edge': '1.1.1.0', 'node': 'NP', 'weight': 0.5}, {'edge': '1.1.1.1', 'node': 'MY', 'weight': 0.5}] 

[{'node': 'AZ', 'FullName': 'Anzelma', 'Description': ' daughter of TH and TM'}, {'node': 'BA', 'FullName': 'Bahorel', 'Description': " `Friends of the ABC' cutup"}, {'node': 'BB', 'FullName': 'Babet', 'Description': ' tooth-pulling bandit of Paris'}] 

[] 



In [17]:
## We add the network type to better inform other libraries that might use the data
lesmis_hif["network-type"] = "undirected"

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes.
with open('lesmis-hif.json', 'w') as hif_file:
    json.dump(lesmis_hif, hif_file)

## Example from XGI
Contact-High-School originally sourced from:
https://www.cs.cornell.edu/~arb/data/contact-high-school/

This example is already in json form, but not in the HIF standard. There are NaNs, which won't play well with some hypergraph libraries.

In general, a property that belongs to every item in the incidences, edges, or nodes list should either be one of the properties defined by the HIF standard or satisfy the additionalProperties schema. A property that belongs to only some of the items should be placed in the attrs object of each item that has it.

In [10]:
# Load the raw contacts-high-school JSON; the context manager closes the file
# handle once parsing is done.
with open('contacts-high-school.json', 'r') as chs_file:
    chs = json.load(chs_file)
chs.keys()

dict_keys(['nodes', 'hyperedges'])

In [11]:
# Tabulate the raw node records so their properties can be inspected column by column.
chsnodes = pd.DataFrame(data=chs["nodes"])
chsnodes

Unnamed: 0,class,id,has_facebook,has_compiled_questionnaire,facebook_friends,sex,questionnaire_friends
0,MP,454,True,False,"[34, 151, 156, 159, 866, 640, 1232]",F,
1,MP,640,True,False,"[151, 159, 454]",M,
2,2BIO3,1,True,True,"[883, 132, 339, 653, 545, 171, 117, 196, 587, ...",M,"[205, 272, 494, 779, 894]"
3,2BIO3,939,True,False,"[1, 55, 101, 106, 117, 119, 122, 132, 156, 170...",M,
4,PC*,185,False,False,,M,
...,...,...,...,...,...,...,...
322,MP,34,True,True,"[20, 513, 1332, 502, 454, 1870, 201, 245, 387,...",Unknown,"[277, 502, 866]"
323,2BIO2,239,False,False,,F,
324,2BIO1,62,True,False,"[428, 425, 544, 122, 275]",F,
325,PSI*,452,True,True,"[20, 34, 151, 156, 246, 387, 440, 691, 513, 86...",M,"[634, 691, 869, 1332]"


Since the columns **facebook_friends** and **questionnaire_friends** contain NaNs we will put those properties in the attrs object for each node.

In [12]:
# Properties that are missing for some nodes; they are moved into each node's "attrs".
OPTIONAL_COLS = ['facebook_friends', 'questionnaire_friends']

# Replace NaNs with the sentinel string "None" so missing values can be detected below.
nodes_frame = pd.DataFrame(chs['nodes']).fillna("None")

## reordering columns so node id is first (optional)
ordered_cols = ['id', 'class'] + list(nodes_frame.columns)[2:]
nodes_frame = nodes_frame[ordered_cols]

# One attrs dict per node, holding only the optional properties that are present.
nodes_frame['attrs'] = [
    {key: value for key, value in record.items() if value != "None"}
    for record in nodes_frame[OPTIONAL_COLS].to_dict(orient='records')
]

# Drop the optional columns, key nodes by "node" and convert to HIF records.
chsnodes = (
    nodes_frame
    .drop(columns=OPTIONAL_COLS)
    .rename(columns={'id': 'node'})
    .to_dict(orient='records')
)

In [13]:
# Preview the first three raw hyperedge records.
chs["hyperedges"][:3]

[{'interaction': [454, 640], 'time': 1385982020},
 {'interaction': [1, 939], 'time': 1385982020},
 {'interaction': [185, 258], 'time': 1385982020}]

The hyperedges include a timestamp. HIF uses "edge" instead of "hyperedge" and records each (edge, node) pair as a separate entry in the incidences object. Since no edge name is given, we index the edges by their order in the list.  

The "time" property describes the entire edge, not a single incidence, so it goes into the additionalProperties of the matching entry in the "edges" object in HIF.

We include the network-type and metadata for the source in the HIF format.

In [14]:
# Number the hyperedges by their position in the source list and expose that as "edge".
chshyper = (
    pd.DataFrame(chs['hyperedges'])
    .rename_axis("edge")
    .reset_index()
)

# One record per edge carrying its timestamp.
chsedges = chshyper[['edge', 'time']].to_dict(orient='records')

# One record per (edge, node) pair: explode each interaction list into rows.
chsinc = (
    chshyper.set_index('edge')['interaction']
    .explode()
    .rename('node')
    .reset_index()
    .to_dict(orient="records")
)

In [15]:
# Provenance of the dataset, stored under the HIF "metadata" key.
chs_metadata = {"source": "https://www.cs.cornell.edu/~arb/data/contact-high-school/"}

chs_hif = {
    "network-type": "undirected",
    "metadata": chs_metadata,
    "incidences": chsinc,
    "edges": chsedges,
    "nodes": chsnodes,
}

# Validate against the schema and preview the first three records of each section;
# any failure is printed instead of interrupting the notebook.
try:
    output = validator(chs_hif)
    for section in ('incidences', 'nodes', 'edges'):
        preview = output.get(section, [])[:3]
        print(preview, '\n')
except Exception as err:
    print(err)

[{'edge': 0, 'node': 454}, {'edge': 0, 'node': 640}, {'edge': 1, 'node': 1}] 

[{'node': 454, 'class': 'MP', 'has_facebook': True, 'has_compiled_questionnaire': False, 'sex': 'F', 'attrs': {'facebook_friends': [34, 151, 156, 159, 866, 640, 1232]}}, {'node': 640, 'class': 'MP', 'has_facebook': True, 'has_compiled_questionnaire': False, 'sex': 'M', 'attrs': {'facebook_friends': [151, 159, 454]}}, {'node': 1, 'class': '2BIO3', 'has_facebook': True, 'has_compiled_questionnaire': True, 'sex': 'M', 'attrs': {'facebook_friends': [883, 132, 339, 653, 545, 171, 117, 196, 587, 372, 147, 55, 859, 106, 504, 471, 425, 170, 939, 272, 3, 119, 494, 205, 265, 779, 364, 240, 477, 101, 884], 'questionnaire_friends': [205, 272, 494, 779, 894]}}] 

[{'edge': 0, 'time': 1385982020}, {'edge': 1, 'time': 1385982020}, {'edge': 2, 'time': 1385982020}] 



In [16]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes.
with open('contacts-high-school-hif.json', 'w') as hif_file:
    json.dump(chs_hif, hif_file)