## Create a Static EntitySet and then a Static Hypergraph from the Harry Potter character data

In [1]:
from hypernetx import *
import matplotlib.pyplot as plt
from collections import OrderedDict,defaultdict
import scipy
from scipy.sparse import coo_matrix,issparse
import pandas as pd
import numpy as np
import itertools as it
import importlib as imp
import untitled_StaticEntity as us

### Read Dataset from kaggle

In [2]:
## Read dataset in using pandas. Fix index column or use default pandas index. 
harrydata = pd.read_csv('HarryPotter/datasets/Characters_edit.csv',encoding='unicode_escape').set_index('Id')
harrydata = pd.DataFrame(harrydata)
harrydata

Unnamed: 0_level_0,Name,Gender,Job,House,Wand,Patronus,Species,Blood status,Hair colour,Eye colour,Loyalty,Skills,Birth,Death
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Harry James Potter,Male,Student,Gryffindor,"11"" Holly phoenix feather",Stag,Human,Half-blood,Black,Bright green,Albus Dumbledore | Dumbledore's Army | Order o...,Parseltongue| Defence Against the Dark Arts | ...,"Thursday, July 31, 1980",
2,Ronald Bilius Weasley,Male,Student,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Human,Pure-blood,Red,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Wizard chess | Quidditch goalkeeping,"Saturday, March 1, 1980",
3,Hermione Jean Granger,Female,Student,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Human,Muggle-born,Brown,Brown,Dumbledore's Army | Order of the Phoenix | Hog...,Almost everything,"19 September, 1979",
4,Albus Percival Wulfric Brian Dumbledore,Male,Headmaster,Gryffindor,"15"" Elder Thestral tail hair core",Phoenix,Human,Half-blood,Silver| formerly auburn,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Considered by many to be one of the most power...,Late August 1881,"30 June, 1997"
5,Rubeus Hagrid,Male,Keeper of Keys and Grounds | Professor of Care...,Gryffindor,"16"" Oak unknown core",,Half-Human/Half-Giant,Part-Human,Black,Black,Albus Dumbledore | Order of the Phoenix | Hogw...,Resistant to stunning spells| above average st...,6 December 1928,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Hogwarts kitchens worker (December 1994 - 1998)""",,,,House elf,,Green,,,"A type of magic specific to house-elves, perfo...",44010,"Late March, 1998",,,
140,Kreacher,Male,,,,,,,,,,,,
"Black family's house-elf (?-1996),",,,,,,,,,,,,,,
"Harry Potter's house-elf,",,,,,,,,,,,,,,


In [3]:
## Choose string to fill NaN. These will be set to 0 in system id = sid
harry = harrydata[['House','Blood status','Species','Hair colour','Eye colour']].fillna("Unknown")
for c in harry.columns:
    harry[c] = harry[c].apply(lambda x : x.replace('\xa0',' '))

In [4]:
harry.iloc[:10]

Unnamed: 0_level_0,House,Blood status,Species,Hair colour,Eye colour
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Gryffindor,Half-blood,Human,Black,Bright green
2,Gryffindor,Pure-blood,Human,Red,Blue
3,Gryffindor,Muggle-born,Human,Brown,Brown
4,Gryffindor,Half-blood,Human,Silver| formerly auburn,Blue
5,Gryffindor,Part-Human,Half-Human/Half-Giant,Black,Black
6,Gryffindor,Pure-blood,Human,Blond,Unknown
7,Gryffindor,Pure-blood,Human,Red,Brown
8,Gryffindor,Pure-blood,Human,Red,Brown
9,Gryffindor,Pure-blood,Human,Red,Bright brown
10,Gryffindor,Muggle-born,Human,Black,Brown


### Generate a counter for each column 
- Assign a sid to each value in that column
- Create a reverse counter to grab name from sid

**Questions for Tony and Cliff**
- how should we index the objects? 
- sids are whole numbers starting with column 0 and running through each column
- ldict and rdict are indexed starting with 0 representing missing values
- would we lose anything if we indexed these as -1 for missing values and then
compute the incidence matrix using only nonnegative indices?

In [5]:
ctr = [HNXCount() for c in range(5)]
ldict = OrderedDict()
rdict = OrderedDict()
for idx,c in enumerate(harry.columns):
    ldict[c] = defaultdict(ctr[idx])
    rdict[c] = OrderedDict()
    ldict[c]['Unknown']
    rdict[c][0] = 'Unknown'
    for k in harry[c]:
        ldict[c][k]
        rdict[c][ldict[c][k]] = k
    ldict[c] = dict(ldict[c]) 
dims = tuple([len(ldict[c]) for c in harry.columns])

ldict['House']

{'Unknown': 0,
 'Gryffindor': 1,
 'Ravenclaw': 2,
 'Slytherin': 3,
 'Hufflepuff': 4,
 'Beauxbatons Academy of Magic': 5,
 'Durmstrang Institute': 6,
 'House elf': 7}

In [6]:
rdict['Eye colour'].values()

odict_values(['Unknown', 'Bright green', 'Blue', 'Brown', 'Black', 'Bright brown', 'Hazel', 'Grey', 'Green', 'Dark', 'Pale silvery', 'Silvery', 'Gooseberry', 'Scarlet ', 'Pale, freckled', 'Astonishingly blue', 'Variable', 'One dark, one electric blue', 'Yellowish', 'Ruddy', 'Grey/Blue', 'Dark blue', 'Bright Blue', 'Dark Grey', 'Pale', 'Yellow', '44010'])

### Create an array of tuples giving positions of 1's in incidence Tensor
- The tuples indicate one point across the possible node/edge assignments
- The dimensions of the tuple give the number of unique labels in potential nodes/columns

In [7]:
m = len(harry)
n = len(harry.columns)
data = np.zeros((m,n),dtype=int)
for rid in range(m):
    for cid in range(n):
        c = harry.columns[cid]
        data[rid,cid] = ldict[c][harry.iloc[rid][c]]
dims = tuple([len(ldict[c]) for c in harry.columns])
print(f'{data[:5]}\n\nDimensions = {dims}')

[[1 1 1 1 1]
 [1 2 1 2 2]
 [1 3 1 3 3]
 [1 1 1 4 2]
 [1 4 2 1 4]]

Dimensions = (8, 11, 10, 38, 27)


In [8]:
ldict['House'].keys()

dict_keys(['Unknown', 'Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Beauxbatons Academy of Magic', 'Durmstrang Institute', 'House elf'])

In [9]:
## labeldata replaces numbers in data with their corresponding labels
labeldata = np.array(data,dtype=str)
for rdx in range(m):
    for cdx in range(n):
        c = harry.columns[cdx]
        labeldata[rdx,cdx] = rdict[c][data[rdx,cdx]]
## Make a hypergraph from the House and Blood Status
labeldata[:,:][:5],harrydata['Name'][:5]

(array([['Gryffindor', 'Half-blood', 'Human', 'Black', 'Bright green'],
        ['Gryffindor', 'Pure-blood', 'Human', 'Red', 'Blue'],
        ['Gryffindor', 'Muggle-born', 'Human', 'Brown', 'Brown'],
        ['Gryffindor', 'Half-blood', 'Human', 'Silver| formerly aubu',
         'Blue'],
        ['Gryffindor', 'Part-Human', 'Half-Human/Half-Giant', 'Black',
         'Black']], dtype='<U21'),
 Id
 1                         Harry James Potter
 2                      Ronald Bilius Weasley
 3                      Hermione Jean Granger
 4    Albus Percival Wulfric Brian Dumbledore
 5                              Rubeus Hagrid
 Name: Name, dtype: object)

## Create "Tensor" of incidence

In [10]:
imat = np.zeros(dims,dtype=int)
for d in data:
    imat[tuple(d)] +=1
imat.shape

np.sum(imat)

144

In [11]:
np.sum(imat,axis = (2,3,4))

array([[16,  2,  7,  0,  0, 11,  0,  0,  0,  4,  1],
       [ 2,  7, 14,  5,  1,  9,  0,  0,  0,  0,  0],
       [ 1,  6,  0,  1,  0,  8,  1,  1,  0,  0,  0],
       [ 0,  5, 11,  0,  0, 12,  0,  0,  0,  0,  0],
       [ 1,  4,  2,  1,  0,  5,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [12]:
ldict

OrderedDict([('House',
              {'Unknown': 0,
               'Gryffindor': 1,
               'Ravenclaw': 2,
               'Slytherin': 3,
               'Hufflepuff': 4,
               'Beauxbatons Academy of Magic': 5,
               'Durmstrang Institute': 6,
               'House elf': 7}),
             ('Blood status',
              {'Unknown': 0,
               'Half-blood': 1,
               'Pure-blood': 2,
               'Muggle-born': 3,
               'Part-Human': 4,
               'Pure-blood or half-blood': 5,
               'Part-Goblin': 6,
               'Muggle-born or half-blood': 7,
               'Quarter-Veela': 8,
               'Muggle': 9,
               'Squib': 10}),
             ('Species',
              {'Unknown': 0,
               'Human': 1,
               'Half-Human/Half-Giant': 2,
               'Werewolf': 3,
               'Human (Werewolf traits)': 4,
               'Human(goblin ancestry)': 5,
               'Ghost': 6,
               'Cent

In [13]:
slabels=OrderedDict()
for cdx,c in enumerate(harry.columns):
    slabels.update({c:np.array(list(ldict[c].keys()))})

slabels

OrderedDict([('House',
              array(['Unknown', 'Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff',
                     'Beauxbatons Academy of Magic', 'Durmstrang Institute',
                     'House elf'], dtype='<U28')),
             ('Blood status',
              array(['Unknown', 'Half-blood', 'Pure-blood', 'Muggle-born', 'Part-Human',
                     'Pure-blood or half-blood', 'Part-Goblin',
                     'Muggle-born or half-blood', 'Quarter-Veela', 'Muggle', 'Squib'],
                    dtype='<U25')),
             ('Species',
              array(['Unknown', 'Human', 'Half-Human/Half-Giant', 'Werewolf',
                     'Human (Werewolf traits)', 'Human(goblin ancestry)', 'Ghost',
                     'Centaur', 'Human ', 'Human (Metamorphmagus)'], dtype='<U23')),
             ('Hair colour',
              array(['Unknown', 'Black', 'Red', 'Brown', 'Silver| formerly auburn',
                     'Blond', 'Sandy', 'Auburn', 'Light brown flecked wit

## Create StaticEntity 

In [14]:
imp.reload(us)
E = us.StaticEntity(imat, slabels)
E.dimensions

(8, 11, 10, 38, 27)

In [15]:
E.elements,E.children

(OrderedDict([(0, array([ 0,  1,  2,  5,  9, 10])),
              (1, array([0, 1, 2, 3, 4, 5])),
              (2, array([0, 1, 3, 5, 6, 7])),
              (3, array([1, 2, 5])),
              (4, array([0, 1, 2, 3, 5])),
              (5, array([4, 8])),
              (6, array([5])),
              (7, array([0]))]),
 {'Half-blood',
  'Muggle',
  'Muggle-born',
  'Muggle-born or half-blood',
  'Part-Goblin',
  'Part-Human',
  'Pure-blood',
  'Pure-blood or half-blood',
  'Quarter-Veela',
  'Squib',
  'Unknown'})

In [16]:
E.incidence_matrix(0,1)

array([[1, 1, 1, 0, 1, 0, 0, 1],
       [1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0],
       [1, 1, 1, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0]])

In [17]:
elts = E.elements_by_level(0,1);elts

OrderedDict([(0, array([ 0,  1,  2,  5,  9, 10])),
             (1, array([0, 1, 2, 3, 4, 5])),
             (2, array([0, 1, 3, 5, 6, 7])),
             (3, array([1, 2, 5])),
             (4, array([0, 1, 2, 3, 5])),
             (5, array([4, 8])),
             (6, array([5])),
             (7, array([0]))])

In [18]:
level1,level2 = 0,4
elts = E.elements_by_level(level1,level2)


In [19]:
E.elements_by_level(level1,level2,translate=True)

OrderedDict([('Unknown',
              ['Unknown',
               'Blue',
               'Brown',
               'Black',
               'Grey',
               'Dark',
               'Pale, freckled',
               'Astonishingly blue',
               'One dark, one electric blue',
               'Yellowish',
               'Ruddy',
               'Grey/Blue',
               'Bright Blue',
               'Dark Grey',
               'Pale',
               'Yellow']),
             ('Gryffindor',
              ['Unknown',
               'Bright green',
               'Blue',
               'Brown',
               'Black',
               'Bright brown',
               'Hazel',
               'Grey',
               'Green',
               'Dark']),
             ('Ravenclaw',
              ['Unknown',
               'Blue',
               'Brown',
               'Grey',
               'Dark',
               'Pale silvery',
               'Silvery']),
             ('Slytherin',
             

## Select columns to use from data and create hypergraph

In [20]:
Edx = 0;Ndx = 1
Ename = harry.columns[Edx];Nname = harry.columns[Ndx]

In [21]:
## Add weights to hypergraph 
thisdata = data[:,[Ndx,Edx]].astype(int)
sp,counts = np.unique(thisdata,axis=0,return_counts=True)
sp,counts

(array([[ 0,  0],
        [ 0,  1],
        [ 0,  2],
        [ 0,  4],
        [ 0,  7],
        [ 1,  0],
        [ 1,  1],
        [ 1,  2],
        [ 1,  3],
        [ 1,  4],
        [ 2,  0],
        [ 2,  1],
        [ 2,  3],
        [ 2,  4],
        [ 3,  1],
        [ 3,  2],
        [ 3,  4],
        [ 4,  1],
        [ 4,  5],
        [ 5,  0],
        [ 5,  1],
        [ 5,  2],
        [ 5,  3],
        [ 5,  4],
        [ 5,  6],
        [ 6,  2],
        [ 7,  2],
        [ 8,  5],
        [ 9,  0],
        [10,  0]]),
 array([16,  2,  1,  1,  2,  2,  7,  6,  5,  4,  7, 14, 11,  2,  5,  1,  1,
         1,  1, 11,  9,  8, 12,  5,  1,  1,  1,  2,  4,  1]))

In [22]:
mat = np.zeros((dims[Ndx],dims[Edx]),dtype=int)
for d in range(len(counts)):
    mat[sp[d][0],sp[d][1]] = counts[d]

In [23]:
## the matrix will generate a hypergraph. Non zero entries will be 1's
print(mat[1:,1:])

[[ 7  6  5  4  0  0  0]
 [14  0 11  2  0  0  0]
 [ 5  1  0  1  0  0  0]
 [ 1  0  0  0  1  0  0]
 [ 9  8 12  5  0  1  0]
 [ 0  1  0  0  0  0  0]
 [ 0  1  0  0  0  0  0]
 [ 0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]]


In [24]:
## Generate weights for each of the nodes based on counts relative to the whole
M = mat[1:,1:]
tot = np.sum(M)
radii = dict(zip([rdict[Nname][r] for r in range(1,len(rdict[Nname]))],[0.5+3*np.sum(M[r],axis=0)/tot for r in range(dims[Ndx]-1)]))
radii

{'Half-blood': 1.1804123711340206,
 'Pure-blood': 1.3350515463917527,
 'Muggle-born': 0.7164948453608248,
 'Part-Human': 0.5618556701030928,
 'Pure-blood or half-blood': 1.5824742268041236,
 'Part-Goblin': 0.5309278350515464,
 'Muggle-born or half-blood': 0.5309278350515464,
 'Quarter-Veela': 0.5618556701030928,
 'Muggle': 0.5,
 'Squib': 0.5}

In [25]:
## Generate weights for each of the edges in the same way
Mt = M.transpose()
tott = np.sum(Mt)
radiit = OrderedDict(zip([rdict[Ename][r] for r in range(1,len(rdict[Ename]))],[0.5+3*np.sum(Mt[r],axis=0)/tot for r in range(dims[Edx]-1)]))
radiit

OrderedDict([('Gryffindor', 1.6134020618556701),
             ('Ravenclaw', 1.0257731958762886),
             ('Slytherin', 1.365979381443299),
             ('Hufflepuff', 0.8711340206185567),
             ('Beauxbatons Academy of Magic', 0.5927835051546392),
             ('Durmstrang Institute', 0.5309278350515464),
             ('House elf', 0.5)])

In [26]:
rdict[Nname],rdict[Ename]

(OrderedDict([(0, 'Unknown'),
              (1, 'Half-blood'),
              (2, 'Pure-blood'),
              (3, 'Muggle-born'),
              (4, 'Part-Human'),
              (5, 'Pure-blood or half-blood'),
              (6, 'Part-Goblin'),
              (7, 'Muggle-born or half-blood'),
              (8, 'Quarter-Veela'),
              (9, 'Muggle'),
              (10, 'Squib')]),
 OrderedDict([(0, 'Unknown'),
              (1, 'Gryffindor'),
              (2, 'Ravenclaw'),
              (3, 'Slytherin'),
              (4, 'Hufflepuff'),
              (5, 'Beauxbatons Academy of Magic'),
              (6, 'Durmstrang Institute'),
              (7, 'House elf')]))

In [27]:
edges = [rdict[Ename][k] for k in range(1,dims[Edx])]
nodes = [rdict[Nname][k] for k in range(1,dims[Ndx])]
hmat = np.where(M >0, 1,0)

In [28]:
hmat

array([[1, 1, 1, 1, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 0],
       [1, 1, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0],
       [1, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [29]:
harryshyp = hnx.Hypergraph.from_numpy_array(hmat,node_names=nodes,edge_names=edges)

In [30]:
m,rd,cd = harryshyp.incidence_matrix(index=True)  ## how to force the incidence matrix to agree with order of input?

## by setting a static flag on construction the hmat will be treated as arr and node_names, edge_names will be turned into labels
## in this case the hmat is already the incidence matrix

In [31]:
m.todense()

matrix([[1, 0, 1, 1, 0, 0],
        [1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [1, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0]])

In [32]:
rdict['Blood status'],rd

(OrderedDict([(0, 'Unknown'),
              (1, 'Half-blood'),
              (2, 'Pure-blood'),
              (3, 'Muggle-born'),
              (4, 'Part-Human'),
              (5, 'Pure-blood or half-blood'),
              (6, 'Part-Goblin'),
              (7, 'Muggle-born or half-blood'),
              (8, 'Quarter-Veela'),
              (9, 'Muggle'),
              (10, 'Squib')]),
 {0: 'Muggle-born',
  1: 'Half-blood',
  2: 'Pure-blood or half-blood',
  3: 'Quarter-Veela',
  4: 'Part-Goblin',
  5: 'Pure-blood',
  6: 'Part-Human',
  7: 'Muggle-born or half-blood'})

<img src="HarryPotter/bloodstatus-house.png">

In [33]:
E.level('Half-blood')


(1, 1)

In [34]:
E._arr.__dict__

AttributeError: 'numpy.ndarray' object has no attribute '__dict__'

In [None]:
E.elements_by_level(1,0,translate=True)