## Create a Static EntitySet and then a Static Hypergraph from the Harry Potter character data

In [1]:
from hypernetx import *
import matplotlib.pyplot as plt
from collections import OrderedDict,defaultdict
import scipy
from scipy.sparse import coo_matrix,issparse
import pandas as pd
import numpy as np
import itertools as it
import importlib as imp
import untitled_StaticEntity as us

### Read Dataset from kaggle

In [2]:
## Read dataset in using pandas. Fix index column or use default pandas index. 
harrydata = pd.read_csv('HarryPotter/datasets/Characters_edit.csv',encoding='unicode_escape').set_index('Id')
harrydata = pd.DataFrame(harrydata)
harrydata

Unnamed: 0_level_0,Name,Gender,Job,House,Wand,Patronus,Species,Blood status,Hair colour,Eye colour,Loyalty,Skills,Birth,Death
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Harry James Potter,Male,Student,Gryffindor,"11"" Holly phoenix feather",Stag,Human,Half-blood,Black,Bright green,Albus Dumbledore | Dumbledore's Army | Order o...,Parseltongue| Defence Against the Dark Arts | ...,"Thursday, July 31, 1980",
2,Ronald Bilius Weasley,Male,Student,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Human,Pure-blood,Red,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Wizard chess | Quidditch goalkeeping,"Saturday, March 1, 1980",
3,Hermione Jean Granger,Female,Student,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Human,Muggle-born,Brown,Brown,Dumbledore's Army | Order of the Phoenix | Hog...,Almost everything,"19 September, 1979",
4,Albus Percival Wulfric Brian Dumbledore,Male,Headmaster,Gryffindor,"15"" Elder Thestral tail hair core",Phoenix,Human,Half-blood,Silver| formerly auburn,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Considered by many to be one of the most power...,Late August 1881,"30 June, 1997"
5,Rubeus Hagrid,Male,Keeper of Keys and Grounds | Professor of Care...,Gryffindor,"16"" Oak unknown core",,Half-Human/Half-Giant,Part-Human,Black,Black,Albus Dumbledore | Order of the Phoenix | Hogw...,Resistant to stunning spells| above average st...,6 December 1928,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Wilhelmina Grubbly-Plank,Female,Substitute professor of Care of Magical Creatu...,,Unknown,Non-corporeal,Human,,Grey,,Hogwarts School of Witchcraft and Wizardry,,,
137,Fenrir Greyback,Male,,,Unknown,,Werewolf,,Grey,,Lord Voldemort | Death Eaters,Physical combat,Pre 1945,
138,Gellert Grindelwald,Male,Revolutionary leader(c. 1920s[6]  1945),,"15"", Elder, Thestral tail hair core",,Human,Pure-blood or half-blood,Blond,Blue,Gellert Grindelwald's Acolytes,Duelling,"Saturday, February 25, 1905","March, 1998"
139,Dobby,Male,"Malfoy family's house-elf (? - 1993),",,,,Elf,,,,Hogwarts School of Witchcraft and Wizardry,,,


In [3]:
## Choose string to fill NaN. These will be set to 0 in system id = sid
harry = harrydata[['House','Blood status','Species','Hair colour','Eye colour']].fillna("Unknown")
for c in harry.columns:
    harry[c] = harry[c].apply(lambda x : x.replace('\xa0',' ')).apply(lambda x: x.replace('Unknown',f'Unknown {c}'))

In [4]:
harry

Unnamed: 0_level_0,House,Blood status,Species,Hair colour,Eye colour
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Gryffindor,Half-blood,Human,Black,Bright green
2,Gryffindor,Pure-blood,Human,Red,Blue
3,Gryffindor,Muggle-born,Human,Brown,Brown
4,Gryffindor,Half-blood,Human,Silver| formerly auburn,Blue
5,Gryffindor,Part-Human,Half-Human/Half-Giant,Black,Black
...,...,...,...,...,...
136,Unknown House,Unknown Blood status,Human,Grey,Unknown Eye colour
137,Unknown House,Unknown Blood status,Werewolf,Grey,Unknown Eye colour
138,Unknown House,Pure-blood or half-blood,Human,Blond,Blue
139,Unknown House,Unknown Blood status,Elf,Unknown Hair colour,Unknown Eye colour


### Generate a counter for each column 
- Assign a sid to each value in that column
- Create a reverse counter to grab name from sid

**Questions for Tony and Cliff**
- how should we index the objects? 
- sids are whole numbers starting with column 0 and running through each column
- ldict and rdict are indexed starting with 0 representing missing values
- would we lose anything if we indexed these as -1 for missing values and then
compute the incidence matrix using only nonnegative indices?

In [5]:
n = len(harry.columns)
ctr = [HNXCount() for c in range(n)]
ldict = OrderedDict()
rdict = OrderedDict()
for idx,c in enumerate(harry.columns):
    ldict[c] = defaultdict(ctr[idx])
    rdict[c] = OrderedDict()
    ldict[c][f'Unknown {c}']
    rdict[c][0] = f'Unknown {c}'
    for k in harry[c]:
        ldict[c][k]
        rdict[c][ldict[c][k]] = k
    ldict[c] = dict(ldict[c]) 
dims = tuple([len(ldict[c]) for c in harry.columns])
dims

(7, 11, 10, 36, 26)

In [6]:
rdict['Eye colour'].values()

odict_values(['Unknown Eye colour', 'Bright green', 'Blue', 'Brown', 'Black', 'Bright brown', 'Hazel', 'Grey', 'Green', 'Dark', 'Pale silvery', 'Silvery', 'Gooseberry', 'Scarlet ', 'Pale, freckled', 'Astonishingly blue', 'Variable', 'One dark, one electric blue', 'Yellowish', 'Ruddy', 'Grey/Blue', 'Dark blue', 'Bright Blue', 'Dark Grey', 'Pale', 'Yellow'])

### Create an array of tuples giving positions of 1's in incidence Tensor
- The tuples indicate one point across the possible node/edge assignments
- The dimensions of the tuple give the number of unique labels in potential nodes/columns

In [7]:
m = len(harry)
n = len(harry.columns)
data = np.zeros((m, n), dtype=int)
for rid in range(m):
    for cid in range(n):
        c = harry.columns[cid]
        data[rid, cid] = ldict[c][harry.iloc[rid][c]]
udata = np.unique(data,axis=0,return_counts=True)

In [8]:
np.concatenate([udata[0],np.expand_dims(udata[1],axis=1)],axis=1)

array([[ 0,  0,  0,  0,  0,  1],
       [ 0,  0,  1,  0,  0,  2],
       [ 0,  0,  1,  1,  0,  1],
       [ 0,  0,  1,  1,  3,  1],
       [ 0,  0,  1,  1,  4,  1],
       [ 0,  0,  1,  2,  3,  1],
       [ 0,  0,  1, 19,  0,  1],
       [ 0,  0,  1, 19, 25,  1],
       [ 0,  0,  1, 26, 18,  1],
       [ 0,  0,  3, 19,  0,  1],
       [ 0,  0,  6, 20,  4,  1],
       [ 0,  0,  7, 16, 15,  1],
       [ 0,  0,  9,  0,  0,  1],
       [ 0,  1,  1, 19, 22,  1],
       [ 0,  1,  1, 29,  3,  1],
       [ 0,  2,  1,  2,  7,  1],
       [ 0,  2,  1, 15,  0,  1],
       [ 0,  2,  1, 18,  4,  1],
       [ 0,  2,  1, 19,  0,  1],
       [ 0,  2,  1, 19, 17,  1],
       [ 0,  2,  1, 27, 20,  1],
       [ 0,  2,  1, 31, 23,  1],
       [ 0,  5,  1,  0,  0,  2],
       [ 0,  5,  1,  1,  0,  1],
       [ 0,  5,  1,  3, 19,  1],
       [ 0,  5,  1,  5,  2,  1],
       [ 0,  5,  1, 10,  9,  1],
       [ 0,  5,  1, 15,  0,  1],
       [ 0,  5,  1, 19,  0,  1],
       [ 0,  5,  1, 21, 14,  1],
       [ 0

In [10]:
len(remove_row_duplicates(data,add_counts=False))

126

In [11]:
ldict['House'].keys()

dict_keys(['Unknown House', 'Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Beauxbatons Academy of Magic', 'Durmstrang Institute'])

In [12]:
## labeldata replaces numbers in data with their corresponding labels
labeldata = np.array(data,dtype=str)
for rdx in range(m):
    for cdx in range(n):
        c = harry.columns[cdx]
        labeldata[rdx,cdx] = rdict[c][data[rdx,cdx]]
## Make a hypergraph from the House and Blood Status
labeldata[:,:][:5],harrydata['Name'][:5]

(array([['Gryffindor', 'Half-blood', 'Human', 'Black', 'Bright green'],
        ['Gryffindor', 'Pure-blood', 'Human', 'Red', 'Blue'],
        ['Gryffindor', 'Muggle-born', 'Human', 'Brown', 'Brown'],
        ['Gryffindor', 'Half-blood', 'Human', 'Silver| formerly aubu',
         'Blue'],
        ['Gryffindor', 'Part-Human', 'Half-Human/Half-Giant', 'Black',
         'Black']], dtype='<U21'),
 Id
 1                         Harry James Potter
 2                      Ronald Bilius Weasley
 3                      Hermione Jean Granger
 4    Albus Percival Wulfric Brian Dumbledore
 5                              Rubeus Hagrid
 Name: Name, dtype: object)

## Create "Tensor" of incidence and remove duplicates

In [13]:
imat = np.zeros(dims,dtype=int)
dedupe = OrderedDict()
for d in data:
    imat[tuple(d)] +=1
imat.shape

np.sum(imat)

140

In [14]:
np.sum(imat,axis = (2,3,4,5))

AxisError: axis 5 is out of bounds for array of dimension 5

In [None]:
ldict

In [None]:
slabels=OrderedDict()
for cdx,c in enumerate(harry.columns):
    slabels.update({c:np.array(list(ldict[c].keys()))})

slabels

## Create StaticEntity 

In [None]:
imp.reload(us)
E = us.StaticEntity(imat, slabels)
E.dimensions

In [None]:
E.elements,E.children

In [None]:
E.incidence_matrix(0,1)

In [None]:
elts = E.elements_by_level(0,1);elts

In [None]:
level1,level2 = 0,4
elts = E.elements_by_level(level1,level2)


In [None]:
E.elements_by_level(level1,level2,translate=True)

## Select columns to use from data and create hypergraph

In [None]:
Edx = 0;Ndx = 1
Ename = harry.columns[Edx];Nname = harry.columns[Ndx]

In [None]:
## Add weights to hypergraph 
thisdata = data[:,[Ndx,Edx]].astype(int)
sp,counts = np.unique(thisdata,axis=0,return_counts=True)
sp,counts

In [None]:
mat = np.zeros((dims[Ndx],dims[Edx]),dtype=int)
for d in range(len(counts)):
    mat[sp[d][0],sp[d][1]] = counts[d]

In [None]:
## the matrix will generate a hypergraph. Non zero entries will be 1's
print(mat[1:,1:])

In [None]:
## Generate weights for each of the nodes based on counts relative to the whole
M = mat[1:,1:]
tot = np.sum(M)
radii = dict(zip([rdict[Nname][r] for r in range(1,len(rdict[Nname]))],[0.5+3*np.sum(M[r],axis=0)/tot for r in range(dims[Ndx]-1)]))
radii

In [None]:
## Generate weights for each of the edges in the same way
Mt = M.transpose()
tott = np.sum(Mt)
radiit = OrderedDict(zip([rdict[Ename][r] for r in range(1,len(rdict[Ename]))],[0.5+3*np.sum(Mt[r],axis=0)/tot for r in range(dims[Edx]-1)]))
radiit

In [None]:
rdict[Nname],rdict[Ename]

In [None]:
edges = [rdict[Ename][k] for k in range(1,dims[Edx])]
nodes = [rdict[Nname][k] for k in range(1,dims[Ndx])]
hmat = np.where(M >0, 1,0)

In [None]:
hmat

In [None]:
harryshyp = hnx.Hypergraph.from_numpy_array(hmat,node_names=nodes,edge_names=edges)

In [None]:
m,rd,cd = harryshyp.incidence_matrix(index=True)  ## how to force the incidence matrix to agree with order of input?

## by setting a static flag on construction the hmat will be treated as arr and node_names, edge_names will be turned into labels
## in this case the hmat is already the incidence matrix

In [None]:
m.todense()

In [None]:
rdict['Blood status'],rd

<img src="HarryPotter/bloodstatus-house.png">

In [None]:
E.level('Half-blood')


In [None]:
E

In [None]:
E.uidset

In [None]:
fig,ax = plt.subplots(figsize=(15,10))
hnx.draw(harryshyp,node_radius=radii)

In [None]:
import networkx as nx
layout = nx.spring_layout
layout(harryshyp.bipartite())