# Develop and test hash based node storage

The following implementation requires a postgres database, to work and query json data types. To run this notebook requires therefore to setup a postgres server. I use the following tool, which however works only on a Mac: [https://postgresapp.com](https://postgresapp.com)

Add in pyiron_workflows in function.py in def function_node_factory the following code line:

    "__annotate__": node_function.__annotate if hasattr(node_function, '__annotate__') else None,

### Set path, import etc.

In [1]:
import sys
from pathlib import Path
#sys.path.remove('/Users/joerg/python_projects/git_libs/pyiron_snippets')
sys.path.remove('/Users/joerg/python_projects/git_libs/pyiron_workflow')
sys.path.insert(0, str(Path(Path.cwd()).parent))

In [2]:
# sys.path

In [3]:
import warnings
warnings.filterwarnings("ignore")

%config IPCompleter.evaluation='unsafe'

In [4]:
from pyiron_nodes.development import hash_based_storage as hs



### Perform a few tests for the hash_based_storage module

In [5]:
db = hs.create_nodes_table(echo=False)

hs.list_column_names(db, 'node')

['node_id',
 'name',
 'hash_value',
 'lib_path',
 'creation_date',
 'inputs',
 'outputs',
 'output_ready',
 'file_path']

**Note:** Uncomment the following line if you have modified the table structure. After deleting the table comment this block and rerun the notebook (to run create_nodes_table)

In [6]:
hs.drop_table(db, 'node')
# hs.list_column_names(db, 'node')
db = hs.create_nodes_table(echo=False)

In [7]:
hs.list_table(db)

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,inputs,outputs,output_ready,file_path


In [8]:
hs.remove_nodes_from_db(db, indices=[12, 13]);

In [9]:
df = hs.list_table(db)
df

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,inputs,outputs,output_ready,file_path


In [10]:
hs.transform_data_column(df)

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,outputs,output_ready,file_path


In [11]:
hs.db_query_dict(db, x='[17]')

### Connect to pyiron_workflow

In [12]:
from pyiron_workflow import Workflow
import pyiron_nodes as pn

In [13]:
sin = pn.math.Sin(x=[16])
sin.outputs.sin.value

NOT_DATA

In [14]:
df = hs.list_table(db)
df

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,inputs,outputs,output_ready,file_path


In [15]:
# hs.get_json_size(hs.extract_node_output(sin))# , 
hs.extract_node_output(sin)

{'sin': 'NOT_DATA'}

In [16]:
sin.inputs['x'].connections

[]

In [17]:
hs.extract_node_output(sin, as_string=True)

{'sin': 'NOT_DATA'}

In [18]:
# hs.get_all_connected_input_nodes(sin)
sin.inputs['x'].connections
id(sin)
for label in sin.inputs.labels:
    print (sin.inputs[label].connected)

False


In [19]:
hs.save_node(sin, db) 

1

In [20]:
hs.get_node_storage_path(sin)

'pyiron_nodes/math/Sin'

In [21]:
hs.list_table(db)

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,inputs,outputs,output_ready,file_path
0,1,,7b760e45dd171478eccbca742bfe01fe13ee15dd5ae35b...,pyiron_nodes/math/Sin,2024-11-19 20:55:22.496389,{'x': '[16]'},{'sin': '[-0.28790332]'},True,


In [22]:
sin = pn.math.Sin(x=[12])

# sin12 = hs.run_node(sin, db, verbose=True)

In [23]:
sin_new = hs.create_node('pyiron_nodes.math.Sin')
sin_new

<pyiron_nodes.math.Sin at 0x14c44fe30>

In [24]:
%%time
for node_id in range(2):
    sin2 = hs.get_node_from_db_id(node_id, db)
    if sin2 is not None:
        print (f'Node id: {node_id}, inputs: {sin2.inputs.x.value}, outputs: {sin2.outputs.sin.value}')
    else:
        print (f'Node id: {node_id} does not exist')

Node id: 0 does not exist
Node id: 1, inputs: [16], outputs: [-0.28790332]
CPU times: user 1.45 ms, sys: 503 μs, total: 1.95 ms
Wall time: 2.01 ms


### Real world example

In [25]:
from pyiron_workflow import Workflow                                                                        
import pyiron_nodes as pn

In [26]:
get_pot = pn.atomistic.property.thermodynamics.GetChemicalPotential(element='Pt')
get_pot.run()

{'chemical_potential': -0.00012460841212025286}

In [27]:
get_pot  # why are inputs/outputs for macro not visible?

<pyiron_nodes.atomistic.property.thermodynamics.GetChemicalPotential at 0x108b25ac0>

In [28]:
hs.save_node(get_pot, db)

2

In [29]:
hs.list_table(db)

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,inputs,outputs,output_ready,file_path
0,1,,7b760e45dd171478eccbca742bfe01fe13ee15dd5ae35b...,pyiron_nodes/math/Sin,2024-11-19 20:55:22.496389,{'x': '[16]'},{'sin': '[-0.28790332]'},True,
1,2,,87aa77c6dc5c0e95104ad993eaa7cec21665162d8d2cd3...,pyiron_nodes/atomistic/property/thermodynamics...,2024-11-19 20:55:22.705563,"{'element': 'Pt', 'engine': 'None'}",{'chemical_potential': '-0.00012460841212025286'},True,


In [30]:
# hs.remove_nodes_from_db(db, [26])

In [31]:
get_pot2 = pn.atomistic.property.thermodynamics.GetChemicalPotential(element='Al')

In [32]:
node = hs.run_node(get_pot2, db, verbose=True)
node.inputs.element.value

run_node:  GetChemicalPotential (GetChemicalPotential):
Inputs ['element', 'engine']
OutputsWithInjection ['chemical_potential']
InputSignals ['run', 'accumulate_and_run']
OutputSignals ['ran', 'failed']


'Al'

## Workflow with node as input (convert input node rather than its output to hash)

### Structure-repeat example

In [33]:
Al = pn.atomistic.structure.build.Bulk('Al')
repeat = pn.atomistic.structure.transform.Repeat(structure=Al, repeat_scalar=3)
out = repeat.pull()

In [34]:
hs.get_import_path(repeat)

'pyiron_nodes.atomistic.structure.transform.Repeat'

In [35]:
nodes = hs.get_all_connected_input_nodes(repeat)
hs.get_import_path(nodes['structure'])

'pyiron_nodes.atomistic.structure.build.Bulk'

In [36]:
inp_node = hs.get_all_connected_input_nodes(repeat)['structure']
inp_node.owner
hs.get_node_hash(inp_node.owner, db)

'13cd1fc92443dc022115e14186c183aa1d676f1460961c23067c883916944e30'

### Chemical potential - engine example

In [37]:
engine = pn.atomistic.engine.ase.M3GNet()

In [38]:
elastic_M3GNet = pn.atomistic.property.thermodynamics.GetChemicalPotential(element='Pt', engine=engine)

In [39]:
hs.save_node(elastic_M3GNet, db) 

5

In [40]:
new_node = hs.get_node_from_db_id(5, db)
hs.extract_node_output(new_node)

{'chemical_potential': '-6.065089225769043'}

In [41]:
'a'.split()

['a']

In [42]:
hs.extract_node_output(hs.run_node(new_node, db))

{'chemical_potential': '-6.065089225769043'}

#### Some TODO issues

In [43]:
# TODO: replace hash by node value (it works but is confusing)

hs.extract_node_input(new_node, db)

{'element': 'Pt',
 'engine': 'hash_9447c7f7d5b2506b782b0bef46506999ada65d71a795d3ac8c32d4fbf6eed13e'}

In [44]:
node = hs.get_node_from_db_id(3, db)
node.inputs.engine

<pyiron_workflow.channels.InputData at 0x168c5b590>

In [45]:
node = hs.get_node_from_db_id(5, db)
hash = hs.extract_node_input(node, db)['engine']
engine_node = hs.eval_db_value(hash, db).pull()


In [46]:
elastic_M3GNet.inputs.engine = hs.extract_node_input(hs.get_node_from_db_id(5, db), db)['engine']

In [47]:
# it looks like the following assignment/connection does not work 

elastic_M3GNet.inputs.engine = engine_node

In [48]:
new_node.inputs.engine

<pyiron_workflow.channels.InputData at 0x168a4a1e0>

### Elastic constants

#### With locally defined macro 

In [49]:
from pyiron_workflow import Workflow

wf = Workflow("elastic_constants") 
atomistic = pn.atomistic
from pyiron_nodes.atomistic.property.elastic import InputElasticTensor

wf.engine = atomistic.engine.ase.M3GNet() 
wf.supercell = atomistic.structure.build.CubicBulkCell(element='Al', cell_size=3, vacancy_index=0)
wf.calc = atomistic.calculator.ase.Static(structure=wf.supercell, engine=wf.engine)
wf.elastic = atomistic.property.elastic.ElasticConstants(structure=wf.supercell, 
                                                         engine=wf.engine,
                                                         parameters=InputElasticTensor.dataclass(eps_range=0.01))
out = wf.run()

print (f'Bulkmodulus: {out.elastic__elastic.BV} GPa')



Bulkmodulus: 53.492619696968546 GPa


In [50]:
'storage' in wf.supercell.inputs.channel_dict.keys()
# storage.value.dataclass.hash_output

False

In [51]:
from pyiron_workflow import Workflow
import pyiron_nodes as pn
atomistic = pn.atomistic
from pyiron_nodes.atomistic.property.elastic import InputElasticTensor

@Workflow.wrap.as_macro_node('BV')
def compute_elastic_constants(wf, element='Fe', parameters=InputElasticTensor.dataclass(eps_range=0.02)): 
    
    wf.engine = atomistic.engine.ase.M3GNet()
    wf.supercell = atomistic.structure.build.CubicBulkCell(element=element, cell_size=3, vacancy_index=0)
    wf.calc = atomistic.calculator.ase.Static(structure=wf.supercell, engine=wf.engine)
    wf.elastic = atomistic.property.elastic.ElasticConstants(structure=wf.supercell, engine=wf.engine, parameters=parameters)
    return wf.elastic.outputs.elastic.BV
    
out = compute_elastic_constants(element='Ni').pull() #iter(eps_range=np.linspace(1e-4, 1, 11))
out

{'BV': 207.853238427253}

In [52]:
out

{'BV': 207.853238427253}

In [53]:
for el in ['Ni', 'Al', 'Fe']:
    elastic_node = compute_elastic_constants(element=el, parameters=InputElasticTensor(eps_range=0.001))
    hs.run_node(elastic_node, db)

run_node:  compute_elastic_constants (compute_elastic_constants):
Inputs ['element', 'parameters']
OutputsWithInjection ['BV']
InputSignals ['run', 'accumulate_and_run']
OutputSignals ['ran', 'failed']
run_node:  compute_elastic_constants (compute_elastic_constants):
Inputs ['element', 'parameters']
OutputsWithInjection ['BV']
InputSignals ['run', 'accumulate_and_run']
OutputSignals ['ran', 'failed']
run_node:  compute_elastic_constants (compute_elastic_constants):
Inputs ['element', 'parameters']
OutputsWithInjection ['BV']
InputSignals ['run', 'accumulate_and_run']
OutputSignals ['ran', 'failed']


Note: 
- Works well
- But: for workflows/functions defined in notebook the node cannot be recreated (this is a fundamental issue, save code in db?)

#### With a node defined in the library

In [54]:
engine = pn.atomistic.engine.ase.M3GNet()
structure = pn.atomistic.structure.build.CubicBulkCell('Al', cell_size=3)
elastic = pn.atomistic.property.elastic.ElasticConstants(structure=structure, engine=engine) #, parameters=parameters)

import cloudpickle as pickle
from pyiron_nodes.development.settings import Storage

pcl = pickle.dumps(elastic)
pickle.loads(pcl)

<pyiron_nodes.atomistic.property.elastic.ElasticConstants at 0x30a844200>

In [55]:
elastic.draw();

To make the following line work I had to use cloudpickle, which creates rather large data files. The size could be dramatically reduced when using my json-based concepts.

In [56]:
%%time
out = hs.run_node(elastic, db)
out.outputs.to_value_dict()

Node written to pyiron_nodes/atomistic/property/elastic/ElasticConstants/11/project.pkl
CPU times: user 9.78 s, sys: 1.6 s, total: 11.4 s
Wall time: 2.65 s


{'elastic': OutputElasticAnalysis(BV=54.76303912584876, GV=16.76953645626507, EV=45.64905821777222, nuV=0.3610709615267214, S=array([[ 0.0278311 , -0.01087214, -0.01087214,  0.        ,  0.        ,
          0.        ],
        [-0.01087214,  0.0278311 , -0.01087214,  0.        ,  0.        ,
          0.        ],
        [-0.01087214, -0.01087214,  0.0278311 ,  0.        ,  0.        ,
          0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.05171518,  0.        ,
          0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.05171518,
          0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.05171518]]), BR=54.76303912584876, GR=16.13119270096839, ER=44.06675850300753, nuR=0.3658865565582793, BH=54.76303912584876, GH=16.45036457861673, EH=44.859305469548026, nuH=0.36347450705922707, AVR=1.940211574783087, energy_0=-399.76531982421875, strain_energy=[[(-0.005, -399.6789855957031), (-0.0025, 

In [57]:
elastic._settings.outputs.settings.value.hash_output == True

True

In [58]:
wf.elastic.inputs

<pyiron_workflow.io.Inputs at 0x168c689b0>

In [59]:
hs.list_table(db)

Unnamed: 0,node_id,name,hash_value,lib_path,creation_date,inputs,outputs,output_ready,file_path
0,1,,7b760e45dd171478eccbca742bfe01fe13ee15dd5ae35b...,pyiron_nodes/math/Sin,2024-11-19 20:55:22.496389,{'x': '[16]'},{'sin': '[-0.28790332]'},True,
1,2,,87aa77c6dc5c0e95104ad993eaa7cec21665162d8d2cd3...,pyiron_nodes/atomistic/property/thermodynamics...,2024-11-19 20:55:22.705563,"{'element': 'Pt', 'engine': 'None'}",{'chemical_potential': '-0.00012460841212025286'},True,
2,3,,a646aeb88e3106cc00ecbf4d181840b4e33beb45a81124...,pyiron_nodes/atomistic/property/thermodynamics...,2024-11-19 20:55:22.781322,"{'element': 'Al', 'engine': 'None'}",,False,
3,4,,9447c7f7d5b2506b782b0bef46506999ada65d71a795d3...,pyiron_nodes/atomistic/engine/ase/M3GNet,2024-11-19 20:55:22.812974,{},,False,
4,5,,6df6cbc74ca4803190726aa36c509319321d977f51d5f6...,pyiron_nodes/atomistic/property/thermodynamics...,2024-11-19 20:55:22.815578,"{'element': 'Pt', 'engine': 'hash_9447c7f7d5b2...",{'chemical_potential': '-6.065089225769043'},True,
5,6,,e53d422081270f966e7cdd5206baec55cbf16174fa31b7...,pyiron_nodes/atomistic/property/elastic/InputE...,2024-11-19 20:55:31.808686,"{'num_of_point': '5', 'eps_range': '0.001', 's...",,False,
6,7,,cd0e1c9d548e217028cc43761fdd2d6c9f45dc7038a6df...,__main__/compute_elastic_constants,2024-11-19 20:55:34.715374,"{'element': 'Ni', 'parameters': 'hash_e53d4220...",,False,
7,8,,4163ac27de2ece234587c70408474b0e4d9c4ab91acf39...,__main__/compute_elastic_constants,2024-11-19 20:55:37.089738,"{'element': 'Al', 'parameters': 'hash_e53d4220...",,False,
8,9,,baa1f2a9a1a3dea4fedb0e06969bbdd4e351e293d428e2...,__main__/compute_elastic_constants,2024-11-19 20:55:39.046250,"{'element': 'Fe', 'parameters': 'hash_e53d4220...",,False,
9,10,,4b3fb2a64ff904139fd3be9ef6ee4074f1a62b5dad9ff5...,pyiron_nodes/atomistic/structure/build/CubicBu...,2024-11-19 20:55:39.089565,"{'element': 'Al', 'cell_size': '3', 'vacancy_i...",,False,


In [60]:
hs.get_node_from_db_id(11, db)

<pyiron_nodes.atomistic.property.elastic.ElasticConstants at 0x30ab45ca0>

In [61]:
hs.get_node_from_db_id(11, db).outputs.elastic.value.BV

54.76303912584876

In [62]:
import pyiron_nodes as pn

wf = Workflow('phonons')
wf.engine = pn.atomistic.engine.ase.M3GNet()
wf.bulk = pn.atomistic.structure.build.CubicBulkCell('Al', cell_size=3)
wf.phonopy = pn.atomistic.property.phonons.CreatePhonopy(structure=wf.bulk, engine=wf.engine) #, parameters=parameters)
wf.dos = pn.atomistic.property.phonons.GetTotalDos(phonopy=wf.phonopy.outputs.phonopy)

# wf.run()

In [63]:
session = db.Session()

# Check if a node with this node_id exists
q = session.query(db.Node).filter_by(node_id=10).scalar()
q.inputs, q.output_ready

({'element': 'Al', 'cell_size': '3', 'vacancy_index': 'None'}, False)

In [64]:
engine = db.Session().get_bind()
engine.pool.status()

'Pool size: 5  Connections in pool: 0 Current Overflow: -4 Current Checked out connections: 1'

### Summary

The hash-based storage approach works well for all cases studied. The only drawback is the need for cloudpickle, which is not a format for storing data and also results in file sizes much larger than what is needed to store the actual data. 

In the present implementation, all nodes are hashed and included in the database. Since we use pull in 'run_node', the output is only stored for the last node where pull is applied. This simple schema fails if the last node is a plot routine, where the output should typically not be stored but the lower lying nodes should be output stored.

A possible solution would be to introduce default flags for output storage. All (interactive) plots should get a false flag, also atomic structures etc.