# Create prototype owl database for pyiron (proof of concept)

Aim: Demonstrate the key steps to build a simple ontology using owlready2, apply a reasoner, and store the result in an OWL file. owlready2 is used here because it currently appears to be the only tool that provides a reasoner (which is important for the ontology community). However, it would be easy to implement the same functionality with database tools such as SQLAlchemy.

In [1]:
import os

import owlready2 as owl
import pandas

owlready2 depends on Java; you may already have a java executable and you are free to use it instead, but by default we will use the java executable provided by openjdk that was added to your conda environment when you installed pyiron_ontology (if you didn't have openjdk already).

In [2]:
# To look up a system-wide java executable instead, use:
# from shutil import which
# which('java')
# Only point owlready2 at the conda-provided JVM when a conda environment is
# active. Without CONDA_PREFIX, os.path.join(None, ...) would raise a TypeError,
# so in that case owlready2's own JAVA_EXE setting is left untouched.
conda_prefix = os.environ.get("CONDA_PREFIX")
if conda_prefix is not None:
    owl.JAVA_EXE = os.path.join(conda_prefix, "lib", "jvm", "bin", "java")

## Construct owl file fully in python and convert it into a csv file

This is useful for demonstration purposes. In the actual implementation we will start directly with the csv file that can be obtained e.g. from the pyironObject or from the ironFlow nodes.

In [3]:
onto = owl.get_ontology("file://pyiron.owl")

with onto:
    # ------------------------------------------------------------------
    # Classes
    # ------------------------------------------------------------------
    # Root class of every entity defined in this ontology.
    class PyObject(owl.Thing):
        comment = 'my pyiron object'

    # Base class of codes, their inputs/outputs and the generic parameters.
    class Parameter(PyObject):
        @property
        def description(self):
            """Return the description of the first assigned generic parameter.

            Returns:
                The ``description`` of ``self.generic_parameter[0]``, or ``None``
                when no generic parameter is assigned.
            """
            if len(self.generic_parameter) > 0:
                return self.generic_parameter[0].description
            return None

    class InputParameter(Parameter):
        # @property
        def consistent_output(self, additional_conditions=None):
            """Return the parameters that are consistent with this input.

            Candidates are all parameters attached to this input's first generic
            parameter (``has_parameters``). If condition labels are present, only
            candidates whose ``has_options`` contain every condition are kept.

            Args:
                additional_conditions: Labels used *instead of* this input's own
                    ``has_conditions``. ``None`` selects ``has_conditions``.

            Returns:
                list: The matching parameters. Empty when this input has no
                generic parameter assigned.
            """
            if len(self.generic_parameter) == 0:
                # Nothing to match against; indexing [0] would raise IndexError.
                return []
            candidates = self.generic_parameter[0].has_parameters

            if additional_conditions is None:
                conditions = self.has_conditions
            else:
                conditions = additional_conditions

            if len(conditions) > 0:
                return [p for p in candidates if is_subset(conditions, p.has_options)]
                # return list(set.intersection(*map(set,[i.has_optional_objects for i in conditions])))
            return candidates

    # class MandatoryInputParameter(InputParameter): pass

    class OutputParameter(Parameter): pass

    class GenericParameter(Parameter):
        # Plain attribute shadowing the read-only ``description`` property of
        # Parameter, so that each individual can carry its own description.
        description = ""

    class Code(Parameter): pass
        # generic_parameter=[Executable]
    # class AtomicStructure(Parameter):
    #     decription='Atomic structure. Contains at least positions and chemical species.'

    # Tags that are used as domains, options and conditions of parameters.
    class Label(PyObject): pass

    # ------------------------------------------------------------------
    # Properties: Parameter -> Label, each with its inverse Label -> Parameter
    # ------------------------------------------------------------------
    class has_conditions(Parameter >> Label): pass
    class has_transitive_conditions(Parameter >> Label): pass  # condition to fulfill to fulfill option in code
    class has_options(Parameter >> Label): pass

    class has_transitive_objects(Label >> Parameter):
        inverse_property = has_transitive_conditions

    class has_conditional_objects(Label >> Parameter):
        inverse_property = has_conditions

    class has_optional_objects(Label >> Parameter):
        inverse_property = has_options

    # ------------------------------------------------------------------
    # Data properties (string valued).
    # ``python_name`` is the attribute/keyword name used on individuals.
    # ------------------------------------------------------------------
    class has_symbol(GenericParameter >> str):
        class_property_type = ["some"]
        python_name = "symbols"

    class has_unit(Parameter >> str):
        class_property_type = ["some"]
        python_name = "unit"

    # ------------------------------------------------------------------
    # Domain membership
    # ------------------------------------------------------------------
    class is_in_domains(Parameter >> Label):
        class_property_type = ["some"]
        python_name = "domain"

    class domain_has_codes(Label >> Parameter):
        python_name = "has_objects"
        inverse_property = is_in_domains

    # ------------------------------------------------------------------
    # Link between a concrete parameter and its generic parameter
    # ------------------------------------------------------------------
    class has_generic_parameter(Parameter >> GenericParameter):
        class_property_type = ["some"]
        python_name = "generic_parameter"

    class generic_parameter_has(GenericParameter >> Parameter):
        python_name = "has_parameters"
        inverse_property = has_generic_parameter

    # ------------------------------------------------------------------
    # Input/output relations between parameters and codes
    # ------------------------------------------------------------------
    class is_input_of(InputParameter >> Code):
        class_property_type = ["some"]
        python_name = "input_in"

    class has_input(Code >> InputParameter):
        python_name = "input"
        inverse_property = is_input_of

    class is_mandatory_input_of(InputParameter >> Code):
        class_property_type = ["some"]
        python_name = "mandatory_input_in"

    class has_mandatory_input(Code >> InputParameter):
        python_name = "mandatory_input"
        inverse_property = is_mandatory_input_of

    class is_output_of(OutputParameter >> Code):
        class_property_type = ["some"]
        python_name = "output_of"      # python name has to be unique (even for different class)

    class has_output(Code >> OutputParameter):
        python_name = "output"
        inverse_property = is_output_of

    owl.AllDisjoint([InputParameter, OutputParameter, Label])
    
    
# ----------------------------------------------------------------------
# Labels
# ----------------------------------------------------------------------
lblAtomistic = Label(name='Atomistic')
lblCode = Label(name='lCode')
lblDFT = Label(name='DFT')
lblMaterialProperty = Label(name='MaterialProperty')
lblPeriodicBoundaryConditions = Label(name='PeriodicBoundaryConditions')
lblUserInput = Label(name='UserInput', comment='Easy to provide input. Can be used to start a workflow')
lblBulk3DCrystal = Label(name='Bulk3dStructure', comment='Bulk 3d structure generated/needed. Has a well defined volume.')
lblAtomisticEnergyCalculator = Label(name='AtomisticEnergyCalculator', comment='Code to compute the energy of an atomic structure')

# ----------------------------------------------------------------------
# Generic parameters shared by several codes
# ----------------------------------------------------------------------
ChemicalElement = GenericParameter(name='ChemicalElement',
                                   description='Single chemical element',
                                   domain=[lblAtomistic, lblUserInput])
AtomicStructure = GenericParameter(name='AtomicStructure',
                                   description='Contains all information to construct an atomic structure (molecule, crystal, etc.)',
                                   domain=[lblAtomistic])
Executable = GenericParameter(name='Executable',
                              description='Code that requires input and produces output',
                              domain=[])

# ----------------------------------------------------------------------
# Structure
# ----------------------------------------------------------------------
CreateStructureBulk = Code(name='CreateStructureBulk',
                           domain=[lblAtomistic, lblCode],
                           generic_parameter=[Executable])
CreateStructureBulk_element = InputParameter(name=f'{CreateStructureBulk.name}/input/element',
                                             mandatory_input_in=[CreateStructureBulk],
                                             generic_parameter=[ChemicalElement])
CreateStructureBulk_structure = OutputParameter(name=f'{CreateStructureBulk.name}/output/structure',
                                                output_of=[CreateStructureBulk],
                                                generic_parameter=[AtomicStructure],
                                                has_options=[lblBulk3DCrystal])

CreateSurface = Code(name='CreateSurface',
                     domain=[lblAtomistic, lblCode],
                     generic_parameter=[Executable])
CreateSurface_element = InputParameter(name=f'{CreateSurface.name}/input/element',
                                       mandatory_input_in=[CreateSurface],
                                       generic_parameter=[ChemicalElement])
CreateSurface_structure = OutputParameter(name=f'{CreateSurface.name}/output/structure',
                                          output_of=[CreateSurface],
                                          generic_parameter=[AtomicStructure],
                                          has_options=[])

# ----------------------------------------------------------------------
# Murnaghan
# ----------------------------------------------------------------------
Bulkmodulus = GenericParameter(name='Bulk_modulus',
                               description='https://en.wikipedia.org/wiki/Bulk_modulus',
                               symbols=['B', 'K'],
                               unit=['MPa'],
                               domain=[lblMaterialProperty])
# Keyword was misspelled ``decription``, which left the description unset.
Bprime = GenericParameter(name='B_prime',
                          description='First derivative of Bulk modulus with respect to volume',
                          symbols=['Bprime'],
                          unit=['1'],
                          domain=[lblMaterialProperty])

Murnaghan = Code(name='Murnaghan',
                 domain=[lblAtomistic, lblCode],
                 generic_parameter=[Executable])
Murnaghan_Bulkmodulus = OutputParameter(name=f'{Murnaghan.name}/output/equilibrium_bulk_modulus',
                                        output_of=[Murnaghan],
                                        generic_parameter=[Bulkmodulus],
                                        unit=['GPa'])
Murnaghan_Bprime = OutputParameter(name=f'{Murnaghan.name}/output/equilibrium_b_prime',
                                   output_of=[Murnaghan],
                                   generic_parameter=[Bprime])
Murnaghan_Ref_Job = InputParameter(name=f'{Murnaghan.name}/ref_job',
                                   mandatory_input_in=[Murnaghan],
                                   generic_parameter=[Executable],
                                   has_conditions=[lblBulk3DCrystal, lblAtomisticEnergyCalculator])

# ----------------------------------------------------------------------
# DFT
# ----------------------------------------------------------------------
EnergyCutoff = GenericParameter(name='EnergyCutoff',
                                description='The cutoff on the number of plane wave functions being utilized as basis functions to represent the wavefunction')

VASP = Code(name='VASP',
            domain=[lblAtomistic, lblCode, lblDFT],
            has_options=[lblBulk3DCrystal, lblAtomisticEnergyCalculator],
            generic_parameter=[Executable])
VASP_ENCUT = InputParameter(name='ENCUT',
                            input_in=[VASP],
                            generic_parameter=[EnergyCutoff],
                            unit=['eV'])
VASP_IBRAV = InputParameter(name='IBRAV',
                            input_in=[VASP])
VASP_Structure = InputParameter(name=f'{VASP.name}/input/structure',
                                mandatory_input_in=[VASP],
                                generic_parameter=[AtomicStructure],
                                has_transitive_conditions=[lblBulk3DCrystal])

# Individual names must be unique within the ontology: VASP and LAMMPS both
# used the bare name 'ETOT', so the second call re-used (and overwrote) the
# first individual. The names are therefore prefixed with the code name.
VASP_ETOT = OutputParameter(name=f'{VASP.name}/output/ETOT', output_of=[VASP])

# ----------------------------------------------------------------------
# LAMMPS
# ----------------------------------------------------------------------
LAMMPS = Code(name='LAMMPS',
              domain=[lblAtomistic, lblCode],
              has_options=[lblBulk3DCrystal, lblAtomisticEnergyCalculator],
              generic_parameter=[Executable])

LAMMPS_Structure = InputParameter(name=f'{LAMMPS.name}/input/structure',
                                  mandatory_input_in=[LAMMPS],
                                  generic_parameter=[AtomicStructure],
                                  has_transitive_conditions=[lblBulk3DCrystal])

LAMMPS_ETOT = OutputParameter(name=f'{LAMMPS.name}/output/ETOT', output_of=[LAMMPS])

# Close the world for everything below PyObject, then run the Pellet reasoner
# (inferring object- and data-property values) and store the ontology.
owl.close_world(PyObject)
owl.sync_reasoner_pellet(infer_property_values=True, infer_data_property_values=True, debug=0)

onto.save()

def is_subset(a, b):
    """Return True if every element of ``a`` is contained in ``b``.

    Uses the builtin ``all`` so the function does not depend on numpy being
    imported before it is called.

    Args:
        a: Iterable of elements to look for.
        b: Container that is tested with ``in``.

    Returns:
        bool: ``True`` if all elements of ``a`` are in ``b`` (also for empty ``a``).
    """
    return all(aa in b for aa in a)

In [4]:
onto_orig = owl.get_ontology("file://pyiron.owl").load()

#### A few code examples of how to access the ontology

In [5]:
individuals = list(onto_orig.individuals())

In [9]:
# Pick the first individual of the ontology and the first property it has.
i = individuals[0]
ii = list(i.get_properties())[0]
# Show the inverse of that property together with all pairs it relates.
ii.inverse_property, list(ii.get_relations())

(pyiron.is_in_domains,
 [(pyiron.Atomistic, pyiron.ChemicalElement),
  (pyiron.UserInput, pyiron.ChemicalElement),
  (pyiron.Atomistic, pyiron.AtomicStructure),
  (pyiron.Atomistic, pyiron.CreateStructureBulk),
  (pyiron.lCode, pyiron.CreateStructureBulk),
  (pyiron.Atomistic, pyiron.CreateSurface),
  (pyiron.lCode, pyiron.CreateSurface),
  (pyiron.MaterialProperty, pyiron.Bulk_modulus),
  (pyiron.MaterialProperty, pyiron.B_prime),
  (pyiron.Atomistic, pyiron.Murnaghan),
  (pyiron.lCode, pyiron.Murnaghan),
  (pyiron.Atomistic, pyiron.VASP),
  (pyiron.lCode, pyiron.VASP),
  (pyiron.DFT, pyiron.VASP),
  (pyiron.Atomistic, pyiron.LAMMPS),
  (pyiron.lCode, pyiron.LAMMPS)])

### Extract pandas Table and csv-file from ontology

In [14]:
# Properties listed here are the inverse side of another property and are
# skipped when the table is built.
inverse_list = [
    'has_objects', 'has_transitive_objects', 'has_conditional_objects',
    'has_optional_objects', 'has_parameters', 'output', 'mandatory_input', 'input',
]


def _individual_to_record(individual):
    """Collect class, name and all non-inverse property values of one individual."""
    record = {
        'class': individual.is_instance_of[0].name,
        'name': individual.name,
    }
    for prop in list(individual.get_properties()):
        attr = prop.python_name
        if attr in inverse_list:
            continue
        try:
            values = getattr(individual, attr)
        except AttributeError:
            # Show which individual/property failed before propagating the error.
            print(individual, individual.__dict__, attr)
            raise
        # Entries that have a ``name`` are stored by name, everything else as is.
        record[attr] = [v.name if hasattr(v, 'name') else v for v in values]
    return record


individuals = list(onto_orig.individuals())
obj_lst = [_individual_to_record(individual) for individual in individuals]
df = pandas.DataFrame(obj_lst)

# Order the rows by class (in the order given by ``sorter``) and write the csv file.
sorter = ['Label', 'GenericParameter', 'Code', 'InputParameter', 'OutputParameter']
df['class'] = pandas.Categorical(df['class'], sorter)
df = df.sort_values(by='class')
df.to_csv('pyiron_onto.csv', index=False)

df

pyiron.Bulk_modulus {'namespace': get_ontology("file://pyiron.owl#"), '_name': 'Bulk_modulus', '_equivalent_to': [], 'storid': 345, 'is_a': [pyiron.GenericParameter, pyiron.has_conditions.only(owl.Nothing), pyiron.has_transitive_conditions.only(owl.Nothing), pyiron.has_options.only(owl.Nothing), pyiron.is_in_domains.only(OneOf([pyiron.MaterialProperty])), pyiron.has_generic_parameter.only(owl.Nothing), pyiron.generic_parameter_has.only(OneOf([pyiron.Murnaghan/output/equilibrium_bulk_modulus]))], 'description': 'https://en.wikipedia.org/wiki/Bulk_modulus', 'has_conditions': [], 'has_transitive_conditions': [], 'has_options': [], 'domain': [pyiron.MaterialProperty], 'generic_parameter': [], 'has_parameters': [pyiron.Murnaghan/output/equilibrium_bulk_modulus]} unit


AttributeError: 'is_functional_for' annotation property is not defined.

## Read data from csv file

Define only classes and read the actual data from csv file. You can directly start here without running the first part.

In [None]:
onto = owl.get_ontology("file://pyiron_csv.owl")

with onto:
    # ------------------------------------------------------------------
    # Classes
    # ------------------------------------------------------------------
    # Root class of every entity defined in this ontology.
    class PyObject(owl.Thing):
        comment = 'my pyiron object'

    # Base class of codes, their inputs/outputs and the generic parameters.
    class Parameter(PyObject):
        @property
        def description(self):
            """Return the description of the first assigned generic parameter.

            Returns:
                The ``description`` of ``self.generic_parameter[0]``, or ``None``
                when no generic parameter is assigned.
            """
            if len(self.generic_parameter) > 0:
                return self.generic_parameter[0].description
            return None

    class InputParameter(Parameter):
        # @property
        def consistent_output(self, additional_conditions=None):
            """Return the parameters that are consistent with this input.

            Candidates are all parameters attached to this input's first generic
            parameter (``has_parameters``). If condition labels are present, only
            candidates whose ``has_options`` contain every condition are kept.

            Args:
                additional_conditions: Labels used *instead of* this input's own
                    ``has_conditions``. ``None`` selects ``has_conditions``.

            Returns:
                list: The matching parameters. Empty when this input has no
                generic parameter assigned.
            """
            if len(self.generic_parameter) == 0:
                # Nothing to match against; indexing [0] would raise IndexError.
                return []
            candidates = self.generic_parameter[0].has_parameters

            if additional_conditions is None:
                conditions = self.has_conditions
            else:
                conditions = additional_conditions

            if len(conditions) > 0:
                return [p for p in candidates if is_subset(conditions, p.has_options)]
                # return list(set.intersection(*map(set,[i.has_optional_objects for i in conditions])))
            return candidates

    # class MandatoryInputParameter(InputParameter): pass

    class OutputParameter(Parameter): pass

    class GenericParameter(Parameter):
        # Plain attribute shadowing the read-only ``description`` property of
        # Parameter, so that each individual can carry its own description.
        description = ""

    class Code(Parameter): pass

    # Tags that are used as domains, options and conditions of parameters.
    class Label(PyObject): pass

    # ------------------------------------------------------------------
    # Properties: Parameter -> Label, each with its inverse Label -> Parameter
    # ------------------------------------------------------------------
    class has_conditions(Parameter >> Label): pass
    class has_transitive_conditions(Parameter >> Label): pass  # condition to fulfill to fulfill option in code
    class has_options(Parameter >> Label): pass

    class has_transitive_objects(Label >> Parameter):
        inverse_property = has_transitive_conditions

    class has_conditional_objects(Label >> Parameter):
        inverse_property = has_conditions

    class has_optional_objects(Label >> Parameter):
        inverse_property = has_options

    # ------------------------------------------------------------------
    # Data properties (string valued).
    # ``python_name`` is the attribute/keyword name used on individuals.
    # ------------------------------------------------------------------
    class has_symbol(GenericParameter >> str):
        class_property_type = ["some"]
        python_name = "symbols"

    class has_unit(Parameter >> str):
        class_property_type = ["some"]
        python_name = "unit"

    # ------------------------------------------------------------------
    # Domain membership
    # ------------------------------------------------------------------
    class is_in_domains(Parameter >> Label):
        class_property_type = ["some"]
        python_name = "domain"

    class domain_has_codes(Label >> Parameter):
        python_name = "has_objects"
        inverse_property = is_in_domains

    # ------------------------------------------------------------------
    # Link between a concrete parameter and its generic parameter
    # ------------------------------------------------------------------
    class has_generic_parameter(Parameter >> GenericParameter):
        class_property_type = ["some"]
        python_name = "generic_parameter"

    class generic_parameter_has(GenericParameter >> Parameter):
        python_name = "has_parameters"
        inverse_property = has_generic_parameter

    # ------------------------------------------------------------------
    # Input/output relations between parameters and codes
    # ------------------------------------------------------------------
    class is_input_of(InputParameter >> Code):
        class_property_type = ["some"]
        python_name = "input_in"

    class has_input(Code >> InputParameter):
        python_name = "input"
        inverse_property = is_input_of

    class is_mandatory_input_of(InputParameter >> Code):
        class_property_type = ["some"]
        python_name = "mandatory_input_in"

    class has_mandatory_input(Code >> InputParameter):
        python_name = "mandatory_input"
        inverse_property = is_mandatory_input_of

    class is_output_of(OutputParameter >> Code):
        class_property_type = ["some"]
        python_name = "output_of"      # python name has to be unique (even for different class)

    class has_output(Code >> OutputParameter):
        python_name = "output"
        inverse_property = is_output_of

    owl.AllDisjoint([InputParameter, OutputParameter, Label])

# Store the (still individual-free) schema.
onto.save()

def is_subset(a, b):
    """Return True if every element of ``a`` is contained in ``b``.

    Uses the builtin ``all`` so the function does not depend on numpy being
    imported before it is called.

    Args:
        a: Iterable of elements to look for.
        b: Container that is tested with ``in``.

    Returns:
        bool: ``True`` if all elements of ``a`` are in ``b`` (also for empty ``a``).
    """
    return all(aa in b for aa in a)

In [None]:
onto = owl.get_ontology("file://pyiron_csv.owl")
onto.load()

#### Load csv files as pandas table

In [None]:
# pandas.read_csv('parameters.csv', skipinitialspace=True,  delimiter=',').iloc[0].domain

In [None]:
import ast

# List-valued cells are stored in the csv file as Python list literals.
# ast.literal_eval parses such literals without executing arbitrary code read
# from the file, which a plain eval() would do.
ast.literal_eval(pandas.read_csv('pyiron_onto.csv', skipinitialspace=True,  delimiter=',').iloc[8].domain)

In [None]:
df_para = pandas.read_csv('pyiron_onto.csv', skipinitialspace=True,  delimiter=',')
df_para

In [None]:
import numpy as np

# Columns whose list entries are plain values and not names of ontology entities.
non_ontology_keys = ['symbols', 'unit']


def get_args(i_0, df):
    """Build the keyword arguments for creating one individual from a table row.

    Args:
        i_0: Positional (``iloc``) index of the row in ``df``.
        df: Table read from the csv file. The ``class`` column is skipped.

    Returns:
        dict: Column name -> value for every non-empty string cell of the row.
        Cells that hold a list literal are parsed into a list; unless the
        column is listed in ``non_ontology_keys``, each list entry is looked up
        by name in the module-level ``onto_labels`` ontology.

    Raises:
        ValueError, SyntaxError: If a cell starting with ``[`` is not a valid
            Python literal.
    """
    import ast  # local import keeps this notebook cell self-contained

    qwargs = {}
    row = df.iloc[i_0]
    for key in df.keys():
        if key == 'class':
            continue
        val = row[key]
        # Empty cells are read as NaN (a float); only string cells carry data.
        if not isinstance(val, str):
            continue
        val = val.strip()
        if key == 'comment':
            # Comments are written as a one-element list literal: drop the
            # leading "['" and the trailing "']".
            val = val[2:-2]
        if len(val) == 0:
            continue
        if val[0] == '[':  # list
            # literal_eval instead of eval: the file content must never be
            # executed as code.
            val_lst = ast.literal_eval(val)
            if key not in non_ontology_keys:
                val_lst = [onto_labels[d.strip()] for d in val_lst]
            qwargs[key] = val_lst
        else:
            qwargs[key] = val

    return qwargs

In [None]:
# get_args(17, df_para)

In [None]:
import pandas
import types

# df_labels = pandas.read_csv('label.csv', skipinitialspace=True)

# Individuals are stored in their own ontology, which imports the schema.
onto_labels = owl.get_ontology("file://pyiron_labels.owl")
onto_labels.imported_ontologies.append(onto)

with onto_labels:
#     for index, row in df_labels.iterrows():
#         lbl = onto.Label(name=row['name'].strip())
#         if isinstance(row['comment'], str):
#             lbl.comment = row['comment'].strip()

    for index, row in df_para.iterrows():
        class_name = row['class']
        if not isinstance(class_name, str):
            # Row without a class entry: nothing to create.
            continue

        parent = onto[class_name]
        # print (parent, row['name'])
        if parent is None:
            # Report the offending name; ``parent`` itself is always None here.
            print('Invalid class:', class_name)
            continue

        # get_args expects a positional index; it equals the row label because
        # df_para was read with the default integer index.
        qwargs = get_args(index, df_para)
        print(row['class'], parent, qwargs)
        individuum = parent(**qwargs)

    # Close the world for everything below PyObject and run the Pellet
    # reasoner, inferring object- and data-property values.
    owl.close_world(PyObject)
    owl.sync_reasoner_pellet(infer_property_values=True, infer_data_property_values=True, debug=0)
onto_labels.save()

### Read csv generated owl file

In [None]:
onto_labels = owl.get_ontology("file://pyiron_labels.owl").load()

### Explore ontology (collect/test some useful commands)

In [None]:
individuals = list(onto_labels.individuals())
individuals

In [None]:
onto_labels['CreateStructureBulk'].get_properties()

In [None]:
onto_labels.Bulk_modulus.has_parameters

In [None]:
onto_labels.get_imported_ontologies()

In [None]:
onto_labels.get_instances_of(onto.GenericParameter)

In [None]:
list(onto.classes())

In [None]:
list(onto.data_properties())

In [None]:
onto.Parameter.descendants()

In [None]:
# onto.Parameter.get_class_properties()

In [None]:
properties = list(onto.object_properties())
properties

In [None]:
onto_labels.DFT.is_a[0].instances()

In [None]:
onto_labels.Bulk3dStructure

In [None]:
onto_labels.get_imported_ontologies()

In [None]:
list(onto.classes())

In [None]:
onto.Label.instances()[-2].get_iri()

In [None]:
onto.base_iri

In [None]:
onto.Label.instances()

In [None]:
onto_labels.AtomisticEnergyCalculator.is_a[0].instances()