Merge pull request hubmapconsortium#81 from sennetconsortium/libpitt/commons

Libpitt/commons
maxsibilla committed Apr 17, 2023
2 parents 62a0b43 + 48e12ca commit 77af795
Showing 4 changed files with 87 additions and 47 deletions.
2 changes: 1 addition & 1 deletion src/instance/app.cfg.example
@@ -55,6 +55,6 @@ INGEST_PIPELINE_DEFAULT_PROCESS='SCAN.AND.BEGIN.PROCESSING'
 
 UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
 UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
-UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":"C000008", "entities": "C000012", "source_types":"C050020", "data_types": "C004000"}'
+UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020", "assay_types":{"code": "C004000", "key": "datasets", "endpoint": "datasets?application_context=SENNET"}}'


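The notable change above: the organ_types and assay_types nodes in UBKG_CODES are no longer bare valueset codes but JSON objects that also carry a UBKG REST endpoint. A minimal sketch (not part of the commit; node values copied from the config above) of how a consumer can tell the two shapes apart, using the same endpoint check that _get_response() performs in the ontology.py diff below:

import json

# Hypothetical consumer of the UBKG_CODES value above; the endpoint check
# mirrors _get_response() in src/lib/ontology.py further down this diff.
codes = json.loads(
    '{"specimen_categories": "C020076",'
    ' "organ_types": {"code": "C000008", "key": "organs",'
    ' "endpoint": "organs?application_context=SENNET"},'
    ' "assay_types": {"code": "C004000", "key": "datasets",'
    ' "endpoint": "datasets?application_context=SENNET"}}'
)

for name, node in codes.items():
    if isinstance(node, dict) and node.get('endpoint'):
        print(f"{name}: fetched from endpoint {node['endpoint']}")  # get_ubkg_by_endpoint
    else:
        print(f"{name}: fetched as valueset code {node}")           # get_ubkg_valueset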
99 changes: 69 additions & 30 deletions src/lib/ontology.py
@@ -1,68 +1,107 @@
+import logging
+
 from atlas_consortia_commons.object import build_enum_class
 from atlas_consortia_commons.ubkg import get_from_node
+from atlas_consortia_commons.string import to_snake_case_upper, equals
 
 from flask import current_app
 
-def _get_obj_type(in_enum):
-    return 'enum' if in_enum else 'class'
+logger = logging.getLogger(__name__)
+
+
+def _get_obj_type(in_enum, as_data_dict: bool = False):
+    if as_data_dict:
+        return 'dict'
+    else:
+        return 'enum' if in_enum else 'class'
+
+
+def _get_response(obj):
+    if type(obj) is not str and get_from_node(obj, 'endpoint'):
+        return current_app.ubkg.get_ubkg_by_endpoint(obj)
+    else:
+        return current_app.ubkg.get_ubkg_valueset(obj)
 
-def _build_enum_class(name: str, obj, key: str = 'term', in_enum: bool = False):
-    response = current_app.ubkg.get_ubkg_valueset(obj)
-    return build_enum_class(name, response, key, obj_type=_get_obj_type(in_enum))
+
+def _build_enum_class(name: str, obj, key: str = 'term', val_key: str = None, prop_callback=to_snake_case_upper,
+                      obj_type: str = 'class', data_as_val=False):
+    response = _get_response(obj)
+    return build_enum_class(name, response, key, val_key=val_key, prop_callback=prop_callback,
+                            obj_type=obj_type, data_as_val=data_as_val)
 
-def entities(in_enum: bool = False):
-    return _build_enum_class('Entities', current_app.ubkg.entities, in_enum=in_enum)
+
+def entities(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('Entities', current_app.ubkg.entities, obj_type=_get_obj_type(in_enum, as_data_dict))
+
 
-def specimen_categories(in_enum: bool = False):
-    return _build_enum_class('SpecimenCategories', current_app.ubkg.specimen_categories, in_enum=in_enum)
+def specimen_categories(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('SpecimenCategories', current_app.ubkg.specimen_categories,
+                             obj_type=_get_obj_type(in_enum, as_data_dict))
+
 
-def organ_types(in_enum: bool = False):
-    return _build_enum_class('OrganTypes', current_app.ubkg.organ_types, in_enum=in_enum)
+def organ_types(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('OrganTypes', current_app.ubkg.organ_types, key='rui_code', val_key='term',
+                             obj_type=_get_obj_type(in_enum, as_data_dict))
+
+
+def assay_types(in_enum: bool = False, as_data_dict: bool = False,
+                prop_callback=to_snake_case_upper, data_as_val=False):
+    return _build_enum_class('AssayTypes', current_app.ubkg.assay_types, key='data_type',
+                             obj_type=_get_obj_type(in_enum, as_data_dict),
+                             prop_callback=prop_callback, data_as_val=data_as_val)
+
 
-def source_types(in_enum: bool = False):
-    return _build_enum_class('SourceTypes', current_app.ubkg.source_types, in_enum=in_enum)
+def source_types(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('SourceTypes', current_app.ubkg.source_types,
+                             obj_type=_get_obj_type(in_enum, as_data_dict))
 
 def data_types(in_enum: bool = False):
     return _build_enum_class('DataTypes', current_app.ubkg.data_types, in_enum=in_enum)
 
 def init_ontology():
     specimen_categories()
     organ_types()
     entities()
+    assay_types()
     source_types()
     data_types()
 
 def enum_val_lower(val):
     return val.value.lower()
 
+def ubkg_sever():
+    return current_app.config['UBKG_SERVER']
+
 def get_valueset_ep(code):
-    ep = f"{current_app.config['UBKG_SERVER']}{current_app.config['UBKG_ENDPOINT_VALUESET']}"
+    ep = f"{ubkg_sever()}{current_app.config['UBKG_ENDPOINT_VALUESET']}"
     return ep.format(code=code)
 
 def get_organ_types_ep():
-    return get_valueset_ep(get_from_node(current_app.ubkg.organ_types, 'code'))
+    return f"{ubkg_sever()}{get_from_node(current_app.ubkg.organ_types, 'endpoint')}"
+
+def get_assay_types_ep():
+    return f"{ubkg_sever()}{get_from_node(current_app.ubkg.assay_types, 'endpoint')}"
 
 
 class Ontology:
     @staticmethod
-    def entities(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(entities(as_arr), as_arr, cb)
+    def entities(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(entities(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
-    def specimen_categories(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(specimen_categories(as_arr), as_arr, cb)
+    def assay_types(as_arr: bool = False, cb=str, as_data_dict: bool = False, prop_callback=to_snake_case_upper, data_as_val=False):
+        return Ontology._as_list_or_class(assay_types(as_arr, as_data_dict, prop_callback,
+                                                      data_as_val=data_as_val), as_arr, cb)
 
     @staticmethod
-    def organ_types(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(organ_types(as_arr), as_arr, cb)
+    def specimen_categories(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(specimen_categories(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
-    def source_types(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(source_types(as_arr), as_arr, cb)
+    def organ_types(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(organ_types(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
-    def data_types(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(data_types(as_arr), as_arr, cb)
+    def source_types(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(source_types(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
     def _as_list_or_class(obj, as_arr: bool = False, cb=str):
-        return obj if not as_arr else list(map(cb, obj))
-
+        return obj if not as_arr else list(map(cb, obj))
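Pulling the config and the new helpers together: the endpoint URLs are simple concatenations, and as_data_dict=True yields plain mappings. A self-contained sketch with hypothetical organ and assay entries (real ones come from UBKG; the shapes are inferred from how validate_samples and validate_datasets below consume these helpers):

# Sketch only: hypothetical entries standing in for live UBKG responses.
UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'

# get_organ_types_ep() / get_assay_types_ep() join the server with the node's
# endpoint from UBKG_CODES:
organ_types_ep = UBKG_SERVER + 'organs?application_context=SENNET'
assay_types_ep = UBKG_SERVER + 'datasets?application_context=SENNET'

# organ_types(as_data_dict=True) is keyed by 'rui_code' with 'term' values;
# assay_types(as_data_dict=True, prop_callback=None) keeps the raw 'data_type'
# strings as keys rather than passing them through to_snake_case_upper.
organ_types = {'LV': 'Liver', 'HT': 'Heart'}              # hypothetical
assay_types = {'CODEX': 'CODEX', 'bulk-RNA': 'bulk-RNA'}  # hypothetical

organ_types_codes = list(organ_types.keys())
print(organ_types_ep)             # .../organs?application_context=SENNET
print('LV' in organ_types_codes)  # True: the validate_samples membership check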
2 changes: 1 addition & 1 deletion src/requirements.txt
@@ -10,4 +10,4 @@ requests==2.25.1
 # Default is main branch specified in docker-compose.development.yml if not set
 # git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
 hubmap-commons==2.1.3
-atlas-consortia-commons==1.0.2
+atlas-consortia-commons==1.0.3
31 changes: 16 additions & 15 deletions src/routes/entity_CRUD/__init__.py
@@ -15,6 +15,7 @@
 from hubmap_commons import file_helper as commons_file_helper
 from atlas_consortia_commons.rest import *
 from atlas_consortia_commons.string import equals
+from atlas_consortia_commons.object import includes
 
 from lib.file_upload_helper import UploadFileHelper
 
@@ -27,7 +28,7 @@
 from routes.entity_CRUD.dataset_helper import DatasetHelper
 from routes.entity_CRUD.constraints_helper import *
 from routes.auth import get_auth_header
-from lib.ontology import Ontology, enum_val_lower
+from lib.ontology import Ontology, enum_val_lower, get_organ_types_ep, get_assay_types_ep
 from lib.file import get_csv_records, get_base_path, check_upload
 
@@ -688,8 +689,7 @@ def validate_samples(headers, records, header):
     SpecimenCategories = Ontology.specimen_categories()
     Entities = Ontology.entities()
 
-    with urllib.request.urlopen('https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml') as urlfile:
-        organ_resource_file = yaml.load(urlfile, Loader=yaml.FullLoader)
+    organ_types_codes = list(Ontology.organ_types(as_data_dict=True).keys())
 
     rownum = 0
     valid_ancestor_ids = []
 
@@ -749,6 +749,7 @@
             error_msg.append(_ln_err(f"can only be one of the following (not case sensitive): {', '.join(allowed_categories)}", rownum, "sample_category"))
 
         # validate organ_type
+        data_row['organ_type'] = data_row['organ_type'].upper()
         organ_type = data_row['organ_type']
         if not equals(sample_category, SpecimenCategories.ORGAN):
             if len(organ_type) > 0:
 
@@ -759,9 +760,9 @@
                 file_is_valid = False
                 error_msg.append(_ln_err("field is required if `sample_category` is `organ`", rownum, "organ_type"))
             if len(organ_type) > 0:
-                if organ_type.upper() not in organ_resource_file:
+                if organ_type not in organ_types_codes:
                     file_is_valid = False
-                    error_msg.append(_ln_err("value must be an organ code listed in `organ_type` files (https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml)", rownum, "organ_type"))
+                    error_msg.append(_ln_err(f"value must be an organ code listed at {get_organ_types_ep()}", rownum, "organ_type"))
 
         # validate ancestor_id
         ancestor_id = data_row['ancestor_id']
 
@@ -820,7 +821,6 @@ def validate_entity_constraints(file_is_valid, error_msg, header, entity_constra
 def validate_datasets(headers, records, header):
     error_msg = []
     file_is_valid = True
-    assays = []
 
     required_headers = ['ancestor_id', 'lab_id', 'doi_abstract', 'human_gene_sequences', 'data_types']
     for field in required_headers:
 
@@ -833,12 +833,8 @@
            file_is_valid = False
            error_msg.append(_common_ln_errs(2, field))
 
-    # retrieve yaml file containing all accepted data types
-    with urllib.request.urlopen('https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml') as urlfile:
-        assay_resource_file = yaml.load(urlfile, Loader=yaml.FullLoader)
-
-    for each in assay_resource_file:
-        assays.append(each.upper())
+    assay_types = list(Ontology.assay_types(as_data_dict=True, prop_callback=None).keys())
 
     rownum = 0
     entity_constraint_list = []
 
@@ -885,15 +881,20 @@
         # validate data_type
         data_types = data_row['data_types']
         data_types_valid = True
-        for data_type in data_types:
-            if data_type.upper() not in assays:
+        for i, data_type in enumerate(data_types):
+            idx = includes(assay_types, data_type, single_index=True)
+
+            if idx == -1:
                 file_is_valid = False
                 data_types_valid = False
-                error_msg.append(_ln_err("value must be an assay type listed in assay type files (https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml)", rownum, "data_types"))
+                error_msg.append(_ln_err(f"value must be an assay type listed at {get_assay_types_ep()}", rownum, "data_types"))
+            else:
+                # apply formatting
+                data_types[i] = assay_types[idx]
 
         if len(data_types) < 1:
             file_is_valid = False
-            error_msg.append(_ln_err("must not be empty. Must contain an assay type listed in https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml", rownum, "data_types"))
+            error_msg.append(_ln_err(f"must not be empty. Must contain an assay type listed at {get_assay_types_ep()}", rownum, "data_types"))
 
         # validate ancestor_id
         ancestor_ids = data_row['ancestor_id']
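One behavioral note on the hunk above: the validator no longer just checks membership, it also canonicalizes each data_type by writing assay_types[idx] back into the row. A rough, self-contained stand-in for atlas_consortia_commons.object.includes with single_index=True, whose behavior is inferred from this diff rather than from the library source (a case-insensitive index lookup returning -1 on a miss):

# Hypothetical equivalent of includes(haystack, needle, single_index=True);
# inferred from usage in validate_datasets above, not from the library itself.
def includes_single_index(haystack: list, needle: str) -> int:
    for i, item in enumerate(haystack):
        if item.lower() == needle.lower():
            return i
    return -1

assay_types = ['bulk-RNA', 'CODEX', 'snRNAseq']  # hypothetical canonical list
idx = includes_single_index(assay_types, 'codex')
print(idx)               # 1
print(assay_types[idx])  # 'CODEX': the canonical form written back into the CSV row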
