Merge pull request hubmapconsortium#81 from sennetconsortium/libpitt/commons

Libpitt/commons
maxsibilla committed Apr 17, 2023
2 parents 62a0b43 + 48e12ca commit 77af795
Showing 4 changed files with 87 additions and 47 deletions.
2 changes: 1 addition & 1 deletion src/instance/app.cfg.example
@@ -55,6 +55,6 @@ INGEST_PIPELINE_DEFAULT_PROCESS='SCAN.AND.BEGIN.PROCESSING'
 
 UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
 UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
-UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":"C000008", "entities": "C000012", "source_types":"C050020", "data_types": "C004000"}'
+UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020", "assay_types":{"code": "C004000", "key": "datasets", "endpoint": "datasets?application_context=SENNET"}}'


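The notable change above: the organ_types and assay_types nodes in UBKG_CODES are no longer bare valueset codes but JSON objects that also carry a UBKG REST endpoint. A minimal sketch (not part of the commit; node values copied from the config above) of how a consumer can tell the two shapes apart, using the same endpoint check that _get_response() performs in the ontology.py diff below:

import json

# Hypothetical consumer of the UBKG_CODES value above; the endpoint check
# mirrors _get_response() in src/lib/ontology.py further down this diff.
codes = json.loads(
    '{"specimen_categories": "C020076",'
    ' "organ_types": {"code": "C000008", "key": "organs",'
    ' "endpoint": "organs?application_context=SENNET"},'
    ' "assay_types": {"code": "C004000", "key": "datasets",'
    ' "endpoint": "datasets?application_context=SENNET"}}'
)

for name, node in codes.items():
    if isinstance(node, dict) and node.get('endpoint'):
        print(f"{name}: fetched from endpoint {node['endpoint']}")  # get_ubkg_by_endpoint
    else:
        print(f"{name}: fetched as valueset code {node}")           # get_ubkg_valueset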
99 changes: 69 additions & 30 deletions src/lib/ontology.py
@@ -1,68 +1,107 @@
+import logging
+
 from atlas_consortia_commons.object import build_enum_class
 from atlas_consortia_commons.ubkg import get_from_node
+from atlas_consortia_commons.string import to_snake_case_upper, equals
 
 from flask import current_app
 
-def _get_obj_type(in_enum):
-    return 'enum' if in_enum else 'class'
+logger = logging.getLogger(__name__)
+
+
+def _get_obj_type(in_enum, as_data_dict: bool = False):
+    if as_data_dict:
+        return 'dict'
+    else:
+        return 'enum' if in_enum else 'class'
+
+
+def _get_response(obj):
+    if type(obj) is not str and get_from_node(obj, 'endpoint'):
+        return current_app.ubkg.get_ubkg_by_endpoint(obj)
+    else:
+        return current_app.ubkg.get_ubkg_valueset(obj)
 
-def _build_enum_class(name: str, obj, key: str = 'term', in_enum: bool = False):
-    response = current_app.ubkg.get_ubkg_valueset(obj)
-    return build_enum_class(name, response, key, obj_type=_get_obj_type(in_enum))
+
+def _build_enum_class(name: str, obj, key: str = 'term', val_key: str = None, prop_callback=to_snake_case_upper,
+                      obj_type: str = 'class', data_as_val=False):
+    response = _get_response(obj)
+    return build_enum_class(name, response, key, val_key=val_key, prop_callback=prop_callback,
+                            obj_type=obj_type, data_as_val=data_as_val)
 
-def entities(in_enum: bool = False):
-    return _build_enum_class('Entities', current_app.ubkg.entities, in_enum=in_enum)
+
+def entities(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('Entities', current_app.ubkg.entities, obj_type=_get_obj_type(in_enum, as_data_dict))
+
 
-def specimen_categories(in_enum: bool = False):
-    return _build_enum_class('SpecimenCategories', current_app.ubkg.specimen_categories, in_enum=in_enum)
+def specimen_categories(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('SpecimenCategories', current_app.ubkg.specimen_categories,
+                             obj_type=_get_obj_type(in_enum, as_data_dict))
+
 
-def organ_types(in_enum: bool = False):
-    return _build_enum_class('OrganTypes', current_app.ubkg.organ_types, in_enum=in_enum)
+def organ_types(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('OrganTypes', current_app.ubkg.organ_types, key='rui_code', val_key='term',
+                             obj_type=_get_obj_type(in_enum, as_data_dict))
+
+
+def assay_types(in_enum: bool = False, as_data_dict: bool = False,
+                prop_callback=to_snake_case_upper, data_as_val=False):
+    return _build_enum_class('AssayTypes', current_app.ubkg.assay_types, key='data_type',
+                             obj_type=_get_obj_type(in_enum, as_data_dict),
+                             prop_callback=prop_callback, data_as_val=data_as_val)
+
 
-def source_types(in_enum: bool = False):
-    return _build_enum_class('SourceTypes', current_app.ubkg.source_types, in_enum=in_enum)
+def source_types(in_enum: bool = False, as_data_dict: bool = False):
+    return _build_enum_class('SourceTypes', current_app.ubkg.source_types,
+                             obj_type=_get_obj_type(in_enum, as_data_dict))
 
 def data_types(in_enum: bool = False):
     return _build_enum_class('DataTypes', current_app.ubkg.data_types, in_enum=in_enum)
 
 def init_ontology():
     specimen_categories()
     organ_types()
     entities()
+    assay_types()
     source_types()
     data_types()
 
 def enum_val_lower(val):
     return val.value.lower()
 
+def ubkg_sever():
+    return current_app.config['UBKG_SERVER']
+
 def get_valueset_ep(code):
-    ep = f"{current_app.config['UBKG_SERVER']}{current_app.config['UBKG_ENDPOINT_VALUESET']}"
+    ep = f"{ubkg_sever()}{current_app.config['UBKG_ENDPOINT_VALUESET']}"
     return ep.format(code=code)
 
 def get_organ_types_ep():
-    return get_valueset_ep(get_from_node(current_app.ubkg.organ_types, 'code'))
+    return f"{ubkg_sever()}{get_from_node(current_app.ubkg.organ_types, 'endpoint')}"
+
+def get_assay_types_ep():
+    return f"{ubkg_sever()}{get_from_node(current_app.ubkg.assay_types, 'endpoint')}"
 
 
 class Ontology:
     @staticmethod
-    def entities(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(entities(as_arr), as_arr, cb)
+    def entities(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(entities(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
-    def specimen_categories(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(specimen_categories(as_arr), as_arr, cb)
+    def assay_types(as_arr: bool = False, cb=str, as_data_dict: bool = False, prop_callback=to_snake_case_upper, data_as_val=False):
+        return Ontology._as_list_or_class(assay_types(as_arr, as_data_dict, prop_callback,
+                                                      data_as_val=data_as_val), as_arr, cb)
 
     @staticmethod
-    def organ_types(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(organ_types(as_arr), as_arr, cb)
+    def specimen_categories(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(specimen_categories(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
-    def source_types(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(source_types(as_arr), as_arr, cb)
+    def organ_types(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(organ_types(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
-    def data_types(as_arr: bool = False, cb=str):
-        return Ontology._as_list_or_class(data_types(as_arr), as_arr, cb)
+    def source_types(as_arr: bool = False, cb=str, as_data_dict: bool = False):
+        return Ontology._as_list_or_class(source_types(as_arr, as_data_dict), as_arr, cb)
 
     @staticmethod
     def _as_list_or_class(obj, as_arr: bool = False, cb=str):
-        return obj if not as_arr else list(map(cb, obj))
-
+        return obj if not as_arr else list(map(cb, obj))
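Pulling the config and the new helpers together: the endpoint URLs are simple concatenations, and as_data_dict=True yields plain mappings. A self-contained sketch with hypothetical organ and assay entries (real ones come from UBKG; the shapes are inferred from how validate_samples and validate_datasets below consume these helpers):

# Sketch only: hypothetical entries standing in for live UBKG responses.
UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'

# get_organ_types_ep() / get_assay_types_ep() join the server with the node's
# endpoint from UBKG_CODES:
organ_types_ep = UBKG_SERVER + 'organs?application_context=SENNET'
assay_types_ep = UBKG_SERVER + 'datasets?application_context=SENNET'

# organ_types(as_data_dict=True) is keyed by 'rui_code' with 'term' values;
# assay_types(as_data_dict=True, prop_callback=None) keeps the raw 'data_type'
# strings as keys rather than passing them through to_snake_case_upper.
organ_types = {'LV': 'Liver', 'HT': 'Heart'}              # hypothetical
assay_types = {'CODEX': 'CODEX', 'bulk-RNA': 'bulk-RNA'}  # hypothetical

organ_types_codes = list(organ_types.keys())
print(organ_types_ep)             # .../organs?application_context=SENNET
print('LV' in organ_types_codes)  # True: the validate_samples membership check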
2 changes: 1 addition & 1 deletion src/requirements.txt
@@ -10,4 +10,4 @@ requests==2.25.1
 # Default is main branch specified in docker-compose.development.yml if not set
 # git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
 hubmap-commons==2.1.3
-atlas-consortia-commons==1.0.2
+atlas-consortia-commons==1.0.3
31 changes: 16 additions & 15 deletions src/routes/entity_CRUD/__init__.py
@@ -15,6 +15,7 @@
 from hubmap_commons import file_helper as commons_file_helper
 from atlas_consortia_commons.rest import *
 from atlas_consortia_commons.string import equals
+from atlas_consortia_commons.object import includes
 
 from lib.file_upload_helper import UploadFileHelper
 
@@ -27,7 +28,7 @@
 from routes.entity_CRUD.dataset_helper import DatasetHelper
 from routes.entity_CRUD.constraints_helper import *
 from routes.auth import get_auth_header
-from lib.ontology import Ontology, enum_val_lower
+from lib.ontology import Ontology, enum_val_lower, get_organ_types_ep, get_assay_types_ep
 from lib.file import get_csv_records, get_base_path, check_upload
 
@@ -688,8 +689,7 @@ def validate_samples(headers, records, header):
     SpecimenCategories = Ontology.specimen_categories()
     Entities = Ontology.entities()
 
-    with urllib.request.urlopen('https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml') as urlfile:
-        organ_resource_file = yaml.load(urlfile, Loader=yaml.FullLoader)
+    organ_types_codes = list(Ontology.organ_types(as_data_dict=True).keys())
 
     rownum = 0
     valid_ancestor_ids = []
 
@@ -749,6 +749,7 @@
             error_msg.append(_ln_err(f"can only be one of the following (not case sensitive): {', '.join(allowed_categories)}", rownum, "sample_category"))
 
         # validate organ_type
+        data_row['organ_type'] = data_row['organ_type'].upper()
         organ_type = data_row['organ_type']
         if not equals(sample_category, SpecimenCategories.ORGAN):
             if len(organ_type) > 0:
 
@@ -759,9 +760,9 @@
                 file_is_valid = False
                 error_msg.append(_ln_err("field is required if `sample_category` is `organ`", rownum, "organ_type"))
             if len(organ_type) > 0:
-                if organ_type.upper() not in organ_resource_file:
+                if organ_type not in organ_types_codes:
                     file_is_valid = False
-                    error_msg.append(_ln_err("value must be an organ code listed in `organ_type` files (https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml)", rownum, "organ_type"))
+                    error_msg.append(_ln_err(f"value must be an organ code listed at {get_organ_types_ep()}", rownum, "organ_type"))
 
         # validate ancestor_id
         ancestor_id = data_row['ancestor_id']
 
@@ -820,7 +821,6 @@ def validate_entity_constraints(file_is_valid, error_msg, header, entity_constra
 def validate_datasets(headers, records, header):
     error_msg = []
     file_is_valid = True
-    assays = []
 
     required_headers = ['ancestor_id', 'lab_id', 'doi_abstract', 'human_gene_sequences', 'data_types']
     for field in required_headers:
 
@@ -833,12 +833,8 @@
            file_is_valid = False
            error_msg.append(_common_ln_errs(2, field))
 
-    # retrieve yaml file containing all accepted data types
-    with urllib.request.urlopen('https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml') as urlfile:
-        assay_resource_file = yaml.load(urlfile, Loader=yaml.FullLoader)
-
-    for each in assay_resource_file:
-        assays.append(each.upper())
+    assay_types = list(Ontology.assay_types(as_data_dict=True, prop_callback=None).keys())
 
     rownum = 0
     entity_constraint_list = []
 
@@ -885,15 +881,20 @@
         # validate data_type
         data_types = data_row['data_types']
         data_types_valid = True
-        for data_type in data_types:
-            if data_type.upper() not in assays:
+        for i, data_type in enumerate(data_types):
+            idx = includes(assay_types, data_type, single_index=True)
+
+            if idx == -1:
                 file_is_valid = False
                 data_types_valid = False
-                error_msg.append(_ln_err("value must be an assay type listed in assay type files (https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml)", rownum, "data_types"))
+                error_msg.append(_ln_err(f"value must be an assay type listed at {get_assay_types_ep()}", rownum, "data_types"))
+            else:
+                # apply formatting
+                data_types[i] = assay_types[idx]
 
         if len(data_types) < 1:
             file_is_valid = False
-            error_msg.append(_ln_err("must not be empty. Must contain an assay type listed in https://raw.githubusercontent.com/sennetconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml", rownum, "data_types"))
+            error_msg.append(_ln_err(f"must not be empty. Must contain an assay type listed at {get_assay_types_ep()}", rownum, "data_types"))
 
         # validate ancestor_id
         ancestor_ids = data_row['ancestor_id']
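One behavioral note on the hunk above: the validator no longer just checks membership, it also canonicalizes each data_type by writing assay_types[idx] back into the row. A rough, self-contained stand-in for atlas_consortia_commons.object.includes with single_index=True, whose behavior is inferred from this diff rather than from the library source (a case-insensitive index lookup returning -1 on a miss):

# Hypothetical equivalent of includes(haystack, needle, single_index=True);
# inferred from usage in validate_datasets above, not from the library itself.
def includes_single_index(haystack: list, needle: str) -> int:
    for i, item in enumerate(haystack):
        if item.lower() == needle.lower():
            return i
    return -1

assay_types = ['bulk-RNA', 'CODEX', 'snRNAseq']  # hypothetical canonical list
idx = includes_single_index(assay_types, 'codex')
print(idx)               # 1
print(assay_types[idx])  # 'CODEX': the canonical form written back into the CSV row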
