# Loading the controlled vocabulary

Load the current vocabulary, `HAWC-Ontologies-July2020v2.xlsx`, into HAWC.

In [None]:
from pathlib import Path
import pandas as pd
import django

In [None]:
# Initialise Django before the HAWC model classes are imported in the next cell.
django.setup()

In [None]:
from hawc.apps.vocab.models import Term, VocabularyNamespace, VocabularyTermType

In [None]:
# Worksheet columns retained for the import: the endpoint name, its four
# ancestor levels, and the free-text comment.
columns = [
    "endpoint-name",
    "endpoint-system",
    "endpoint-organ",
    "endpoint-effect",
    "endpoint-effect_subtype",
    "Comment",
]

# Read the preferred-terms worksheet from the workbook on the user's Desktop.
workbook_path = Path("~/Desktop/HAWC-Ontologies-July2020v2.xlsx").expanduser()
raw_sheet = pd.read_excel(
    workbook_path,
    sheet_name="Preferred Terms List-July 2020",
)

# Blank cells become empty strings, then only the columns of interest are kept.
df = raw_sheet.fillna("")[columns]
df.head()

The spreadsheet is messy; sometimes there are spaces after terms. We'll clean up:

In [None]:
# Trim surrounding whitespace from the five vocabulary columns; the "Comment"
# column is left as-is.
for term_column in (
    "endpoint-name",
    "endpoint-system",
    "endpoint-organ",
    "endpoint-effect",
    "endpoint-effect_subtype",
):
    df[term_column] = df[term_column].str.strip()

## V1 import (not used)

This was the first approach, now commented out and kept for reference. It kept the terms unique, which is ideal for a term list, but that makes the entity mapping much more difficult. The reason is that a particular `effect` or `effect_subtype` in the context of a given system will have different annotations applied to it, and this reduction of terms would prevent that distinction.

```python
# ARCHIVED (V1): kept for reference only; this snippet lives in a markdown cell
# and is not executed by the notebook.
#
# Terms are cached by bare name within each term type, so a given name is
# created exactly once and is linked only to the parent under which it was
# first encountered.
#
# NOTE(review): TermRelation is not imported by the notebook's import cell;
# confirm where it comes from before reviving this code.

# start fresh: remove all existing relations and terms
TermRelation.objects.all().delete()
Term.objects.all().delete()

# build default lookup map: term type -> {term name -> Term id}
items = {
    key: {} for (key, value) in VocabularyTermType.choices()

}

# Walk the hierarchy level by level, narrowing the frame at each step so that
# only combinations present in the spreadsheet are visited.
for system in df['endpoint-system'].unique():
    s1 = df.query(f'`endpoint-system`=="{system}"')
    for organ in s1['endpoint-organ'].unique():
        s2 = s1.query(f'`endpoint-organ`=="{organ}"')
        for effect in s2['endpoint-effect'].unique():
            s3 = s2.query(f'`endpoint-effect`=="{effect}"')
            for effect_subtype in s3['endpoint-effect_subtype'].unique():
                s4 = s3.query(f'`endpoint-effect_subtype`=="{effect_subtype}"')
                for name in s4['endpoint-name'].unique():

                    # system: top level, no parent
                    if system not in items[VocabularyTermType.system]:
                        obj = Term.objects.create(
                            namespace=VocabularyNamespace.EHV,
                            type=VocabularyTermType.system,
                            name=system
                        )
                        items[VocabularyTermType.system][system] = obj.id

                    # organ: parent is the current system (only on first sight of this name)
                    if organ not in items[VocabularyTermType.organ]:
                        obj = Term.objects.create(
                            namespace=VocabularyNamespace.EHV,
                            type=VocabularyTermType.organ,
                            name=organ
                        )
                        obj.parents.add(items[VocabularyTermType.system][system])
                        items[VocabularyTermType.organ][organ] = obj.id

                    # effect: parent is the current organ (only on first sight of this name)
                    if effect not in items[VocabularyTermType.effect]:
                        obj = Term.objects.create(
                            namespace=VocabularyNamespace.EHV,
                            type=VocabularyTermType.effect,
                            name=effect
                        )
                        obj.parents.add(items[VocabularyTermType.organ][organ])
                        items[VocabularyTermType.effect][effect] = obj.id

                    # effect_subtype: parent is the current effect (only on first sight of this name)
                    if effect_subtype not in items[VocabularyTermType.effect_subtype]:
                        obj = Term.objects.create(
                            namespace=VocabularyNamespace.EHV,
                            type=VocabularyTermType.effect_subtype,
                            name=effect_subtype
                        )
                        obj.parents.add(items[VocabularyTermType.effect][effect])
                        items[VocabularyTermType.effect_subtype][effect_subtype] = obj.id

                    # name: parent is the current effect_subtype (only on first sight of this name)
                    if name not in items[VocabularyTermType.endpoint_name]:
                        obj = Term.objects.create(
                            namespace=VocabularyNamespace.EHV,
                            type=VocabularyTermType.endpoint_name,
                            name=name
                        )
                        obj.parents.add(items[VocabularyTermType.effect_subtype][effect_subtype])
                        items[VocabularyTermType.endpoint_name][name] = obj.id
```

In [None]:
# Show how many Term rows are currently stored.
Term.objects.count()

## V2 import

This creates many more terms and doesn't maintain term uniqueness, but it has the advantage that entities can be associated with a particular term in the context of a system, organ, effect, etc.

If, after more exploration, it is determined that this is appropriate, the join-table for parents can be removed and it can be a ForeignKey field instead of a ManyToMany.

In [None]:
# V2 import: rebuild the EHV vocabulary from `df` as a tree. A term is created
# once per distinct path (system -> organ -> effect -> effect_subtype -> name),
# so identical text under two different parents yields two separate Term rows.

# start fresh: every existing Term is removed before re-importing
Term.objects.all().delete()

# lookup map: term type -> {path key -> Term id}; the key is the system name
# for systems and a tuple of ancestor names plus own name for deeper levels
items = {
    key: {} for (key, value) in VocabularyTermType.choices()
}


def _ensure_term(term_type, key, name, parent_id):
    """Return the id of the Term cached under ``key``, creating it if absent.

    Args:
        term_type: VocabularyTermType member for the level being created.
        key: hashable path identifying the term within ``items[term_type]``.
        name: text stored on the Term.
        parent_id: id of the parent Term, or None for a top-level system.
    """
    cache = items[term_type]
    if key not in cache:
        obj = Term.objects.create(
            namespace=VocabularyNamespace.EHV,
            type=term_type,
            name=name,
            parent_id=parent_id,
        )
        cache[key] = obj.id
    return cache[key]


# Rows are narrowed with boolean masks instead of `DataFrame.query` f-strings:
# interpolating spreadsheet text into a query expression fails (or mis-parses)
# as soon as a term contains a double quote or a backslash.
for system in df["endpoint-system"].unique():
    s1 = df[df["endpoint-system"] == system]
    for organ in s1["endpoint-organ"].unique():
        s2 = s1[s1["endpoint-organ"] == organ]
        for effect in s2["endpoint-effect"].unique():
            s3 = s2[s2["endpoint-effect"] == effect]
            for effect_subtype in s3["endpoint-effect_subtype"].unique():
                s4 = s3[s3["endpoint-effect_subtype"] == effect_subtype]
                for name in s4["endpoint-name"].unique():
                    # Ancestors are ensured here, inside the innermost loop, so
                    # nothing is created for a combination that has no rows.
                    system_id = _ensure_term(
                        VocabularyTermType.system,
                        system,
                        system,
                        None,
                    )
                    organ_id = _ensure_term(
                        VocabularyTermType.organ,
                        (system, organ),
                        organ,
                        system_id,
                    )
                    effect_id = _ensure_term(
                        VocabularyTermType.effect,
                        (system, organ, effect),
                        effect,
                        organ_id,
                    )
                    effect_subtype_id = _ensure_term(
                        VocabularyTermType.effect_subtype,
                        (system, organ, effect, effect_subtype),
                        effect_subtype,
                        effect_id,
                    )
                    _ensure_term(
                        VocabularyTermType.endpoint_name,
                        (system, organ, effect, effect_subtype, name),
                        name,
                        effect_subtype_id,
                    )

# Display the number of terms created by the import.
Term.objects.count()