# Using Scispacy to Link Terms to UMLS Entities

This notebook gives an overview of the script to link terms in HAWC to UMLS entities. Run the block below to set up all of the utility functions. This is a verbatim copy of `utils.py`.

You can use the following commands to set up an example environment using conda:

1. `conda create --name umls python=3.7 numpy=1.18 scikit-learn=0.20.3 pandas xlrd joblib scipy spacy lxml`
2. `conda activate umls`
3. `pip install --no-binary :all: nmslib` (optional)
4. `pip install scispacy`
5. `pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz`


In [1]:
# -*- coding: utf-8 -*-
"""Utils.

@author: scott
"""

import spacy
from scispacy.linking import EntityLinker


# Memoised results of breakIntoSpansAndUMLS. The outer key is the caller's
# cache_key; each value is a dict mapping the input text to the list of UMLS
# entity dicts computed for it.
_cache = {}


def loadUMLSLinker(max_UMLS_Returns=999, confidence=0.5,
                   model="en_core_sci_sm"):
    """Load a spaCy model and attach a UMLS entity linker to it.

    Parameters
    ----------
    max_UMLS_Returns
        Passed to the linker as ``max_entities_per_mention``.
    confidence
        Passed to the linker as ``threshold``.
    model
        Name of the spaCy model handed to ``spacy.load``.

    Returns
    -------
    tuple
        ``(linker, nlp)``: the linker component and the pipeline it was
        added to.
    """
    pipeline = spacy.load(model)
    linker_options = {
        "resolve_abbreviations": False,
        "name": "umls",
        "max_entities_per_mention": max_UMLS_Returns,
        "threshold": confidence,
    }
    umls_linker = EntityLinker(**linker_options)
    pipeline.add_pipe(umls_linker)
    return (umls_linker, pipeline)


def loadDocs(series, nlp):
    """Parse every distinct text variant of ``series`` with ``nlp``.

    Each unique value of ``series`` is expanded with ``transformComma``
    (``entity=True``). Variants longer than two characters are stripped,
    de-duplicated and parsed once.

    Returns
    -------
    dict
        Maps each stripped variant to the result of ``nlp(variant)``.
    """
    variant_lists = series.drop_duplicates().apply(transformComma,
                                                   entity=True)
    unique_texts = set()
    for variants in variant_lists:
        for variant in variants:
            # The length test is applied before stripping, as in the lookup
            # performed by breakIntoSpansAndUMLS.
            if len(variant) > 2:
                unique_texts.add(variant.strip())
    return {text: nlp(text) for text in unique_texts}


def transformComma(text: str, entity: bool) -> list:
    """Build comma-free spellings of ``text``.

    Text ending in a period is returned unchanged as a one-element list.
    Otherwise the text is split on commas and the result contains, in order:
    the parts joined in reverse order, the parts joined in original order,
    and the untouched input. When ``entity`` is true the first part is
    appended as well.
    """
    if text.endswith('.'):
        return [text]

    pieces = [part.strip() for part in text.split(",")]
    variants = [" ".join(reversed(pieces)), " ".join(pieces), text]
    if entity:
        variants.extend(pieces[:1])
    return variants


def umls_filter(x, string=True):
    """Rank the UMLS entities of one row and optionally render them as text.

    Parameters
    ----------
    x
        A row (``pandas.Series``) whose first cell is the original text and
        whose remaining cells are lists of UMLS entity dicts (keys ``name``,
        ``cui``, ``score``, ``synonyms``, ``tuis``). Cells that are not a
        list or tuple (for example NaN left by index alignment) are treated
        as empty.
    string
        If true, return a ``' // '``-separated string; otherwise return the
        ranked list of entity dicts.

    Returns
    -------
    str or list
        An empty string / empty list when the row has no entities.

    Notes
    -----
    Entities are bucketed by how well a name or synonym matches the text:
    the whole text (in any ``transformComma`` spelling) first, then each
    comma-separated part in order, then everything else. If anything matches
    the whole text only those entities are returned. Unmatched entities are
    returned only when nothing else matched. Each bucket is sorted by
    descending score.
    """
    text = x.iloc[0]

    # Merge into a fresh list: the incoming lists may be shared with other
    # holders (e.g. a result cache), so they must not be modified in place.
    first = x.iloc[1]
    merged = list(first) if isinstance(first, (list, tuple)) else []
    seen_cuis = {entry['cui'] for entry in merged}
    for position in range(2, len(x.index)):
        extra = x.iloc[position]
        if not isinstance(extra, (list, tuple)):
            continue
        for entry in extra:
            if entry['cui'] not in seen_cuis:
                seen_cuis.add(entry['cui'])
                merged.append(entry)

    if not merged:
        return '' if string else []

    text_split = [part.strip().lower() for part in text.split(",")
                  if len(part.strip()) > 2]
    text_transformed = [variant.lower()
                        for variant in transformComma(text, False)]
    # tiers[0]: whole-text match, tiers[1..k]: match on the k-th part,
    # tiers[-1]: no name match at all.
    tiers = [[] for _ in range(len(text_split) + 2)]

    for entry in merged:
        names = {synonym.lower() for synonym in entry['synonyms']}
        names.add(entry['name'].lower())

        if any(variant in names for variant in text_transformed):
            tiers[0].append(entry)
            continue

        for n, part in enumerate(text_split):
            if part in names:
                tiers[n + 1].append(entry)
                break
        else:
            tiers[-1].append(entry)

    def _by_score(entries):
        return sorted(entries, key=lambda entry: entry['score'],
                      reverse=True)

    if tiers[0]:
        results_list = _by_score(tiers[0])
    else:
        results_list = []
        for tier in tiers[1:-1]:
            results_list += _by_score(tier)
        if not results_list:
            results_list = _by_score(tiers[-1])

    if string:
        return ' // '.join(
            [entry['name'] + ' (' + entry['cui'] + '; '
             + ', '.join(entry['tuis']) + ')'
             for entry in results_list])
    return results_list


def breakIntoSpansAndUMLS(text, docs_dict, linker,
                          tuiFilter=None,
                          confidence=0.5,
                          RequireNonObsoleteDef=False,
                          entity_break=False,
                          cui_limit=5,
                          cache_key=None):
    """Collect the UMLS entities linked to ``text``.

    Parameters
    ----------
    text
        The term to map.
    docs_dict
        Mapping from stripped text variant to its parsed document; every
        variant produced by ``transformComma(text, entity_break)`` that is
        longer than two characters must be present.
    linker
        Linker whose ``umls.cui_to_entity`` table is used to look up each
        CUI.
    tuiFilter
        Container of allowed TUIs, or ``None`` to allow all.
    confidence
        Minimum score; ``None`` means 0.
    RequireNonObsoleteDef
        If true, skip entities whose definition is missing or contains
        ``"OBSOLETE"``.
    entity_break
        If true, use the entities found inside each document; otherwise use
        each whole document as a single span.
    cui_limit
        Maximum number of results (ties at the cut-off score are kept);
        ``None`` means 999.
    cache_key
        Key selecting a result cache in ``_cache``. ``None`` disables
        caching.

    Returns
    -------
    list
        Entity dicts (``name``, ``cui``, ``score``, ``synonyms``, ``tuis``,
        ``description``) sorted by descending score. A new list is returned
        on every call so that callers cannot alter cached results by
        mutating it.
    """
    bucket = None
    if cache_key is not None:
        bucket = _cache.setdefault(cache_key, {})
        if text in bucket:
            return list(bucket[text])

    confidence = 0 if confidence is None else confidence
    cui_limit = 999 if cui_limit is None else cui_limit

    docs = [docs_dict[variant.strip()]
            for variant in transformComma(text, entity_break)
            if len(variant) > 2]
    if entity_break:
        entities = [ent for doc in docs for ent in doc.ents]
    else:
        entities = [doc[:] for doc in docs]

    save_cui = {}
    for entity in entities:
        last_score = 1
        count = 0
        for umls_ent in entity._.umls_ents:
            cui = umls_ent[0]
            umls_score = round(umls_ent[1], 4)
            if cui in save_cui:
                # Already recorded from another span: keep the best score.
                if save_cui[cui]['score'] < umls_score:
                    save_cui[cui]['score'] = umls_score
                continue
            if umls_score < confidence:
                break
            # Past the limit, only keep candidates tied with the last one.
            if count >= cui_limit and umls_score < last_score:
                break
            umls_Code = linker.umls.cui_to_entity[cui]
            TUI = umls_Code[3]
            UMLS_Def = umls_Code[4]
            if RequireNonObsoleteDef and \
                    (UMLS_Def is None or "OBSOLETE" in UMLS_Def):
                continue
            if tuiFilter is None or any(tui in tuiFilter for tui in TUI):
                count += 1
                last_score = umls_score
                save_cui[cui] = {'name': umls_Code[1],
                                 'cui': cui,
                                 'score': umls_score,
                                 'synonyms': umls_Code[2],
                                 'tuis': umls_Code[3],
                                 'description': umls_Code[4],
                                 }

    umls_sorted = sorted(save_cui.values(), key=lambda x: x['score'],
                         reverse=True)
    if umls_sorted:
        # Score of the cui_limit-th result; anything tied with it is kept.
        cutoff = umls_sorted[min(cui_limit, len(umls_sorted)) - 1]['score']
        umlsList = [i for i in umls_sorted if i['score'] >= cutoff]
    else:
        umlsList = []

    # Only store when caching was requested; _cache has no entry for None.
    if bucket is not None:
        bucket[text] = umlsList

    return list(umlsList)


Now we can start linking terms. First, load a DataFrame of terms.

In [2]:
import pandas as pd
# Load columns A:C of the preferred-terms sheet. Later cells read the
# "endpoint-organ", "endpoint-system" and "endpoint-name" columns from it.
df = pd.read_excel("HAWC-Ontologies-July2020v2.xlsx",
                   sheet_name="Preferred Terms List-July 2020",
                   usecols="A:C")

Now we have to load the models. The first time you run the command, a large amount of data will need to be downloaded. You can use the parameters to change which scispacy model is loaded, as well as settings such as the minimum confidence threshold. Loading the models will take about a minute once everything is downloaded, but you'll only need to do it once. I would highly recommend using a computer with 32 GB of memory for this, although you can probably get by with 16 if you close Chrome.

In [3]:
# Defaults: model "en_core_sci_sm", confidence 0.5, max_UMLS_Returns 999.
linker, nlp = loadUMLSLinker()

We can also feed all of the text into the model beforehand to speed things up. The input is a series which contains every string that you want to map to a UMLS entity. This method can be easily multithreaded if you have more than a few thousand terms, otherwise it's probably not worth it. Feeding in a series with around 3,000 unique strings took about a minute to run in testing.

In [4]:
# Parse every term of the three endpoint columns once, up front.
docs = loadDocs(
    series=pd.concat(
        [df[column] for column in
         ("endpoint-organ", "endpoint-system", "endpoint-name")]),
    nlp=nlp,
)

All of the computationally heavy steps are already complete. From this point on, every step should be pretty much instantaneous, including the tiered analysis. The code below is an example of how to match terms. In this example, the `endpoint-organ` column is fed into the mapping method, with the output being a new series. `linker`, `docs`, and `nlp` need to be fed into the method along with the series. There are a few other parameters you can add:

- `max_results`: Maximum number of results to return for each entity found in the string. Default is 10.
- `confidence`: Minimum confidence score for a result to be returned. Default is 0.5.
- `tuis`: An iterable containing valid TUIs. Default is `None` (all TUIs valid).
- `entity_break`: Bool. If `False`, the entire search string must match an entity. If `True`, entities can be found on substrings.
- `RequireNonObsoleteDef`: Bool. If `True`, entities whose definition is missing or marked as obsolete are skipped.

In [5]:
def run_tier(series: pd.Series, linker, docs_dict, nlp,
             max_results: int = 10,
             confidence: float = 0.5, tuis=None, entity_break=False,
             RequireNonObsoleteDef=False):
    """Map every value of ``series`` to a list of UMLS entities.

    Parameters
    ----------
    series
        Terms to map.
    linker, docs_dict, nlp
        The linker, the pre-parsed documents and the pipeline; ``nlp`` is
        only used as part of the cache key.
    max_results
        Forwarded to ``breakIntoSpansAndUMLS`` as ``cui_limit``.
    confidence
        Minimum score for a result.
    tuis
        Iterable of allowed TUIs (a single TUI string is also accepted), or
        ``None`` to allow all.
    entity_break, RequireNonObsoleteDef
        Forwarded unchanged to ``breakIntoSpansAndUMLS``.

    Returns
    -------
    pandas.Series
        One list of entity dicts per input value.
    """
    if tuis is None:
        tui_filter = None
        tui_key = None
    else:
        # Materialise once: a generator would otherwise be exhausted by
        # building the cache key and then filter out everything. A bare
        # string is one TUI, not a collection of characters.
        tui_filter = frozenset((tuis,) if isinstance(tuis, str) else tuis)
        # Sorted so that equal filters always produce the same key, and
        # distinct from the None case so "no filter" and "empty filter"
        # never share cached results.
        tui_key = tuple(sorted(tui_filter))

    cache_key = hash(
        (nlp,
         linker,
         max_results if max_results is not None else '',
         confidence if confidence is not None else '',
         tui_key,
         entity_break, RequireNonObsoleteDef,))
    new_series = series.apply(breakIntoSpansAndUMLS,
                              docs_dict=docs_dict,
                              linker=linker,
                              tuiFilter=tui_filter,
                              RequireNonObsoleteDef=RequireNonObsoleteDef,
                              confidence=confidence,
                              entity_break=entity_break,
                              cui_limit=max_results,
                              cache_key=cache_key,
                              )
    return new_series

# Map the organ column, keeping one result per term at a minimum score of 0.5.
df["endpoint-organ_UMLS"] = run_tier(
    df["endpoint-organ"], linker, docs, nlp,
    max_results=1, confidence=0.5)
print(df["endpoint-organ_UMLS"].head())

0    [{'name': 'Serum', 'cui': 'C0229671', 'score':...
1    [{'name': 'Serum', 'cui': 'C0229671', 'score':...
2    [{'name': 'Blood supply aspects', 'cui': 'C000...
3    [{'name': 'Heart', 'cui': 'C0018787', 'score':...
4    [{'name': 'Heart', 'cui': 'C0018787', 'score':...
Name: endpoint-organ_UMLS, dtype: object


There is another method which helps organize the outputs. It can help to filter out some bad outputs. `run_tier` returns a list of UMLS entities for each entry. This filtering function can also convert this list into a string by setting `string` to `True`. The DataFrame that the filter function gets applied to should have at least 2 columns. The first one should be original strings, and the second one should be the list of UMLS entities. More columns of UMLS entities can be added (they will be combined with the second column).

If you want to limit the maximum number of results that are returned (e.g. to 1), this would be the place to add that functionality (rather than above when calling `run_tier`). A list or string is returned by this function. In general, the position in that list returned by this function indicates the quality of the match. That is to say, the first result is the best one. This isn't 100% true (since two matches can be basically tied in importance with how the code is written right now) but is a good guideline.

In [6]:
# Column 0 is the original text, column 1 the entity lists to rank/render.
df['endpoint-organ_UMLS'] = (
    pd.concat([df["endpoint-organ"], df["endpoint-organ_UMLS"]], axis=1)
    .apply(umls_filter, axis=1, result_type='reduce', string=True)
)
print(df["endpoint-organ_UMLS"].head())

0    Serum (C0229671; T031) // Specimen Type - Seru...
1    Serum (C0229671; T031) // Specimen Type - Seru...
2    Blood supply aspects (C0005839; T080) // Vascu...
3                               Heart (C0018787; T023)
4                               Heart (C0018787; T023)
Name: endpoint-organ_UMLS, dtype: object


Finally, the other thing we can do is run a tiered analysis, where we run `run_tier` with different parameters that have descending strictness. Here is an example of that. You can change the parameters of each tier based on the dataset, or add and remove tiers. All of the indexing allows the function to stop running on a value once it finds a match.

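This specific function has 5 cascading tiers (numbered 1–4 and 7) of descending strictness; each one runs only on the values that the previous tiers failed to map. Tier 7 also sets `entity_break=True`. A final pass, tier 8, uses the same settings as tier 7 but runs on every value even if a match was already found.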

In [7]:
class TuiFilters:
    """TUI whitelists for restricting mappings."""

    # Base whitelist of TUIs.
    tuis = {
        "T017", "T018", "T019", "T020", "T021", "T022", "T023", "T024",
        "T025", "T026", "T029", "T030", "T031", "T032", "T033", "T034",
        "T037", "T038", "T039", "T040", "T041", "T042", "T043", "T044",
        "T045", "T046", "T047", "T048", "T049", "T050", "T053", "T054",
        "T055", "T079", "T080", "T081", "T082", "T102", "T109", "T114",
        "T116", "T121", "T123", "T125", "T126", "T127", "T129", "T131",
        "T169", "T184", "T185", "T190", "T191", "T192", "T196", "T201",
    }
    # The base whitelist plus T059.
    tuis_3 = tuis | {"T059"}

def run_tiered_analysis(
        series: pd.Series, linker, nlp, docs, max_results=1, force_limit=True,
        ) -> pd.DataFrame:
    """Map ``series`` through a cascade of progressively looser tiers.

    Tiers 1, 2, 3, 4 and 7 run in that order; each one only sees the values
    for which the previous tier returned nothing. Tier 8 then runs on every
    value and its entities are merged into the result by ``umls_filter``.

    Parameters
    ----------
    series
        Named series of terms to map.
    linker, nlp, docs
        Forwarded to ``run_tier``.
    max_results, force_limit
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``series`` with two columns: ``<name>_UMLS`` (rendered
        entity string) and ``<name>_Tier`` (the tier that produced the
        match, or an empty string).
    """
    cui_limit = 10  # don't change this

    # (label, minimum score, TUI whitelist, extra run_tier keywords)
    cascade = (
        ("1", 0.98, TuiFilters.tuis, {"RequireNonObsoleteDef": True}),
        ("2", 0.85, TuiFilters.tuis, {}),
        ("3", 0.70, TuiFilters.tuis_3, {}),
        ("4", 0.85, None, {}),
        ("7", 0.85, TuiFilters.tuis, {"entity_break": True}),
    )

    labelled = []
    pending = series
    for label, min_score, tui_set, extra in cascade:
        print(f"Running tier {label} on {series.name}")
        hits = run_tier(
            pending, linker, docs, nlp,
            cui_limit, min_score, tui_set, **extra)
        hit_counts = hits.map(len)
        nfound = len(hits.loc[hit_counts != 0])
        print(f'{nfound} names mapped')

        tagged = pd.concat(
            [hits, pd.Series(f'Tier {label}', index=hits.index)], axis=1)
        labelled.append(tagged.loc[hit_counts > 0])
        # Only the still-unmapped values move on to the next tier.
        pending = series.loc[hits.loc[hit_counts == 0].index]

    # `hits` / `hit_counts` now belong to the last cascading tier; whatever
    # it left unmapped gets an empty tier label.
    still_empty = hit_counts == 0
    labelled.append(
        pd.concat([hits, pd.Series('', index=hits.index)],
                  axis=1).loc[still_empty])
    notfound = len(hits.loc[still_empty])

    print(f"Running tier 8 on {series.name}")
    # this tier runs on every value, unlike the ones above
    sweep = run_tier(series, linker, docs, nlp,
                     cui_limit, 0.85, TuiFilters.tuis, entity_break=True)
    sweep.name = series.name + 'Tier8'

    umls_column = series.name + '_UMLS'
    df_new = pd.concat(labelled).sort_index()
    df_new.columns = [umls_column, series.name + '_Tier']

    merged_rows = pd.concat([series, df_new[umls_column], sweep], axis=1)
    df_new[umls_column] = merged_rows.apply(
        umls_filter, axis=1, result_type='reduce', string=True)

    print(f'Done, {notfound} names not mapped')
    return df_new

# Run the full cascade on the name column and show it next to the input.
df_tier = run_tiered_analysis(
    series=df["endpoint-name"], linker=linker, nlp=nlp, docs=docs)
df_comb = pd.concat([df[["endpoint-name"]], df_tier], axis=1)
print(df_comb.head())


Running tier 1 on endpoint-name
215 names mapped
Running tier 2 on endpoint-name
170 names mapped
Running tier 3 on endpoint-name
406 names mapped
Running tier 4 on endpoint-name
6 names mapped
Running tier 7 on endpoint-name
1776 names mapped
Running tier 8 on endpoint-name
Done, 182 names not mapped
         endpoint-name                 endpoint-name_UMLS endpoint-name_Tier
0   Fatty Acid Balance       Fatty Acids (C0015684; T109)             Tier 7
1  Fatty Acids, Total        Fatty Acids (C0015684; T109)             Tier 2
2      Embolus, Aortic    aortic embolus (C0741165; T046)             Tier 2
3       Cardiomyopathy  Cardiomyopathies (C0878544; T047)             Tier 1
4     Heart, Neoplasms    Heart Neoplasm (C0018809; T191)             Tier 1
