# UMLS API

This is a brief introduction to querying the UMLS API using Python.

Details on the API: https://documentation.uts.nlm.nih.gov/rest/home.html

First, run the cell below.

In [7]:
import requests
from lxml.html import fromstring
import json
import time
import re
import os


class query_umls:
    """Query the UMLS REST API for information about concepts (CUIs).

    On construction the object requests a ticket-granting ticket (TGT) from
    the UTS login service. Every later request obtains a fresh single-use
    service ticket from that TGT, and the TGT itself is renewed when it is
    close to the age limit used by this class.

    API details: https://documentation.uts.nlm.nih.gov/rest/home.html

    Parameters
    ----------
    apikey : str or None
        UMLS API key. When None, the ``UMLS_API_KEY`` environment variable
        is used instead (an empty string if it is not set).
    atoms : bool
        Whether to pull atom names (synonyms).
    definitions : bool
        Whether to pull CUI definitions.
    types : bool
        Whether to pull semantic types (TUIs).
    """

    # Age in seconds after which the TGT is renewed: 28800 s (8 hours)
    # minus a 300 s safety margin.
    _TGT_MAX_AGE = 28800 - 300
    # Paged requests stop after this many pages (pages 1 through 9).
    _MAX_PAGES = 9
    _PAGE_SIZE = 100
    # Pause in seconds before every paged request.
    _REQUEST_DELAY = 0.5
    # Headers sent with both authentication requests.
    _FORM_HEADERS = {"Content-type": "application/x-www-form-urlencoded",
                     "Accept": "text/plain", "User-Agent": "python"}
    # The TUI is read from the end of the semantic type URI.
    _TUI_PATTERN = re.compile(r'/TUI/(T\d{3})$')

    def __init__(self, apikey=None, atoms=True, definitions=True, types=True):
        """Store the settings and request the first TGT."""
        # info for authentication
        self.apikey = apikey
        self.auth_service = "http://umlsks.nlm.nih.gov"
        self.auth_uri = "https://utslogin.nlm.nih.gov"
        self.auth_endpoint = "/cas/v1/api-key"
        self.tgt = None
        self.tgt_time = None
        self._gettgt()

        # what to request
        self.options = {'atoms': atoms, 'definitions': definitions,
                        'types': types}

        # info for data requests
        self.data_uri = 'https://uts-ws.nlm.nih.gov/rest'
        self.data_endpoint = '/content/current/CUI/'

    def set_options(self, atoms=True, definitions=True, types=True):
        """Replace all request options without requesting a new TGT.

        Options that are not passed are reset to their default (True).
        """
        self.options = {'atoms': atoms, 'definitions': definitions,
                        'types': types}

    @staticmethod
    def _has_value(value):
        """Return False for None and for the string 'NONE' (any case)."""
        if value is None:
            return False
        return not (isinstance(value, str) and value.lower() == 'none')

    def _gettgt(self):
        """Request a TGT and record the time it was obtained.

        The TGT URL is taken from the ``action`` attribute of the form in
        the login response. If the response contains no such form (for
        example because the key was rejected), parsing or indexing fails
        and the exception propagates to the caller.
        """
        key = self.apikey
        if key is None:
            key = os.environ.get('UMLS_API_KEY', '')
        r = requests.post(self.auth_uri + self.auth_endpoint,
                          data={'apikey': key},
                          headers=dict(self._FORM_HEADERS))
        response = fromstring(r.text)
        self.tgt = response.xpath('//form/@action')[0]
        self.tgt_time = time.time()

    def _tgt_check(self):
        """Renew the TGT if it is missing or older than the age limit."""
        if self.tgt is None or self.tgt_time is None or \
                time.time() - self.tgt_time > self._TGT_MAX_AGE:
            self._gettgt()

    def _getst(self):
        """Return a single-use service ticket obtained from the TGT."""
        self._tgt_check()
        r = requests.post(self.tgt, data={'service': self.auth_service},
                          headers=dict(self._FORM_HEADERS))
        return r.text

    def get_info(self, cuis=None):
        """Get info for a single CUI or a list of CUIs.

        Parameters
        ----------
        cuis : str or iterable of str, optional
            One CUI or several CUIs. Omitting it yields an empty list.

        Returns
        -------
        list of dict
            One entry per CUI, in input order. Each entry has the keys
            ``name``, ``atoms``, ``definitions`` and ``types``; it is an
            empty dict when the request for that CUI did not return
            status 200.
        """
        if cuis is None:
            return []
        if isinstance(cuis, str):
            cuis = [cuis]
        result_list = []
        for cui in cuis:
            r = requests.get(self.data_uri + self.data_endpoint + cui,
                             params={'ticket': self._getst()})
            if r.status_code != 200:
                print(f'Concept "{cui}" not found.')
                result_list.append({})
                continue
            d = json.loads(r.text)
            result_list.append(self._parse_data(d['result'], self.options))
        return result_list

    def _parse_data(self, res, options):
        """Build the result dict for one concept.

        ``res`` is the ``result`` object of a CUI response and ``options``
        is a dict with the boolean keys ``atoms``, ``definitions`` and
        ``types``. A value stays None when its option is disabled or when
        the response holds None / 'NONE' for it.
        """
        cui_data = {'name': res['name'], 'atoms': None,
                    'definitions': None, 'types': None}

        # Sources seen while reading atoms; used below to filter definitions.
        sources = set()
        atom_url = res['atoms']
        if options['atoms'] and self._has_value(atom_url):
            names = set()
            # Iterate the pairs directly rather than unpacking zip(*pairs),
            # which fails when the list of pairs is empty.
            for name, source in self._get_atoms(atom_url):
                if name is not None:
                    names.add(name)
                if source is not None:
                    sources.add(source)
            cui_data['atoms'] = names

        def_url = res['definitions']
        if options['definitions'] and self._has_value(def_url):
            cui_data['definitions'] = self._get_definitions(def_url, sources)

        tui_list = res['semanticTypes']
        if options['types'] and self._has_value(tui_list):
            cui_data['types'] = self._get_semanticTypes(tui_list)

        return cui_data

    def _get_pages(self, url, extra_params=None):
        """Collect the ``result`` items of a paged endpoint.

        Requests pages starting at 1 and stops at the first response whose
        status is not 200, at the last page reported by the response, or
        after ``_MAX_PAGES`` pages.
        """
        items = []
        for page in range(1, self._MAX_PAGES + 1):
            time.sleep(self._REQUEST_DELAY)
            # every request needs its own single-use ticket
            params = {'ticket': self._getst()}
            if extra_params:
                params.update(extra_params)
            params['pageNumber'] = page
            params['pageSize'] = self._PAGE_SIZE
            r = requests.get(url, params=params)
            if r.status_code != 200:
                break
            rd = json.loads(r.text)
            items += rd['result']
            if rd['pageNumber'] >= rd['pageCount']:
                break
        return items

    def _get_atoms(self, url):
        """Get (name, rootSource) pairs of the English atoms of a CUI.

        Returns ``[(None, None)]`` when no atoms were retrieved at all.
        Atoms whose name is None or 'NONE' are dropped, so the list can
        also be empty.
        """
        items = self._get_pages(url, {'language': 'ENG'})
        if not items:
            return [(None, None)]
        return [(i['name'], i['rootSource'])
                for i in items if self._has_value(i['name'])]

    def _get_definitions(self, url, sources=None):
        """Get the set of definition texts for a CUI.

        Only definitions flagged ``sourceOriginated`` are kept. When
        ``sources`` is non-empty, only definitions whose ``rootSource`` is
        in it are kept; an empty or missing ``sources`` disables that
        filter.
        """
        return {i['value'] for i in self._get_pages(url)
                if self._has_value(i['value'])
                and i['sourceOriginated']
                and (not sources or i['rootSource'] in sources)}

    def _get_semanticTypes(self, tui_list):
        """Get semantic types as a list of (name, TUI) tuples.

        Pull the TUI from the uri instead of making another request.
        Entries whose uri does not end in ``/TUI/Txxx`` are skipped.
        """
        result_list = []
        for info in tui_list:
            tui_name = info['name']
            found = self._TUI_PATTERN.search(info['uri'])
            if found:
                result_list.append((tui_name, found.group(1)))
        return result_list

Now we can create a query object. It accepts a few parameters, shown below with their defaults. All arguments are optional. If you do not provide an API key, the script will look for the environment variable `UMLS_API_KEY`. If you don't pass an API key and the environment variable is not set, you'll get a `ParserError`.

Other arguments:
- `atoms`: Whether to pull synonyms
- `definitions`: Whether to pull CUI definitions
- `types`: Whether to pull TUIs

In [13]:
# Alternative ways to create the query object (all arguments are optional):
# q = query_umls(apikey=None, atoms=True, definitions=True, types=True)
# q = query_umls()  # needs the UMLS_API_KEY environment variable
# Put your UMLS API key here. An empty string is not None, so it is sent
# as-is and the UMLS_API_KEY environment variable is not consulted.
apikey = ''
# Creating the object immediately requests a ticket-granting ticket (TGT).
q = query_umls(apikey=apikey)

Once you've created the query object, you can use it to pull data for different CUIs. All you do is provide a list of CUIs to the method `get_info`. The output will be a list of dictionaries with 4 keys: `name`, `atoms`, `definitions`, and `types`. The order will be the same as the input. You can use the same object to run `get_info` as many times as you want.

If a dictionary is empty, the CUI was not found. If a value is `None`, it was not enabled in options or there was no relevant data in UMLS. If a value is an empty list or set, there was most likely something wrong with how the data was parsed.

In [14]:
# CUIs to look up; a single CUI string is also accepted by get_info.
cui_list = ['C0006106', 'C0007765', 'C0009592']
# One dict per CUI, in the same order as cui_list.
data = q.get_info(cui_list)
# Show only the result for the first CUI.
print(f'Data for CUI "{cui_list[0]}"')
print(data[0])

Data for CUI "C0006106"
{'name': 'Brain Chemistry', 'atoms': {'Chemistry, Brain', 'Brain Chemistry', 'Brain Chemistries', 'brain chemistry', 'Brain chemistry', 'Chemistries, Brain'}, 'definitions': {'Changes in the amounts of various chemicals (neurotransmitters, receptors, enzymes, and other metabolites) specific to the area of the central nervous system contained within the head. These are monitored over time, during sensory stimulation, or under different disease states.'}, 'types': [('Organism Function', 'T040')]}


If you want to change what you're requesting without creating a new object, there's an easy way to do that (every time you recreate the object, a request for a new authentication ticket is sent).

In [12]:
# Stop pulling atoms. Options that are not passed (definitions, types) are
# reset to their default of True; the existing ticket is kept.
q.set_options(atoms=False)

One quirk is that if you set `atoms` to `False`, but still request a definition, the definitions may be in many different languages. This is because the only way to filter language for the definitions is to filter by source, and the list of sources to filter by is determined when pulling the list of atoms. It's possible to compile a list of sources external to that method, but it's a very large list.