### Import packages

In [1]:
import json
import os
import warnings
from copy import deepcopy

import lxml.html as lh
import numpy as np
import pandas as pd
import requests
from lxml.html import fromstring

### Import NER output data

In [19]:
# Read the NER output CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('ner_output/ex_51-100_ner.csv')

### Inspect data

In [3]:
# (rows, columns) of the loaded NER output
df.shape

(1202, 8)

In [5]:
# Number of rows per value of the 'ner_model' column
df['ner_model'].value_counts()

ner_clinical      440
ner_diseases      321
ner_healthcare    312
ner_jsl           129
Name: ner_model, dtype: int64

### Remove stopwords

In [7]:
import nltk
# Make sure the NLTK stopword corpus is available locally before it is loaded.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopwords; used further down to drop tokens from df['text'].
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/sd363/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
#replace NaN with blank string to make it easier for string manipulation
#note: fillna('') is applied to every column of df, not only to 'text'
df = df.fillna('')

In [22]:
#lowercase every NER term so later matching is case-insensitive
df['text'] = df['text'].map(lambda term: term.lower())

In [23]:
#remove stopwords
#each term is split on whitespace, stopword tokens are dropped, and the rest is re-joined with single spaces
#membership is tested against a set (constant time per token); `stop` itself is left untouched
stop_set = set(stop)
df['text'] = df['text'].str.split().apply(lambda x: ' '.join(k for k in x if k not in stop_set))

# Define UMLS functions

In [31]:
# Base URL of the login server; Authentication.gettgt posts to uri + auth_endpoint.
uri="https://utslogin.nlm.nih.gov"
# Two authentication endpoints are available on that server:
#   option 1 - username/pw authentication at /cas/v1/tickets/ (not used in this notebook)
#   option 2 - api key authentication at /cas/v1/api-key (the one selected below)
auth_endpoint = "/cas/v1/api-key"

class Authentication:
    """API-key client for the ticket endpoints of the login server.

    ``gettgt`` posts the API key to the module-level ``uri + auth_endpoint`` and
    returns the form-action URL(s) found in the HTML reply. ``getst`` posts to
    one of those URLs to obtain a ticket for ``self.service``.
    """

    # Seconds to wait for the login server. Without a timeout ``requests`` can
    # block forever on a stalled connection and hang the kernel.
    REQUEST_TIMEOUT = 30

    # Headers sent with both POST requests.
    FORM_HEADERS = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain",
        "User-Agent": "python",
    }

    def __init__(self, apikey):
        """Store the API key and the service identifier that tickets are requested for."""
        self.apikey = apikey
        self.service = "http://umlsks.nlm.nih.gov"

    def gettgt(self):
        """Post the API key and return the list of ``//form/@action`` values in the reply.

        The first entry is the URL that ``getst`` must be called with; it looks
        similar to https://utslogin.nlm.nih.gov/cas/v1/tickets/TGT-...-cas.
        The list is empty when the reply contains no form.

        Raises whatever ``requests`` raises on a timeout or on an HTTP error
        status (via ``raise_for_status``), so a rejected key is reported here
        rather than as a parsing or indexing failure later on.
        """
        r = requests.post(
            uri + auth_endpoint,
            data={'apikey': self.apikey},
            headers=dict(self.FORM_HEADERS),
            timeout=self.REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        response = fromstring(r.text)
        ## extract the entire URL needed from the HTML form (action attribute) returned
        ## we make a POST call to this URL in the getst method
        return response.xpath('//form/@action')

    def getst(self, tgt):
        """Post ``self.service`` to the ``tgt`` URL and return the response body (the ticket).

        An HTTP error status raises instead of returning the error page's text,
        which would otherwise be passed on as if it were a valid ticket.
        """
        r = requests.post(
            tgt,
            data={'service': self.service},
            headers=dict(self.FORM_HEADERS),
            timeout=self.REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        return r.text

In [26]:
# The API key is a personal credential, so it is read from the UMLS_API_KEY
# environment variable instead of being stored in the notebook. Export the
# variable before starting the kernel. The key that used to be hard-coded here
# has been published with the notebook and should be regenerated.
apikey = os.environ.get('UMLS_API_KEY', '')
if not apikey:
    warnings.warn('UMLS_API_KEY is not set; UMLS authentication requests will not succeed.')

In [27]:
def GetTGT(apikey):
    """Return the first form-action URL reported by ``Authentication(apikey).gettgt()``.

    That URL is what ``Authentication.getst`` posts to when requesting a ticket.
    """
    form_actions = Authentication(apikey).gettgt()
    return form_actions[0]

In [28]:
def GetTicket(apikey, tgt):
    """Return the ticket obtained by posting to ``tgt`` via ``Authentication.getst``."""
    auth = Authentication(apikey)
    return auth.getst(tgt)

In [151]:
def StringToCUI(string, apikey, tgt):
    """Search UMLS for a term (restricted to ``sabs=MSH``) and return its first CUI.

    Parameters
    ----------
    string : str
        Term sent as the ``string`` parameter of ``/search/current``.
    apikey : str
        API key used to request a ticket for this call.
    tgt : str
        Ticket-granting URL, as returned by ``GetTGT``.

    Returns
    -------
    dict
        ``{'string': string, 'cui': ..., 'num_cuis': ...}``. ``cui`` is the
        ``ui`` of the first search result and ``num_cuis`` the number of
        results on the returned page. ``cui`` is ``''`` and ``num_cuis`` is
        ``0`` when the response carries an ``error`` key, holds the
        ``NO RESULTS`` placeholder, or has an empty result list.
    """
    #get ticket
    ticket = GetTicket(apikey, tgt)

    #define request parameters
    #the term is passed through `params` so requests URL-encodes it; formatting it
    #into the URL by hand breaks the query for terms containing '&', '#', '+' or '='
    query = {'string': string, 'sabs': 'MSH', 'ticket': ticket, 'pageSize': 500}

    #define uri and endpoints
    uri = "https://uts-ws.nlm.nih.gov/rest"
    endpoint = '/search/current'

    #execute call (the timeout keeps one stalled request from hanging the whole loop)
    r = requests.get(uri + endpoint, params=query, timeout=60)
    r.encoding = 'utf-8'
    items = json.loads(r.text)

    #an error payload has no 'result' key, so it must not reach the lookups below
    hits = [] if 'error' in items else items['result']['results']

    if not hits or hits[0]['name'] == 'NO RESULTS':
        return {'string': string, 'cui': '', 'num_cuis': 0}

    return {'string': string, 'cui': hits[0]['ui'], 'num_cuis': len(hits)}

In [102]:
def CUIToMSH(cui, apikey, tgt):
    """Return the MSH source-descriptor IDs attached to a CUI's atoms.

    Parameters
    ----------
    cui : str
        Concept identifier, e.g. the ``cui`` value produced by ``StringToCUI``.
    apikey : str
        API key used to request a ticket for this call.
    tgt : str
        Ticket-granting URL, as returned by ``GetTGT``.

    Returns
    -------
    dict
        ``{'cui': cui, 'mesh_ids': [...], 'num_mesh_ids': int}``. ``mesh_ids``
        holds the sorted, de-duplicated last path segment of every atom's
        ``sourceDescriptor``. The list is empty for a blank ``cui``, for a
        response with an ``error`` key, and when no atom has a descriptor.
    """
    #a blank cui is StringToCUI's "no match" value; it always yielded an empty
    #result, so skip the ticket request and the lookup for it
    if not cui:
        return {'cui': cui, 'mesh_ids': [], 'num_mesh_ids': 0}

    #get ticket
    ticket = GetTicket(apikey, tgt)

    #define request parameters
    #NOTE(review): only one page of up to 500 atoms is read - confirm no CUI exceeds that
    query = {'sabs': 'MSH', 'ticket': ticket, 'pageSize': 500}

    #define uri and endpoints
    uri = "https://uts-ws.nlm.nih.gov/rest"
    endpoint = '/content/current/CUI/{0}/atoms'.format(cui)

    #execute call (the timeout keeps one stalled request from hanging the whole loop)
    r = requests.get(uri + endpoint, params=query, timeout=60)
    r.encoding = 'utf-8'
    items = json.loads(r.text)

    mesh_ids = set()
    if 'error' not in items:
        for atom in items['result']:
            descriptor = atom.get('sourceDescriptor')
            #atoms without a descriptor URL are skipped instead of raising
            if isinstance(descriptor, str):
                mesh_ids.add(descriptor.split('/')[-1])

    #sorted so the output is the same on every run (set order is arbitrary)
    unique_ids = sorted(mesh_ids)
    return {'cui': cui, 'mesh_ids': unique_ids, 'num_mesh_ids': len(unique_ids)}

### Execute UMLS API Calls

In [140]:
# Request one ticket-granting URL and reuse it for every lookup below.
# NOTE(review): the bare `tgt` echoes the URL into the saved cell output; since it is
# what tickets are requested from, consider not displaying it in a shared notebook.
tgt = GetTGT(apikey)
tgt

'https://utslogin.nlm.nih.gov/cas/v1/api-key/TGT-519519-utdzez0ffldEiKCrK6qd9JWMV1hAe56M7cyjsbvgMVKHfGgkyg-cas'

In [142]:
# One search per distinct term in df['text']; each call requests its own ticket
# before the search, so the cell makes two HTTP requests per term.
a = [StringToCUI(x, apikey, tgt) for x in df['text'].unique().tolist()]

In [154]:
# One atom lookup per distinct CUI found above (the blank CUI of unmatched terms included).
b = [CUIToMSH(x, apikey, tgt) for x in pd.DataFrame(a)['cui'].unique().tolist()]

In [197]:
# Tabulate the CUI -> MeSH ID lookups (one row per distinct CUI)
pd.DataFrame(b)

Unnamed: 0,cui,mesh_ids,num_mesh_ids
0,C0003864,[D001168],1
1,,[],0
2,C0037315,[D012891],1
3,C0011875,[D003925],1
4,C0038454,[D020521],1
...,...,...,...
162,C0002950,[D000785],1
163,C0043250,[D014947],1
164,C0005938,[D015519],1
165,C0002033,[D000461],1


In [192]:
# Request parameters for the MeSH SPARQL endpoint queried in the next cell.
# The query starts from the tree numbers of the hard-coded descriptor mesh:D002056,
# follows meshv:parentTreeNumber+ to every ancestor tree number, and returns each
# ancestor with its label, keeping only starting tree numbers that match the regex "C".
# NOTE(review): the descriptor ID is fixed in the query text; presumably it should be
# driven by the mesh_ids collected above - confirm.
params = {
    'format': 'JSON',
    'query': '''PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>

SELECT ?treeNum ?ancestorTreeNum ?ancestor ?alabel
FROM <http://id.nlm.nih.gov/mesh>

WHERE {
   mesh:D002056 meshv:treeNumber ?treeNum .
   ?treeNum meshv:parentTreeNumber+ ?ancestorTreeNum .
   ?ancestor meshv:treeNumber ?ancestorTreeNum .
   ?ancestor rdfs:label ?alabel
   FILTER(REGEX(?treeNum,"C"))
}

ORDER BY ?ancestorTreeNum'''
}

In [193]:
# Run the SPARQL query. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status reports an HTTP error here instead of letting it
# surface as a JSON decoding failure in the next cell.
sparql = requests.get('http://id.nlm.nih.gov/mesh/sparql', params=params, timeout=60)
sparql.raise_for_status()

In [194]:
# Decode the response body as UTF-8, then collect, for every result row,
# the ancestor tree-number value and the ancestor label value.
sparql.encoding = 'utf-8'
bindings = json.loads(sparql.text)['results']['bindings']
tree_num = list(map(lambda row: row['ancestorTreeNum']['value'], bindings))
tree_lab = list(map(lambda row: row['alabel']['value'], bindings))

In [195]:
# Labels of the ancestors returned by the query
tree_lab

['Wounds and Injuries']

In [196]:
# Tree-number values of those ancestors, in the same order as tree_lab
tree_num

['http://id.nlm.nih.gov/mesh/C26']