In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Hugging Face Hub checkpoint id shared by the tokenizer and the model; keeping it
# in one constant guarantees both are always loaded from the same checkpoint.
MODEL_NAME = "abhibisht89/spanbert-large-cased-finetuned-ade_corpus_v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [2]:
from transformers import pipeline
# Token-classification ("ner") pipeline wrapping the tokenizer and model loaded above.
ner = pipeline("ner",model=model,tokenizer=tokenizer)

In [3]:
# time to test this detection on some test samples

In [4]:
import pandas as pd
from config import OUTPUT_DIR
from os.path import join as pjoin

In [5]:
# Load n2c2.csv from OUTPUT_DIR; the rendered table shows one row per sample with
# `sample_id`, `raw` (note text) and `annotated` (annotation string) columns.
csv_file = pjoin(OUTPUT_DIR,"n2c2.csv")
df = pd.read_csv(csv_file)
# Bare expression so the frame is rendered as the cell output.
df

Unnamed: 0.1,Unnamed: 0,sample_id,raw,annotated
0,0,108809,Admission Date: [**2112-10-6**] ...,T1\tDrug 19078 19085\tAspirin\nT2\tStrength 19...
1,1,115143,Admission Date: [**2146-4-3**] D...,T1\tDrug 5569 5576;5577 5583\tInsulin Lispro\n...
2,2,159079,Admission Date: [**2110-8-7**] D...,T1\tStrength 10013 10017\t20mg\nR1\tStrength-D...
3,3,102027,Admission Date: [**2119-9-16**] ...,T1\tDrug 1014 1017\tFFP\nT2\tDosage 1171 1178\...
4,4,102296,Admission Date: [**2172-9-28**] ...,T1\tDrug 112 122\tQuinolones\nT4\tDrug 1135 11...
...,...,...,...,...
298,298,130440,Admission Date: [**2185-7-6**] D...,T1\tDrug 3838 3844\tflagyl\nT2\tDrug 6019 6032...
299,299,118510,Admission Date: [**2183-1-9**] D...,T1\tDrug 11649 11656\tBicitra\nT3\tDrug 14147 ...
300,300,174150,Admission Date: [**2186-6-14**] ...,T1\tDrug 1674 1682\tCefepime\nT2\tStrength 168...
301,301,107047,Admission Date: [**2134-10-9**] ...,T1\tStrength 14094 14099\t20 mg\nT2\tForm 1411...


In [6]:
from io import StringIO
# Pick one record and keep its raw note text for the experiments below.
sample_index = 0
row = df.iloc[sample_index]
raw_text = row['raw']
# Parse the record's `annotated` string as a header-less, tab-separated table.
# In the rendered output, `type` holds the annotation id (T*/R*), `name` the label
# followed by offsets or relation arguments, and `value` the covered text
# (blank for the R* rows shown).
annotated_tsv = pd.read_csv(StringIO(row['annotated']),sep='\t',header=None,names=['type','name','value'])
annotated_tsv

Unnamed: 0,type,name,value
0,T1,Drug 19078 19085,Aspirin
1,T2,Strength 19086 19092,325 mg
2,T3,Route 19093 19095,PO
3,T4,Drug 19465 19478;19479 19482,Ciprofloxacin HCl
4,T5,Strength 19483 19489,500 mg
...,...,...,...
168,R69,Reason-Drug Arg1:T102 Arg2:T59,
169,R70,Reason-Drug Arg1:T103 Arg2:T16,
170,R71,ADE-Drug Arg1:T94 Arg2:T8,
171,T93,Route 12791 12793,IV


In [7]:
# Run the model directly (bypassing the pipeline) on one sentence to inspect the raw logits.
inputs = tokenizer("Hello world i have a slight fever",return_tensors="pt")
# NOTE(review): this forward pass tracks gradients (the displayed logits carry a
# grad_fn); consider torch.no_grad() if this is inference-only.
outputs = model(**inputs)

In [8]:
# Display the TokenClassifierOutput returned by the forward pass in the previous cell.
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[ 1.9963, -0.7155, -0.7707, -0.1233, -0.4926],
         [ 3.8603, -1.0510, -1.3438, -1.0383, -1.1842],
         [ 3.9350, -1.2033, -1.1814, -1.3157, -1.0649],
         [ 3.7329, -0.9513, -1.3155, -1.1470, -1.1257],
         [ 3.9507, -1.3179, -1.1036, -1.3013, -1.0481],
         [ 3.9218, -1.3843, -1.2534, -1.0409, -1.0559],
         [ 2.6338, -1.6201, -1.6524,  1.0794, -1.1400],
         [-0.9270, -1.1644, -1.2274,  3.3766, -0.2071],
         [ 1.9958, -0.7154, -0.7709, -0.1228, -0.4926]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [9]:
# Same sentence through the pipeline: the output is one dict per tagged token with
# the keys entity, score, index, word, start and end.
ner("Hello world i have a slight fever")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-ADR',
  'score': 0.94164777,
  'index': 7,
  'word': 'fever',
  'start': 28,
  'end': 33}]

In [10]:
# Tag the first 512 characters of the sample note (a character slice, not a token limit).
test = ner(raw_text[:512])

In [49]:
import numpy as np
from functools import reduce
from typing import List, Dict
def verify_sub_words_are_consecutive(stack):
    """Assert that the tokens in ``stack`` occupy consecutive positions.

    Args:
        stack: sequence of token dicts, each carrying an integer ``'index'`` key.

    Raises:
        AssertionError: if any token's index is not exactly one greater than
            the index of the token before it. Empty and single-token stacks
            always pass.
    """
    indices = [subword['index'] for subword in stack]
    # Check every adjacent pair, not just the last one.
    for prev_index, next_index in zip(indices, indices[1:]):
        if prev_index + 1 != next_index:
            raise AssertionError(
                f"non-consecutive token indices {prev_index} -> {next_index} in {indices}"
            )

def word_accumulator_fn(info_dict : Dict[str,List],curr_tok):
    """Fold step that groups a stream of B-/I- tagged tokens into entity stacks.

    Intended to be used with ``functools.reduce`` over the token dicts of a
    single entity type.

    Args:
        info_dict: accumulator with two keys: ``'word_list'`` (finished stacks,
            each a list of token dicts) and ``'curr_stack'`` (the stack still
            being built). It is mutated in place.
        curr_tok: token dict with at least an ``'entity'`` key whose value
            starts with ``'B'`` or ``'I'``.

    Returns:
        The same ``info_dict`` object, updated.

    Raises:
        AssertionError: if the tag starts with neither ``'B'`` nor ``'I'``, or
            if the open stack does not begin with a ``'B'`` tag.
    """
    tag = curr_tok['entity']
    if tag.startswith('I'):
        open_stack = info_dict['curr_stack']
        if len(open_stack) == 0:
            # Continuation tag with no open entity: ignore it.
            return info_dict
        # Let the failure propagate instead of dropping into a debugger.
        assert open_stack[0]['entity'].startswith('B'), "WHY DOES IT TAG SOMETHING WITHOUT A BEGINNING WTF?"
        open_stack.append(curr_tok)
        return info_dict

    assert tag.startswith('B'), "THERE IS SOME SUSSY STUFF GOING ON WITH THE TAGGER. OUTPUT OTHER THAN I- OR B-???"
    # A new entity begins: close the previous one, but never store an empty
    # stack (the very first B- token would otherwise push one).
    if info_dict['curr_stack']:
        info_dict['word_list'].append(info_dict['curr_stack'])
    info_dict['curr_stack'] = [curr_tok]
    return info_dict
        
def condense_stack(stack):
    """Collapse one stack of token dicts into ``(text, mean_score)``.

    Pieces whose ``'word'`` starts with ``"##"`` are glued onto the previous
    piece without the marker; every other piece is preceded by a single space.
    Leading whitespace of the joined text is stripped. The score is the numpy
    mean of the tokens' ``'score'`` values.
    """
    fragments = []
    for token in stack:
        text = token['word']
        if text.startswith("##"):
            fragments.append(text[2:])
        else:
            fragments.append(' ' + text)
    merged_text = ''.join(fragments).lstrip()

    confidences = [token['score'] for token in stack]
    return merged_text, np.mean(confidences)

def get_tokens_from_ner_specific(raw_outputs,entity_type):
    '''gets proper tokens from nlp pipeline (merges subwords as well)'''
    # output is a list of {entity : str, score : float, index : int, word : str, start : int, end : int}
    # goal: merge consecutive indices into one word
    outputs = filter(lambda output: output['entity'].endswith(entity_type),raw_outputs)
    word_stack = reduce(word_accumulator_fn,outputs,dict(word_list=[],curr_stack=[]))
    all_stacks = word_stack['word_list']
    if len(word_stack['curr_stack']) > 0: 
        all_stacks.append(word_stack['curr_stack'])
    (verify_sub_words_are_consecutive(stack) for stack in all_stacks) # little assert statement for sanity check 
    return list(filter(lambda el: el[0] != '',map(condense_stack,word_stack['word_list'])))

def get_tokens_from_ner(raw_outputs,entity_list=['ADR','DRUG']):
    """Group pipeline output by entity type.

    Returns a dict mapping each entry of ``entity_list`` to the result of
    ``get_tokens_from_ner_specific`` for that entity type.
    """
    grouped = {}
    for entity_type in entity_list:
        grouped[entity_type] = get_tokens_from_ner_specific(raw_outputs, entity_type)
    return grouped

def identify_info(text):
    """Run the ``ner`` pipeline on ``text`` and group the result by entity type."""
    raw_outputs = ner(text)
    return get_tokens_from_ner(raw_outputs)

def identify_infos(texts):
    """Run the ``ner`` pipeline on ``texts`` and group each returned item by entity type.

    Returns a list with one ``get_tokens_from_ner`` result per item yielded by
    ``ner(texts)``.
    """
    grouped_outputs = []
    for output in ner(texts):
        grouped_outputs.append(get_tokens_from_ner(output))
    return grouped_outputs

In [50]:
# Smoke test: extract ADR/DRUG mentions from the first 512 characters of the sample note.
identify_info(raw_text[:512])

{'ADR': [('abdominal pain', 0.93423426),
  ('chest and abdominal pain', 0.8709032),
  ('odynophagia', 0.962869)],
 'DRUG': [('ativan', 0.96955997)]}

In [51]:
# Annotated span texts for the same record, for eyeballing against the predictions above.
annotated_tsv['value'].values

array(['Aspirin', '325 mg', 'PO', 'Ciprofloxacin HCl', '500 mg', 'Q12H',
       'PO', 'ativan', 'butalbital -acetaminophen-caff',
       'Metoprolol Tartrate', '12.5 mg', 'BID', 'PO', 'Omeprazole',
       '40 mg', 'Ativan', 'DiCYCLOmine', '10 mg', 'QID', 'PO', 'Aspirin',
       'Ativan', '50 mg -325 mg-40 mg', 'Omeprazole', '40 mg', 'PO',
       'Acetaminophen-Caff- Butalbital', 'TAB', 'Q6H :PRN', 'PO',
       'ciprofloxacin [Cipro]', '500 mg', 'tablet (s)', '1', 'by mouth',
       'Aspirin EC', '325 mg', 'PO', 'metoprolol tartrate', '25 mg',
       'tablet (s)', 'by mouth', 'ATIVAN', 'Aspirin', '325mg',
       'once daily', 'Atorvastatin', '20 mg', 'PO', 'DiCYCLOmine',
       '10 mg', 'QID', 'PO', 'Atorvastatin', '20 mg', 'PO', 'Tylenol',
       'PPI', 'GI cocktail', 'delerious', 'Hadol', 'agitation', 'PO',
       'DAILY', 'Donnatol', '10 mL', 'PO', 'BID:PRN', 'abdominal pain',
       'DAILY', 'DAILY', 'DAILY', 'DAILY', 'DAILY', 'headache',
       'tablet(s)', 'by mouth', 'q 4 hours',

In [68]:
from nltk.tokenize import sent_tokenize

def predict_text(text,key):
    """Return all predictions of entity type ``key`` found in ``text``.

    The text is split into sentences with ``sent_tokenize``, the sentences are
    passed to ``identify_infos``, and the per-sentence lists stored under
    ``key`` are concatenated in order.
    """
    sentences = sent_tokenize(text)
    predictions = []
    for sentence_output in identify_infos(sentences):
        for ad in sentence_output[key]:
            predictions.append(ad)
    return predictions

def extract_entities_text(text):
    """Return the 'ADR' and 'DRUG' predictions for ``text`` in one dict.

    The text is sentence-split and tagged once, and both entity lists are read
    from that single result (rather than re-running the model for each key).

    Returns:
        ``{'ADR': [...], 'DRUG': [...]}`` where each list concatenates, in
        sentence order, the entries ``identify_infos`` produced for that key.
    """
    sentence_outputs = identify_infos(sent_tokenize(text))
    return {
        key: [ad for sentence_output in sentence_outputs for ad in sentence_output[key]]
        for key in ['ADR','DRUG']
    }

def extract_drug_names_pred(text):
    """Return the 'DRUG' predictions for ``text`` (delegates to ``predict_text``)."""
    drug_predictions = predict_text(text, 'DRUG')
    return drug_predictions

def extract_adv_names_pred(text):
    """Return the 'ADR' predictions for ``text`` (delegates to ``predict_text``)."""
    adr_predictions = predict_text(text, 'ADR')
    return adr_predictions

In [69]:
# Look up a single annotation by its id; T94 is the ADE entry rendered below.
annotated_tsv[annotated_tsv['type'] == "T94"]

Unnamed: 0,type,name,value
154,T94,ADE 17894 17908,acute delerium


In [70]:
def is_drug(row):
    """Tell whether ``row`` is a drug annotation.

    True when ``row['name']`` starts with "Drug" and ``row['type']`` starts
    with "T".
    """
    if not row['name'].startswith("Drug"):
        return False
    return row['type'].startswith("T")

def is_adv(row):
    """Tell whether ``row`` is an adverse-event annotation.

    True when ``row['name']`` starts with "ADE" and ``row['type']`` starts
    with "T".
    """
    if not row['name'].startswith("ADE"):
        return False
    return row['type'].startswith("T")
def extract_drug_names_truth(annotated_tsv):
    """Return the lower-cased ``value`` of every row that ``is_drug`` accepts, in row order."""
    drug_names = []
    for _, record in annotated_tsv.iterrows():
        if is_drug(record):
            drug_names.append(record['value'].lower())
    return drug_names

def extract_adv_names_truth(annotated_tsv):
    """Return the lower-cased ``value`` of every row that ``is_adv`` accepts, in row order."""
    adverse_names = []
    for _, record in annotated_tsv.iterrows():
        if is_adv(record):
            adverse_names.append(record['value'].lower())
    return adverse_names
# matching_rows