# Data

In [1]:
import requests
import json
import pandas as pd
from pandas import json_normalize

# Display settings
pd.set_option('max_rows', 99999)
pd.set_option('max_colwidth', 400)
%load_ext google.colab.data_table

# Mount drive to save results
from google.colab import drive
drive.mount('/content/drive')

# https://data.cms.gov/cms-innovation-center-programs/cms-innovation-models-overview/innovation-center-data-and-reports
cms_req = requests.get('https://data.cms.gov/data-api/v1/dataset/c0451a3a-a86c-4bd4-a0b7-c93e6b1f1257/data')
cms_dict = json.loads(cms_req.text)
cms_df = json_normalize(cms_dict)

In [1]:
# Oncology sample texts from https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/27.Oncology_Model.ipynb
# The three strings below are the inputs every NER approach in this notebook is run against.
# onco_1: a single-paragraph case report (history, treatment, pathology and staging).
onco_1 = '''A 65-year-old woman had a history of debulking surgery, bilateral oophorectomy with omentectomy, total anterior hysterectomy with radical pelvic lymph nodes dissection due to ovarian carcinoma (mucinous-type carcinoma, stage Ic) 1 year ago. Patient's medical compliance was poor and failed to complete her chemotherapy (cyclophosphamide 750 mg/m2, carboplatin 300 mg/m2). Recently, she noted a palpable right breast mass, 15 cm in size which nearly occupied the whole right breast in 2 months. Core needle biopsy revealed metaplastic carcinoma. Neoadjuvant chemotherapy with the regimens of Taxotere (75 mg/m2), Epirubicin (75 mg/m2), and Cyclophosphamide (500 mg/m2) was given for 6 cycles with poor response, followed by a modified radical mastectomy (MRM) with dissection of axillary lymph nodes and skin grafting. Postoperatively, radiotherapy was done with 5000 cGy in 25 fractions. The histopathologic examination revealed a metaplastic carcinoma with squamous differentiation associated with adenomyoepithelioma. Immunohistochemistry study showed that the tumor cells are positive for epithelial markers-cytokeratin (AE1/AE3) stain, and myoepithelial markers, including cytokeratin 5/6 (CK 5/6), p63, and S100 stains. Expressions of hormone receptors, including ER, PR, and Her-2/Neu, were all negative. The dissected axillary lymph nodes showed metastastic carcinoma with negative hormone receptors in 3 nodes. The patient was staged as pT3N1aM0, with histologic tumor grade III.'''

# onco_2: a shorter note covering imaging, pathology and a later metastasis.
onco_2 = '''She underwent a computed tomography (CT) scan of the abdomen and pelvis, which showed a complex ovarian mass. A Pap smear performed one month later was positive for atypical glandular cells suspicious for adenocarcinoma. The pathologic specimen showed extension of the tumor throughout the fallopian tubes, appendix, omentum, and 5 out of 5 enlarged lymph nodes. The final pathologic diagnosis of the tumor was stage IIIC papillary serous ovarian adenocarcinoma. Two months later, the patient was diagnosed with lung metastases.'''

# onco_3: two paragraphs (flow cytometry, then tumor markers) separated by a blank line inside the string.
onco_3 = '''In the bone- marrow (BM) aspiration, blasts accounted for 88.1% of ANCs, which were positive for CD9, CD10, CD13, CD19, CD20, CD34, CD38, CD58, CD66c, CD123, HLA-DR, cCD79a, and TdT on flow cytometry.

Measurements of serum tumor markers showed elevated level of cytokeratin 19 fragment (Cyfra21-1: 4.77 ng/mL), neuron-specific enolase (NSE: 19.60 ng/mL), and squamous cell carcinoma antigen (SCCA: 2.58 ng/mL). The results were negative for serum carbohydrate antigen 125 (CA125), carcinoembryonic antigen (CEA) and vascular endothelial growth factor (VEGF). Immunohistochemical staining showed positive staining for CK5/6, P40 and PD-L1 (+ 80% tumor cells), and negative staining for TTF-1, PD-1 and weakly positive staining for ALK. Molecular analysis indicated no EGFR mutation or ROS1 fusion.'''

# spaCy

**Some resources:**
* [spaCy website](https://spacy.io/)
* [medspaCy GitHub](https://github.com/medspacy/medspacy)

In [None]:
!python -m spacy download en_core_web_trf
import spacy

# roBERTa
roberta_nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]) # Disable everything except NER
# spacy.prefer_gpu()

# Process data
doc = roberta_nlp(onco_1)

# Predict
preds = [{'text': word.text, 'entity_detected': word.label_} for word in doc.ents]
preds_df = pd.DataFrame(preds)

In [6]:
from spacy import displacy

In [7]:
# Render the entities found in `doc` as highlighted text inside the notebook
displacy.render(doc, jupyter = True, style = 'ent')

In [None]:
!pip install medspacy
!pip install --no-deps https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl
!pip install spacy-transformers

In [21]:
import medspacy
# Load the en_core_med7_trf model through medspacy's loader
nlp = medspacy.load("en_core_med7_trf")

In [22]:
nlp.pipe_names

['transformer',
 'ner',
 'medspacy_pyrush',
 'medspacy_target_matcher',
 'medspacy_context']

In [23]:
# Pull out the NER component so its label set can be inspected
ner = nlp.get_pipe("ner")

In [24]:
ner.labels

('DOSAGE', 'DRUG', 'DURATION', 'FORM', 'FREQUENCY', 'ROUTE', 'STRENGTH')

In [25]:
# Run the med7 pipeline on the first oncology note.
# NOTE(review): this rebinds `doc`, which the spaCy RoBERTa cell also assigns;
# re-running an earlier cell that reads `doc` will now see this result instead.
doc = nlp(onco_1)

In [26]:
doc.ents

(chemotherapy,
 cyclophosphamide,
 750 mg/m2,
 carboplatin,
 300 mg/m2,
 chemotherapy,
 Taxotere,
 75 mg/m2,
 Epirubicin,
 75 mg/m2,
 Cyclophosphamide,
 500 mg/m2,
 for 6 cycles)

In [27]:
# visualize_ent is not a builtin and no visible cell imports it, so bring it
# into scope here; it lives in medspacy's visualization module.
from medspacy.visualization import visualize_ent

# Render the entities of the med7 `doc` inline
visualize_ent(doc)

# Flair
The code below is compatible with a vanilla Colab runtime. It should also work with Databricks Runtime >= 12.0, but it does not work with 11.3 LTS :(

**Some resources:**
* [Flair's GitHub](https://github.com/flairNLP/flair)
* [NER with Flair tutorial](https://flairnlp.github.io/docs/tutorial-basics/tagging-entities)
* [Paper with HunFlair results](https://academic.oup.com/bioinformatics/article/37/17/2792/6122692), showing that it beats or is competitive with state-of-the-art (SOTA) methods such as SciBERT and BioBERT

In [None]:
!pip install flair
from flair.nn import Classifier
from flair.data import Sentence
from flair.splitter import SegtokSentenceSplitter

# load general NER tagger (large)
# NOTE(review): flair_lg is not referenced by any of the cells visible below -- confirm it is still needed
flair_lg = Classifier.load('ner-ontonotes-large')

# BioNER -- this is the tagger run_NER falls back to when none is passed
flair_bio = Classifier.load('bioner')

In [74]:
# -------------------------------
# Custom NER functions
# -------------------------------

def split_sentences(text):
  '''
  Split each input document into sentences with Flair's SegtokSentenceSplitter.

  Attributes:
    text: list (or other iterable) of document strings. A single string is treated as one document instead of being looped over character by character.

  Returns:
    pandas DataFrame with one row per document and the columns id (position of the document in the input), input_text (the original string) and sentences (whatever the splitter returned for that document). Empty input gives an empty DataFrame with the same columns.
  '''
  columns = ['id', 'input_text', 'sentences']

  # Iterating a bare string would yield one "document" per character
  if isinstance(text, str):
    text = [text]

  documents = list(text)

  if not documents:
    return pd.DataFrame(columns = columns)

  # The splitter does not depend on the document, so build it once
  splitter = SegtokSentenceSplitter()

  split_list = [{'id': i, 'input_text': ele, 'sentences': splitter.split(ele)}
                for i, ele in enumerate(documents)]

  return pd.DataFrame(split_list, columns = columns)

def run_NER(df, tagger = None, label_type = None):
  '''
  Run a Flair tagger over pre-split sentences and collect the predicted labels.

  Attributes:
    df: pandas DataFrame with a unique identifier column (id), and a list-column with Flair sentences corresponding to the id (specifically designed to take the output of split_sentences)
    tagger: object with a predict(sentences) method. When None, the module-level flair_bio tagger is used; it is looked up at call time so this function can be defined before the model is loaded.
    label_type: if given, only labels of this type are returned (passed to sentence.get_labels); if None, every label on the sentence is returned.

  Returns:
    pandas DataFrame with one row per label and the columns id, text, entity_detected and score. The columns are present even when nothing was detected.
  '''
  if tagger is None:
    tagger = flair_bio

  columns = ['id', 'text', 'entity_detected', 'score']
  ner_results = []

  for index, row in df.iterrows():

    sentences = row['sentences']

    # predict tags for sentences; the labels are read back from the same sentence objects below
    tagger.predict(sentences)

    for sentence in sentences:

      # Honor the requested label type instead of a hard-coded 'ner';
      # with no label_type, return all labels (e.g. when running the bio model)
      if label_type is not None:
        labels = sentence.get_labels(label_type)
      else:
        labels = sentence.get_labels()

      ner_results += [{'id': row['id'], 'text': x.data_point.text, 'entity_detected': x.value, 'score': x.score} for x in labels]

  return pd.DataFrame(ner_results, columns = columns)

In [75]:
# Split the three sample notes into sentences, then tag them
split_df = split_sentences([onco_1, onco_2, onco_3])
ner_results = run_NER(split_df) # run with default hunflair tagger

In [77]:
# Absolute Drive path, like every other Drive read/write in this notebook, so the
# save does not depend on the kernel's current working directory.
ner_results.to_csv('/content/drive/MyDrive/GitHub/NERding-out/data/JSLonco_text_Flair_preds.csv')

# Text analytics for health

In [None]:
!pip install azure-ai-textanalytics==5.2.0
import yaml
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

# Read the Azure key and endpoint from the YAML file kept on the mounted Drive
with open('/content/drive/MyDrive/GitHub/NERding-out/text_analytics_for_health.yaml', 'r') as stream:
    d = yaml.safe_load(stream)

# Both entries are looked up by name in the parsed YAML
key, endpoint = d['key'], d['endpoint']

# ---------------------------
# Authentication function
# ---------------------------

def authenticate_client():
    '''
    Build a TextAnalyticsClient from the module-level `key` and `endpoint`.

    Returns:
      TextAnalyticsClient using an AzureKeyCredential created from `key`
    '''
    return TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))


# ----------------
# NER function
# ----------------

def microsoft_health_ner(client, documents: list):
  '''
  Run Text Analytics for health on a list of documents and flatten the entities.

  Attributes:
    client: object exposing begin_analyze_healthcare_entities (e.g. the client returned by authenticate_client)
    documents: list of document strings to analyze

  Returns:
    pandas DataFrame with one row per entity and the columns doc_id, text, text_normalized, category, subcategory, offset and confidence_score. Documents flagged with is_error contribute no rows. The columns are present even when no entity was returned.
  '''
  columns = ['doc_id', 'text', 'text_normalized', 'category', 'subcategory', 'offset', 'confidence_score']

  # Run NER
  poller = client.begin_analyze_healthcare_entities(documents)
  result = poller.result()

  # Extract predictions
  tafh_preds = []

  # Enumerate the full result (not a pre-filtered list) so that doc_id keeps
  # pointing at the document's position even when an earlier document errored.
  # Assumes the service returns results in input order -- TODO confirm.
  for doc_id, doc in enumerate(result):

    if doc.is_error:
      continue

    tafh_preds += [{'doc_id': doc_id,
                  'text': x.text, 
                  'text_normalized': x.normalized_text, 
                  'category': x.category, 
                  'subcategory': x.subcategory, 
                  'offset': x.offset, 
                  'confidence_score': x.confidence_score} for x in doc.entities]

  return pd.DataFrame(tafh_preds, columns = columns)

In [73]:
# No visible cell ever creates `client`, so build it here from the loaded key/endpoint
client = authenticate_client()

microsoft_ner_results = microsoft_health_ner(client = client, documents = [onco_1, onco_2, onco_3])

# Derive the end offset of each entity from its start offset and text length
microsoft_ner_results['text_nchar'] = microsoft_ner_results['text'].str.len()
microsoft_ner_results['offset_end'] = microsoft_ner_results['offset'] + microsoft_ner_results['text_nchar'] 

microsoft_ner_results.to_csv('/content/drive/MyDrive/GitHub/NERding-out/data/JSLonco_text_Microsoft_text4health_preds.csv')

In [136]:
# Show only the entities predicted for the first document passed in (onco_1)
microsoft_ner_results.query('doc_id == 0')

Unnamed: 0,doc_id,text,text_normalized,category,subcategory,offset,confidence_score,text_nchar,offset_end
0,0,65-year-old,human old age (65+),Age,,2,1.00,11,13
1,0,woman,Woman,Gender,,14,1.00,5,19
2,0,debulking surgery,Tumor Debulking,TreatmentName,,37,0.99,17,54
3,0,bilateral,,Direction,,56,0.94,9,65
4,0,oophorectomy,Ovariectomy,TreatmentName,,66,0.98,12,78
...,...,...,...,...,...,...,...,...,...
65,0,nodes,Anatomic Node,BodyStructure,,1412,0.93,5,1417
66,0,staged,With staging,ExaminationName,,1435,0.79,6,1441
67,0,pT3N1aM0,,ConditionScale,,1445,0.68,8,1453
68,0,histologic tumor,,ExaminationName,,1460,0.79,16,1476


In [163]:
# ---------------------------------------
# Display the labels on the target text
# ---------------------------------------

# displacy's manual mode takes one dict per entity: start offset, end offset and label
ents = [{'start': row['offset'], 'end': row['offset_end'], 'label': row['category']}
        for i, row in microsoft_ner_results.query('doc_id == 0').iterrows()]

unique_ent_types = microsoft_ner_results.category.unique()

# Every entity type is drawn with the same gradient
colors = dict.fromkeys(unique_ent_types, "linear-gradient(90deg, #aa9cfc, #fc9ce7)")

options = {"ents": list(microsoft_ner_results.category), "colors": colors}

# Text plus pre-computed entities, in the shape displacy expects when manual = True
dic_ents = {"text": onco_1, "ents": ents, "title": None}

displacy.render(dic_ents, manual = True, style = "ent", jupyter = True, options = options)

# GPT-3.5

Code below is compatible with vanilla Colab runtime.

**Some resources:**
* https://github.com/openai/openai-python

In [None]:
!pip install openai
import time
import yaml
import openai

# Get API secret (make sure to open a paid acct or secret still won't work)
with open('/content/drive/MyDrive/GitHub/NERding-out/openai.yaml', 'r') as stream:
    d = yaml.safe_load(stream)

openai.api_key = d

# Instantiate connection to chatGPT
chatgpt = openai.ChatCompletion()

# ------------------------------------------------------
# Custom NER functions, based on work by Mo Guiga
# ------------------------------------------------------

def generate_messages(prompts = None, NER_task = False, entity = None, text = None):
  '''
  Build the list of chat messages to send to the model.

  Attributes:
    prompts: list of prompt strings, each sent as its own user message. Only used when NER_task is False. Optional, so NER calls can omit it; a single string is treated as one prompt.
    NER_task: when True, ignore prompts and build a fixed four-message extraction prompt from entity and text.
    entity: description of the entity to extract (NER_task only)
    text: the text to extract the entity from (NER_task only)

  Returns:
    list of dicts, each with 'role' set to 'user' and a 'content' string
  '''
  if NER_task:
    messages = [{'role': 'user', 'content': f'Your job is to extract the {entity} described in the provided text:'},
                {'role': 'user', 'content': text},
                {'role': 'user', 'content': f'Respond ONLY with the {entity}.'},
                {'role': 'user', 'content': f'If {entity} is not found, respond ONLY with None.'}]

  else:
    # A bare string would otherwise be split into one message per character
    if isinstance(prompts, str):
      prompts = [prompts]

    messages = [{'role': 'user', 'content': x} for x in (prompts or [])]

  return messages


def ping_gpt(messages, model = 'gpt-3.5-turbo'):
    '''
    Send a chat request and return the text of the first choice, retrying on failure.

    Attributes:
      messages: list of {'role', 'content'} dicts (e.g. the output of generate_messages)
      model: name of the chat model to query

    Returns:
      The reply text of the first choice, or None if all 5 attempts failed.
    '''
    max_attempts = 5

    # Fixed pause before every request (presumably to stay under the API rate limit -- confirm)
    time.sleep(4)

    # Wait between attempts; grows by one second after each failure
    seconds = 2

    for attempt in range(1, max_attempts + 1):
        try:
            return chatgpt.create(model = model, messages = messages).choices[0].message.content

        # Catch Exception rather than everything, so interrupting the kernel
        # (KeyboardInterrupt) still stops the cell instead of triggering a retry.
        except Exception as err:
            if attempt == max_attempts:
                # No point sleeping when there is no further attempt
                print(f'Attempt #{attempt} failed ({err!r}); giving up.')
                return None

            print(f'Retry #{attempt}, waiting for an additional {seconds} seconds... ({err!r})')
            time.sleep(seconds)
            seconds += 1

    return None

In [100]:
# Ask the model for the cancer stage mentioned in the first note
cancer_stg_prompt = generate_messages(NER_task = True, entity = "patient's cancer stage", text = onco_1)
s1_gpt_result = ping_gpt(cancer_stg_prompt)

In [101]:
s1_gpt_result

'pT3N1aM0'

In [102]:
# Same note, but the entity description now asks for the FIRST diagnosed stage
cancer_stg_prompt2 = generate_messages(NER_task = True, entity = "patient's FIRST diagnosed cancer stage", text = onco_1)
s1_gpt_result2 = ping_gpt(cancer_stg_prompt2)

In [103]:
# Display the reply to the "FIRST diagnosed cancer stage" prompt, which is stored in s1_gpt_result2
s1_gpt_result2

'Stage Ic.'

In [107]:
# Free-form prompt: the question and the note are sent as two separate user messages
time_prompt1 = generate_messages(prompts = ["when was the patient first diagnosed? Respond ONLY with the timeframe of the first diagnosis", onco_1])
s1_gpt_answer = ping_gpt(time_prompt1)

In [108]:
s1_gpt_answer

'The patient was first diagnosed with ovarian carcinoma (mucinous-type carcinoma, stage Ic) 1 year ago.'

In [122]:
# Instruction, note, then a stricter reminder, each sent as its own user message.
# The reminder is built from adjacent string literals: a backslash continuation
# inside the string would embed the next line's indentation in the prompt.
time_prompt2 = generate_messages(["For the following text, return ONLY the timeframe for the patient's first diagnosis:", onco_1,
                                  "Respond ONLY with the time at which the patient was first diagnosed. "
                                  "Do NOT include their diagnosis or any additional information."])

s1_gpt_answer2 = ping_gpt(time_prompt2)

In [123]:
s1_gpt_answer2

'1 year ago.'