# Intro

The competition home page: https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes

We use Spark NLP to train a NER model.
* https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb
* https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.3.prepare_CoNLL_from_annotations_for_NER.ipynb

Note: You need to set up your own licenses. Rerunning this notebook without them will not work.

# Import libraries

In [None]:
import os
import json
import pandas as pd
from kaggle_secrets import UserSecretsClient
import warnings
# Suppress warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Fetch each credential from the Kaggle secrets client (same order as the
# tuple below) and export all of them as environment variables.
user_secrets = UserSecretsClient()
license_keys = {
    secret_name: user_secrets.get_secret(secret_name)
    for secret_name in (
        'SECRET',
        'SPARK_NLP_LICENSE',
        "PUBLIC_VERSION",
        "JSL_VERSION",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
    )
}
os.environ.update(license_keys)

In [None]:
# Install PySpark (pinned to 3.1.2) and the public Spark NLP release named by
# PUBLIC_VERSION.
# NOTE(review): $PUBLIC_VERSION, $JSL_VERSION and $SECRET are presumably
# resolved from the environment variables exported by
# os.environ.update(license_keys) in the previous cell — confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The extra index URL embeds the licence secret, so avoid echoing this command.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [None]:
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp
from sparknlp_jsl.compatibility import Compatibility 
from sparknlp_display import NerVisualizer
from sparknlp.training import CoNLL
from sparknlp_jsl.training import tf_graph
from sparknlp.common import *
from tqdm import tqdm
from collections import Counter

# Start spark session

In [None]:
# Spark settings handed to sparknlp_jsl.start(): driver memory, Kryo buffer
# ceiling and the maximum size of results collected to the driver.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the licensed session; `spark` is used by the helper functions below.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Utilities

In [None]:
# Generic NER Function with LightPipeline
def get_light_model (embeddings, model_name = 'ner_clinical'):
    """Assemble a pretrained NER pipeline and wrap it in a LightPipeline.

    Args:
        embeddings: Name passed to ``WordEmbeddingsModel.pretrained``.
        model_name: Name passed to ``MedicalNerModel.pretrained``.

    Returns:
        A ``LightPipeline`` around the pipeline fitted on a one-row DataFrame
        whose ``text`` column holds an empty string.
    """
    # Raw text -> document -> sentences -> tokens.
    to_document = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    to_sentences = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )
    to_tokens = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    # Pretrained embeddings and NER model, both loaded from "clinical/models".
    embed_tokens = (
        WordEmbeddingsModel.pretrained(embeddings, "en", "clinical/models")
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
    )
    tag_tokens = (
        MedicalNerModel.pretrained(model_name, "en", "clinical/models")
        .setInputCols(["sentence", "token", "embeddings"])
        .setOutputCol("ner")
    )

    # Merge token-level tags into chunks exposed in the "ner_chunk" column.
    to_chunks = (
        NerConverter()
        .setInputCols(["sentence", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    stages = [to_document, to_sentences, to_tokens, embed_tokens, tag_tokens, to_chunks]

    # Every stage is already configured or pretrained, so fitting on a single
    # empty string is enough to obtain a PipelineModel.
    empty_input = spark.createDataFrame([[""]]).toDF("text")
    fitted_pipeline = Pipeline(stages=stages).fit(empty_input)

    return LightPipeline(fitted_pipeline)


# Get NER Results with fullAnnotate Method
def get_light_result (light_model, text, chunk_name="ner_chunk"):
    """Annotate ``text``, display the entities, and return them as a table.

    Args:
        light_model: Object exposing ``fullAnnotate(text)`` (e.g. the
            LightPipeline returned by ``get_light_model``).
        text: Text to annotate.
        chunk_name: Key of the chunk annotations in the ``fullAnnotate``
            result. It is used both for the returned table and as the label
            column of the visualisation.

    Returns:
        A pandas DataFrame with one row per chunk and the columns
        ``sentence_id``, ``begin``, ``end``, ``chunks`` and ``entities``.
    """
    light_result = light_model.fullAnnotate(text)

    # Only the first result is used, matching a single input text.
    first_result = light_result[0]
    annotations = first_result[chunk_name]

    pd_df = pd.DataFrame({
        'sentence_id': [a.metadata['sentence'] for a in annotations],
        'begin': [a.begin for a in annotations],
        'end': [a.end for a in annotations],
        'chunks': [a.result for a in annotations],
        'entities': [a.metadata['entity'] for a in annotations],
    })

    # Display the same column that was tabulated. The label column used to be
    # hard-coded to 'ner_chunk', which ignored a non-default chunk_name.
    visualiser = NerVisualizer()
    visualiser.display(first_result, label_col=chunk_name, document_col='document')

    return pd_df


# Convert to conll format
def make_conll(text:pd.DataFrame, entity:pd.DataFrame, 
               save_tag:bool=None, 
               save_conll:bool=None, 
               verbose:bool=None, 
               begin_deviation:int=0, 
               end_deviation:int=0 )->str:
    """Convert texts plus character-offset entity spans into CoNLL-style text.

    Each entity span is wrapped in ``<START_NER:tag>`` / ``</END_NER:tag>``
    marker tokens, the marked text is sentence-split and tokenized with a
    Spark NLP pipeline (using the module-level ``spark`` session), and the
    tokens are written out one per line with ``B-``/``I-``/``O`` labels.

    Args:
        text: DataFrame whose first two columns are the text id and the text.
        entity: DataFrame whose first five columns are text id, begin offset,
            end offset, chunk text and entity label.
        save_tag: If truthy, write the marked texts to
            ``text_with_ner_tag.csv``.
        save_conll: If truthy, write the header plus the CoNLL text to
            ``conll2003_text_file.conll``.
        verbose: If truthy, print per-text entity counts while tagging.
        begin_deviation: Offset added to every begin position.
        end_deviation: Offset added to every end position.

    Returns:
        The CoNLL body as a string (without the ``-DOCSTART-`` header, which
        is only written to the saved file).

    Raises:
        Exception: If the count of inserted tags differs from the count of
            entities for a text.
    """
    # Explicit copies: the frames are relabelled and modified below, and the
    # caller's DataFrames must not be touched.
    df_text = text.iloc[:, [0, 1]].copy()
    df_entity = entity.iloc[:, [0, 1, 2, 3, 4]].copy()
    df_text.columns = ['text_id','text']
    df_entity.columns = ['text_id','begin','end','chunk','entity']
    entity_list = list(df_entity.entity.unique())


    ########--------------1.tag transformation function------------########

    def transform_text(text, entities, verbose=None):
        """Insert START/END marker tokens around every entity span of one text."""
        tag_list = []
        # Rows arrive sorted by begin offset, descending (see apply_tag_ner),
        # so inserting markers never shifts offsets that are still to be used.
        # Access fields by name: integer keys on a label-indexed Series are
        # deprecated positional access in recent pandas.
        for row in entities.itertuples(index=False):
            begin = int(row.begin) + begin_deviation
            end = int(row.end) + end_deviation
            tag = row.entity
            # End marker first, so the begin offset is still valid afterwards.
            text = text[:end] + f' </END_NER:{tag}> ' + text[end:]
            text = text[:begin] + f' <START_NER:{tag}> ' + text[begin:]
            tag_list.append(tag)

        sum_of_added_entity = Counter(tag_list)
        sum_of_entity = Counter(entities['entity'].values)

        if verbose:
            print(f'Processed text id   : {entities.text_id.values[:1]}')
            print(f'Original Entities   : {sum_of_entity}\nAdded Entities      : {sum_of_added_entity}')
            print(f'Number Equality     : {sum_of_added_entity == sum_of_entity}')
            print("=="*40)

        if not sum_of_entity == sum_of_added_entity:
            print("There is a problem in text id:")
            print(entities.text_id.values[0])
            raise Exception("Check this text!")

        return text


    ######---------------2.apply_transform_text function ----------------#######

    def apply_tag_ner(df_text, df_entity, save=None, verbose=None):
        """Tag every text in ``df_text`` with its entities from ``df_entity``."""
        # Visit each text id once. Iterating the raw column would re-tag an
        # already tagged text (with the original offsets) whenever an id
        # appears on several rows.
        for text_id in tqdm(df_text.text_id.unique()):
            row_mask = df_text['text_id'] == text_id
            text = df_text.loc[row_mask, 'text'].values[0]
            entities = df_entity.loc[df_entity['text_id'] == text_id].sort_values(by='begin', ascending=False)

            df_text.loc[row_mask, 'text'] = transform_text(text, entities, verbose=verbose)

        if save:
            df_text.to_csv("text_with_ner_tag.csv", index=False, encoding='utf8')

        return df_text


    ##########----------------3.RUNNING TAG FUNCTION---------------#############
    
    print("Text tagging starting. Applying entities to whole text...\n")
    df = apply_tag_ner(df_text, df_entity, save=save_tag, verbose=verbose)


    ###########---------------4.Spark Pipeline-----------------------###########

    def spark_pipeline(df):
        """Sentence-split and tokenize the tagged texts; one row per sentence."""
        spark_df = spark.createDataFrame(df)

        documentAssembler = DocumentAssembler()\
            .setInputCol("text")\
            .setOutputCol("document")\
            .setCleanupMode("shrink")

        sentenceDetector = SentenceDetector()\
            .setInputCols(['document'])\
            .setOutputCol('sentences')\
            .setExplodeSentences(True)

        tokenizer = Tokenizer() \
            .setInputCols(["sentences"]) \
            .setOutputCol("token")

        nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer ])

        empty_df = spark.createDataFrame([['']]).toDF("text")
        pipelineModel = nlpPipeline.fit(empty_df)

        result = pipelineModel.transform(spark_df.select(['text']))

        # 'token.result' is collected as a pandas column named 'result',
        # holding the list of token strings of each sentence.
        return result.select('token.result').toPandas()

    print("\n\nSpark pipeline is running...")
    df_final = spark_pipeline(df)


    #########--------------5.CoNLL Function--------------------#############

    def build_conll(df_final, tag_list, save=None):
        """Turn tokenized sentences with marker tokens into CoNLL lines."""
        header = "-DOCSTART- -X- -X- O\n\n"
        known_tags = set(tag_list)
        parts = []     # output pieces, joined once at the end (avoids O(n^2) +=)
        chunks = []    # tokens collected inside the currently open entity
        tag = 'O'      # label of the currently open entity, 'O' when none

        # `tag` and `chunks` deliberately persist across sentences, as an
        # entity opened in one sentence is only closed by its END marker.
        for sentence_tokens in tqdm(df_final.result):
            for token in sentence_tokens:
                if token.startswith("<START_NER:"):
                    # Everything after the first ':' minus the closing '>';
                    # maxsplit=1 keeps labels that themselves contain ':'.
                    tag = token.split(':', 1)[1][:-1]
                    if tag not in known_tags:
                        # Not a real marker: emit it as an ordinary token.
                        tag = 'O'
                        parts.append(f'{token} NN NN {tag}\n')
                    continue

                if token.startswith("</END_NER:") and tag != 'O':
                    # Flush the entity: first token B-, the rest I-.
                    for i, chunk in enumerate(chunks):
                        ct = 'B' if i == 0 else 'I'
                        parts.append(f'{chunk} NNP NNP {ct}-{tag}\n')

                    chunks = []
                    tag = 'O'
                    continue

                if tag != 'O':
                    chunks.append(token)
                else:
                    parts.append(f'{token} NN NN {tag}\n')

            # Blank line = sentence boundary.
            parts.append('\n')

        conll_text = ''.join(parts)

        if save:
            with open("conll2003_text_file.conll", "w+", encoding='utf8') as f:
                f.write(header)
                f.write(conll_text)

        print("\nDONE!")    
        return conll_text

        
    ########----------------6.RUNNING CONLL FUNCTION--------------------########

    print("Conll file is being created...\n")
    return build_conll(df_final, tag_list=entity_list, save=save_conll)

# Test pipeline with samples

In [None]:

# Sample text for a quick manual check of a pretrained pipeline.
# The literal (including its spacing) is kept exactly as written.
text ='''The patient was prescribed 1 capsule of Parol with meals . 
He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . 
It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'''

# Name of the pretrained embeddings to pass to get_light_model.
embeddings = 'embeddings_clinical'

# Name of the pretrained NER model to pass to get_light_model.
model_name = 'ner_posology'

# Uncomment the two lines below to run the check. Note that doing so increases memory usage.
#light_model = get_light_model (embeddings, model_name)
#get_light_result (light_model, text, chunk_name="ner_chunk")

# Process training data

Convert the CSV files to the CoNLL format accepted by Spark NLP.

In [None]:
# Load the patient notes, keeping only the note id and the note text.
notes_csv_columns = None  # placeholder removed below; see single read_csv call
del notes_csv_columns
dfNotes = pd.read_csv(
    "../input/nbme-score-clinical-patient-notes/patient_notes.csv",
    usecols=["pn_num", "pn_history"],
)
# Rename positionally to the generic names expected by make_conll.
dfNotes.columns = ['text_id', 'text']
dfNotes.head()

In [None]:
# Create an entity file
import ast  # cell-local import: parses the stringified Python lists in train.csv

dfFeatures = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv",
                        usecols=["feature_num","feature_text"])
dfTrain = pd.read_csv("../input/nbme-score-clinical-patient-notes/train.csv")

# Match on the feature_num *column* of both frames. DataFrame.join(on=...)
# matches the caller's column against the other frame's index, i.e. the row
# position of features.csv, which only coincides with feature_num if the
# features happen to be numbered 0..n-1 in row order.
dfEnt = dfTrain.merge(dfFeatures, on="feature_num", how="left")
dfEnt = dfEnt[["pn_num","location","annotation","feature_text"]].dropna()
dfEnt = dfEnt[dfEnt["annotation"]!="[]"]
# Drop rows whose location contains ';' (presumably multi-part spans, which a
# single begin/end pair cannot represent — confirm against the data).
dfEnt = dfEnt[~dfEnt.location.str.contains(";")]

# Parse the list literals properly instead of slicing off brackets and
# splitting on "', '", which breaks on annotations containing quotes.
dfEnt['location'] = dfEnt.location.apply(ast.literal_eval)
dfEnt['annotation'] = dfEnt.annotation.apply(ast.literal_eval)

# Keep rows where every annotation has exactly one location, then emit one
# row per (location, annotation) pair.
same_length = dfEnt.location.str.len() == dfEnt.annotation.str.len()
dfEnt = dfEnt[same_length].explode(["location","annotation"]).reset_index(drop=True)

# A location is "begin end" in character offsets.
dfEnt['begin'] = dfEnt.location.apply(lambda x: x.split(" ")[0])
dfEnt['end'] = dfEnt.location.apply(lambda x: x.split(" ")[1])
dfEnt = dfEnt[["pn_num","begin","end","annotation","feature_text"]]
dfEnt.columns = ['text_id','begin','end','chunk','entity']
dfEnt = dfEnt.astype({'begin': 'int32','end': 'int32'})

# Sample for debug. Using the entire training data will lead to OOM.
# Seeded so reruns produce the same CoNLL file, and capped so that fewer than
# 20 remaining rows do not raise.
dfEnt = dfEnt.sample(n=min(20, len(dfEnt)), random_state=0)

# Keep only the notes referenced by the sampled entities, one row per note.
# The previous join(on="text_id") compared text_id with dfEnt's row index
# rather than with dfEnt.text_id.
dfNotes = dfNotes.loc[dfNotes['text_id'].isin(dfEnt['text_id']), ['text_id','text']]
dfEnt.head()

In [None]:
# Build the CoNLL text from the notes and their entity spans;
# save_conll=True also writes it to conll2003_text_file.conll.
conll_text = make_conll(text=dfNotes, entity=dfEnt, save_conll=True)

In [None]:
# Load conll   
# Read the file written by make_conll(..., save_conll=True) into a Spark
# DataFrame; `data` is the training input for ner_pipeline.fit below.
data = CoNLL().readDataset(spark, "./conll2003_text_file.conll")
# Print the first rows as a quick sanity check.
data.show()

# Train

In [None]:
# Pretrained clinical word embeddings; their output column "embeddings" is
# one of the inputs of the NER trainer configured below.
clinical_embeddings = (
    WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

In [None]:
# List the parameters accepted by the "ner_dl" graph builder.
tf_graph.print_model_params("ner_dl")

# Build a graph file into ./medical_ner_graphs, the folder the NER trainer is
# pointed at via setGraphFolder('medical_ner_graphs').
# NOTE(review): ntags is hard-coded to 12; presumably it must cover the number
# of distinct labels in the training data, which varies with the random
# entity sample — confirm.
tf_graph.build(
    "ner_dl",
    build_params=dict(embeddings_dim=200, nchars=83, ntags=12, is_medical=1),
    model_location="./medical_ner_graphs",
    model_filename="auto",
)

In [None]:
# Configure the NER trainer. Inputs come from the CoNLL dataset ("sentence",
# "token", "label") and from clinical_embeddings ("embeddings").
nerTagger = (
    MedicalNerApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(30)
    .setBatchSize(64)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Training logs are written here and printed in the last cell.
    .setOutputLogsPath('ner_logs')
    # Folder populated by tf_graph.build above.
    .setGraphFolder('medical_ner_graphs')
    .setUseBestModel(True)
    .setEarlyStoppingCriterion(0.04)
    .setEarlyStoppingPatience(3)
    # With limited memory and a large CoNLL file, True trains batch by batch.
    .setEnableMemoryOptimizer(True)
)

# Embeddings first, then the trainer that consumes them.
ner_pipeline = Pipeline(stages=[clinical_embeddings, nerTagger])

In [None]:
%%time
# Fit the pipeline (embeddings stage, then the NER trainer) on the CoNLL
# dataset; the %%time cell magic reports how long the cell took.
ner_model = ner_pipeline.fit(data)

In [None]:
# stages[1] is the fitted counterpart of nerTagger (the second pipeline
# stage); show the label distribution it recorded during training.
ner_model.stages[1].getTrainingClassDistribution()

In [None]:
# Print the most recent training log from the folder configured with
# setOutputLogsPath('ner_logs').
log_dir = "ner_logs"
log_names = [name for name in os.listdir(log_dir)
             if os.path.isfile(os.path.join(log_dir, name))]
if not log_names:
    raise FileNotFoundError(f"No log files found in {log_dir!r}")

# os.listdir() returns entries in arbitrary order, so entry [0] may be a log
# left over from an earlier run; pick the most recently modified file instead.
log_file = max(log_names,
               key=lambda name: os.path.getmtime(os.path.join(log_dir, name)))

# errors="replace" keeps the display working even if the log contains bytes
# that are not valid UTF-8.
with open(os.path.join(log_dir, log_file), encoding="utf-8", errors="replace") as f:
    print(f.read())