![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1uGVoZWqx_7O2dP4ntFMLVT2aeL0luj98)

# **Interview Task**
[Running a Spark NLP Healthcare Pipeline and Training a Custom NER Model](https://docs.google.com/document/d/1l_SpYGAlVGAEe9x-b8avgvKipCXetdap2ttc4UKreO4/edit?tab=t.0)  
## **Chapter-II CoNLL File Generation:**


## Setup and imports

In [1]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.4.1 spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.8/620.8 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import json
import os

import pandas as pd
from tqdm import tqdm
from collections import Counter

import sparknlp
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL

import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession


spark = sparknlp.start()

print ("Spark NLP Version :", sparknlp.version())

spark

Spark NLP Version : 5.5.0


## **Convert the predictions (NER tags) from Spark NLP models into the CoNLL format.**

In [10]:
# You can either upload the files manually or download them from the link
#!wget -q https://raw.githubusercontent.com/russell-ai/SparkNLP-CustomNER/refs/heads/main/ner_clinical_mtsamples_ner_results_for_conll.csv -O /content/ner_clinical_mtsamples_ner_results_for_conll.csv
#!wget -q https://raw.githubusercontent.com/russell-ai/SparkNLP-CustomNER/refs/heads/main/mtsamples_texts.csv -O /content/mtsamples_texts.csv

### **Entity File**
This dataframe should include at least five  below columns in order:
*   `['text_id','begin','end','chunk','entity']`

In [9]:
import pandas as pd
train_entities_df = pd.read_csv('/content/ner_clinical_mtsamples_ner_results_for_conll.csv')
train_entities_df= train_entities_df[["text_id", "begin", "end", "chunk", "entity"]]
train_entities_df.head()

Unnamed: 0,text_id,begin,end,chunk,entity
0,0,88,99,Mesothelioma,PROBLEM
1,0,118,129,Mesothelioma,PROBLEM
2,0,132,147,pleural effusion,PROBLEM
3,0,150,168,atrial fibrillation,PROBLEM
4,0,171,176,anemia,PROBLEM


### **Text File**
This dataframe should include at least two below columns in order:
*   `['text_id','text']`

In [11]:

train_text_df = pd.read_csv('/content/mtsamples_texts.csv')
train_text_df.head()

Unnamed: 0,text_id,text
0,0,Sample Type / Medical Specialty:\nHematology -...
1,1,Sample Type / Medical Specialty:\nHematology -...
2,2,Sample Type / Medical Specialty:\nHematology -...
3,3,Sample Type / Medical Specialty:\nHematology -...
4,4,Sample Type / Medical Specialty:\nHematology -...


## **CoNLL Builder**

In [12]:
def make_conll(text:pd.DataFrame, entity:pd.DataFrame,
               save_tag:bool=None,
               save_conll:bool=None,
               verbose:bool=None,
               begin_deviation:int=0,
               end_deviation:int=0 )->str:

    df_text = text.iloc[:,[0,1]]
    df_entity = entity.iloc[:,[0,1,2,3,4]]
    df_text.columns = ['text_id','text']
    df_entity.columns = ['text_id','begin','end','chunk','entity']
    entity_list = list(df_entity.entity.unique())


    ########--------------1.tag transformation function------------########

    def transform_text(text, entities, verbose=None):

        tag_list=[]
        for entity in entities.iterrows():

            begin = entity[1][1] + begin_deviation
            end = entity[1][2] + end_deviation
            chunk = entity[1][3]
            tag = entity[1][4]
            text = text[:end] + f' </END_NER:{tag}> ' + text[end:]
            text = text[:begin] + f' <START_NER:{tag}> ' + text[begin:]
            tag_list.append(tag)

        sum_of_added_entity = Counter(tag_list)
        sum_of_entity = Counter(entities['entity'].values)

        if verbose:
            print(f'Processed text id   : {entities.text_id.values[:1]}')
            print(f'Original Entities   : {sum_of_entity}\nAdded Entities      : {sum_of_added_entity}')
            print(f'Number Equality     : {sum_of_added_entity == sum_of_entity}')
            print("=="*40)

        if not sum_of_entity == sum_of_added_entity:
            print("There is a problem in text id:")
            print(entities.text_id.values[0])
            raise Exception("Check this text!")

        return text


    ######---------------2.apply_transform_text function ----------------#######

    def apply_tag_ner(df_text, df_entity, save=None, verbose=None):

        for text_id in tqdm(df_text.text_id):
            text  = df_text.loc[df_text['text_id']==text_id]['text'].values[0]
            entities  = df_entity.loc[(df_entity['text_id']==text_id)].sort_values(by='begin',ascending=False)

            df_text.loc[df_text['text_id']==text_id, 'text'] = transform_text(text, entities, verbose=verbose)

        if save:
            df_text.to_csv("text_with_ner_tag.csv", index=False, encoding='utf8')

        return df_text


    ##########----------------3.RUNNING TAG FUNCTION---------------#############

    print("Text tagging starting. Applying entities to whole text...\n")
    df = apply_tag_ner(df_text, df_entity, save=save_tag, verbose=verbose)


    ###########---------------4.Spark Pipeline-----------------------###########

    def spark_pipeline(df):
        spark_df = spark.createDataFrame(df)

        documentAssembler = DocumentAssembler()\
            .setInputCol("text")\
            .setOutputCol("document")\
            .setCleanupMode("shrink")

        sentenceDetector = SentenceDetector()\
            .setInputCols(['document'])\
            .setOutputCol('sentences')\
            .setExplodeSentences(True)

        tokenizer = Tokenizer() \
            .setInputCols(["sentences"]) \
            .setOutputCol("token")

        nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer ])

        empty_df = spark.createDataFrame([['']]).toDF("text")
        pipelineModel = nlpPipeline.fit(empty_df)

        result = pipelineModel.transform(spark_df.select(['text']))


        return result.select('token.result').toPandas()
    print("\n\nSpark pipeline is running...")
    df_final = spark_pipeline(df)


    #########--------------5.CoNLL Function--------------------#############

    def build_conll(df_final, tag_list, save=None):

        header = "-DOCSTART- -X- -X- O\n\n"
        conll_text = ""
        chunks = []
        tag_list = tag_list
        tag = 'O'      # token tag
        ct = 'B'       # chunk tag part B or I

        for sentence_tokens in tqdm(df_final.result[:]):
            for token in sentence_tokens:
                if token.startswith("<START_NER:"):
                    tag = token.split(':')[1][:-1]
                    if tag not in tag_list:
                        tag = 'O'
                        conll_text += f'{token} NN NN {tag}\n'

                    continue

                if token.startswith("</END_NER:") and tag != 'O':
                    for i, chunk in enumerate(chunks):
                        ct = 'B' if i == 0 else 'I'
                        conll_text += f'{chunk} NNP NNP {ct}-{tag}\n'

                    chunks=[]
                    tag='O'
                    continue

                if tag != 'O':
                    chunks.append(token)
                    continue

                if tag == 'O':
                    conll_text += f'{token} NN NN {tag}\n'
                    continue

            conll_text += '\n'

        if save:
            with open("conll2003_text_file.conll", "w+", encoding='utf8') as f:
                f.write(header)
                f.write(conll_text)

        print("\nDONE!")
        return conll_text


    ########----------------6.RUNNING CONLL FUNCTION--------------------########

    print("Conll file is being created...\n")
    return build_conll(df_final, tag_list=entity_list, save=save_conll)


### Running the CoNLL Builder

In [13]:
# if you want tagged text or conll file saved in the current directory: just make default 'save_tag' or 'save_conll' parameters True.
conll_text = make_conll(train_text_df, train_entities_df, save_conll=True)

Text tagging starting. Applying entities to whole text...



  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] + begin_deviation
  end = entity[1][2] + end_deviation
  chunk = entity[1][3]
  tag = entity[1][4]
  begin = entity[1][1] +



Spark pipeline is running...





Conll file is being created...



100%|██████████| 422/422 [00:00<00:00, 74254.15it/s]


DONE!





In [23]:
# Checking conll string
print(conll_text[:615])

Sample NN NN O
Type NN NN O
/ NN NN O
Medical NN NN O
Specialty NN NN O
: NN NN O
Hematology NN NN O
- NN NN O
Oncology NN NN O
Sample NN NN O
Name NN NN O
: NN NN O
Discharge NN NN O
Summary NN NN O
- NN NN O
Mesotheliom NNP NNP B-PROBLEM
a NN NN O
- NN NN O
1 NN NN O
Description NN NN O
: NN NN O
Mesotheliom NNP NNP B-PROBLEM
a NN NN O
, NN NN O
pleural NNP NNP B-PROBLEM
effusio NNP NNP I-PROBLEM
n NN NN O
, NN NN O
atrial NNP NNP B-PROBLEM
fibrillatio NNP NNP I-PROBLEM
n NN NN O
, NN NN O
anemi NNP NNP B-PROBLEM
a NN NN O
, NN NN O
ascite NNP NNP B-PROBLEM
s NN NN O
, NN NN O
esophageal NNP NNP B-PROBLEM



### Saving CoNLL File

In [None]:
# if save_conll parameter is True then the conll file already saved in the current directory
with open("conll2003_text_file.conll", "w+", encoding='utf8') as f:
    f.write("-DOCSTART- -X- -X- O\n\n")
    f.write(conll_text)

# Reading CoNLL File by Using CoNLL Reader

In [15]:
data = CoNLL().readDataset(spark, "/content/conll2003_text_file.conll")
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sample Type / Med...|[{document, 0, 26...|[{document, 0, 26...|[{token, 0, 5, Sa...|[{pos, 0, 5, NN, ...|[{named_entity, 0...|
|( Medical Transcr...|[{document, 0, 76...|[{document, 0, 76...|[{token, 0, 0, (,...|[{pos, 0, 0, NN, ...|[{named_entity, 0...|
|SECONDARY DIAGNOS...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 8, SE...|[{pos, 0, 8, NN, ...|[{named_entity, 0...|
|          PROCEDURES|[{document, 0, 9,...|[{document, 0, 9,...|[{token, 0, 9, PR...|[{pos, 0, 9, NN, ...|[{named_entity, 0...|
|1 . On August 24 ...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 0, 1,...|[{pos, 0, 0, NN, ..

In [24]:
from pyspark.sql import functions as F

data.select(F.explode(F.arrays_zip(data.token.result,
                                         data.label.result)).alias("cols")) \
          .select(F.expr("cols['0']").alias("token"),
                  F.expr("cols['1']").alias("ground_truth"))\
          .groupBy('ground_truth')\
          .count()\
          .orderBy('count', ascending=False)\
          .show(100,truncate=False)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |5532 |
|I-PROBLEM   |526  |
|B-PROBLEM   |400  |
|I-TREATMENT |329  |
|B-TREATMENT |262  |
|I-TEST      |165  |
|B-TEST      |140  |
+------------+-----+

