# Overview of pipeline

The purpose of this notebook is to generate entity resolution results for a variety of combinations of Spark NLP for Healthcare models.

The algorithm will accept a dataframe of campaign urls and fund descriptions and will output a long-formatted dataframe with columns specifying each component of the algorithm.

---
### Requirements
- Google Colab Pro with high RAM runtime (Runtime tab -> Change runtime type)





# Import license keys

In [29]:
from google.colab import files
import json

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

Saving keys.json to keys (1).json


# Install and import dependencies

In [30]:
import os
import csv
import io
import pandas as pd
import numpy as np
import copy


secret = license_keys['SECRET']
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID'] = license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']
sparknlp_version = license_keys["PUBLIC_VERSION"]
jsl_version = license_keys["JSL_VERSION"]

print ('SparkNLP Version:', sparknlp_version)
print ('SparkNLP-JSL Version:', jsl_version)

# Install Java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==$sparknlp_version
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version --extra-index-url https://pypi.johnsnowlabs.com/$secret

os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['PATH'] = os.environ['JAVA_HOME'] + "/bin:" + os.environ['PATH']

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

import sparknlp
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

spark = sparknlp_jsl.start(secret)

SparkNLP Version: 2.6.5
SparkNLP-JSL Version: 2.7.2
openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Collecting spark-nlp==2.6.5
  Using cached https://files.pythonhosted.org/packages/c6/1d/9a2a7c17fc3b3aa78b3921167feed4911d5a055833fea390e7741bba0870/spark_nlp-2.6.5-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.5


Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/2.7.2-7ad44c2a1a61c48b6a74446b0a7cb6b97c58dba0
Requirement already up-to-date: spark-nlp-jsl==2.7.2 in /usr/local/lib/python3.6/dist-packages (2.7.2)


# Define pipeline elements

In [65]:
document_assembler = DocumentAssembler() \
  .setInputCol('text')\
  .setOutputCol('document')

sentence_detector = SentenceDetector() \
  .setInputCols(['document'])\
  .setOutputCol('sentence')

tokenizer = Tokenizer()\
  .setInputCols(['sentence']) \
  .setOutputCol('token')

#-------------------------------------------------------------------------------
#embeddings
#-------------------------------------------------------------------------------
 
word_embeddings_clinical = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("embeddings")
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
#NER
#-------------------------------------------------------------------------------

ner_clinical = NerDLModel.pretrained("ner_clinical", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_diseases = NerDLModel.pretrained("ner_diseases", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

# ner_jsl = NerDLModel.pretrained("ner_jsl", "en", "clinical/models") \
#   .setInputCols(["sentence", "token", "embeddings"]) \
#   .setOutputCol("ner")

ner_jsl = NerDLModel.pretrained("jsl_ner_wip_clinical", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_jsl_wip = NerDLModel.pretrained("jsl_ner_wip_clinical", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")
#-------------------------------------------------------------------------------

ner_converter_problem = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")\
  .setWhiteList(['PROBLEM'])

ner_converter_diseases = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")\
  .setWhiteList(['Disease'])

# ner_converter_diagnosis = NerConverter() \
#   .setInputCols(["sentence", "token", "ner"]) \
#   .setOutputCol("ner_chunk")\
#   .setWhiteList(['Diagnosis'])

wl = [
'Injury_or_Poisoning', 
'Direction', 
'Test', 
'Admission_Discharge', 
'Death_Entity', 
'Relationship_Status', 
'Duration', 
'Hyperlipidemia', 
'Respiration', 
'Birth_Entity', 
'Age', 
'Family_History_Header', 
'Labour_Delivery', 
'BMI', 
'Temperature', 
'Alcohol', 
'Kidney_Disease', 
'Oncological', 
'Medical_History_Header', 
'Cerebrovascular_Disease', 
'Oxygen_Therapy', 
'O2_Saturation', 
'Psychological_Condition', 
'Heart_Disease', 
'Employment', 
'Obesity', 
'Disease_Syndrome_Disorder', 
'Pregnancy', 
'ImagingFindings', 
'Procedure', 
'Medical_Device', 
'Race_Ethnicity', 
'Section_Header', 
'Drug', 
'Symptom', 
'Treatment', 
'Substance', 
'Route', 
'Blood_Pressure', 
'Diet', 
'External_body_part_or_region', 
'LDL', 
'VS_Finding', 
'Allergen', 
'EKG_Findings', 
'Imaging_Technique', 
'Triglycerides', 
'RelativeTime', 
'Gender', 
'Pulse', 
'Social_History_Header', 
'Substance_Quantity', 
'Diabetes', 
'Modifier', 
'Internal_organ_or_component', 
'Clinical_Dept', 
'Form', 
'Strength', 
'Fetus_NewBorn,'
'RelativeDate', 
'Height', 
'Test_Result', 
'Time', 
'Frequency', 
'Sexually_Active_or_Sexual_Orientation', 
'Weight', 
'Vaccine', 
'Vital_Signs_Header', 
'Communicable_Disease', 
'Dosage', 
'Hypertension', 
'HDL', 
'Overweight', 
'Total_Cholesterol', 
'Smoking', 
'Date']

wl = [
'Injury_or_Poisoning', 
'Hyperlipidemia', 
'Kidney_Disease', 
'Oncological', 
'Cerebrovascular_Disease', 
'Psychological_Condition', 
'Heart_Disease', 
'Obesity', 
'Disease_Syndrome_Disorder', 
'Blood_Pressure', 
'Diabetes', 
'Communicable_Disease', 
'Hypertension',
'Symptom' 
]

ner_converter_diagnosis = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")\
  .setWhiteList(wl)



chunk_embeddings = ChunkEmbeddings()\
    .setInputCols(["ner_chunk", "embeddings"])\
    .setOutputCol("chunk_embeddings")
 
icd10 = ChunkEntityResolverModel.pretrained("chunkresolve_icd10cm_clinical","en","clinical/models")\
	.setInputCols(["token","chunk_embeddings"])\
	.setOutputCol("entity")\
  .setDistanceFunction("EUCLIDEAN")
 
snomed = ChunkEntityResolverModel.pretrained("chunkresolve_snomed_findings_clinical","en","clinical/models")\
	.setInputCols(["token","chunk_embeddings"])\
	.setOutputCol("entity")\
  .setDistanceFunction("EUCLIDEAN")

#------------------------
# sentence embeddings
#-------------------------

c2doc = Chunk2Doc().setInputCols("ner_chunk").setOutputCol("ner_chunk_doc") 

sbiobert_embedder = BertSentenceEmbeddings\
  .pretrained("sbiobert_base_cased_mli",'en','clinical/models')\
  .setInputCols(["ner_chunk_doc"])\
  .setOutputCol("sbert_embeddings")

sbert_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_augmented","en", "clinical/models") \
  .setInputCols(["ner_chunk", "sbert_embeddings"]) \
  .setOutputCol("entity")\
  .setDistanceFunction("EUCLIDEAN")

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_diseases download started this may take some time.
Approximate size to download 13.7 MB
[OK!]
jsl_ner_wip_clinical download started this may take some time.
Approximate size to download 14.5 MB
[OK!]
jsl_ner_wip_clinical download started this may take some time.
Approximate size to download 14.5 MB
[OK!]
chunkresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 166.3 MB
[OK!]
chunkresolve_snomed_findings_clinical download started this may take some time.
Approximate size to download 162.6 MB
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icd10cm_augmented download started this may take some time.
Approximate size to download 1.2 GB
[OK!]


# Define functions

In [43]:
def RunER(ner_result, sent_embeddings, entity_resolution_model):
  '''
  input: spark dataframe of NER results
  output: pandas dataframe with nested results
  '''

  #initiate empty df
  empty_df = spark.createDataFrame([['']]).toDF('fund_description')

  #define pipeline components
  if entity_resolution_model == 'chunkresolve_icd10cm_clinical':
    pipe = Pipeline(stages=[chunk_embeddings, icd10])
  if entity_resolution_model == 'chunkresolve_snomed_findings_clinical':
    pipe = Pipeline(stages=[chunk_embeddings, snomed])
  if entity_resolution_model == 'sbiobertresolve_icd10cm_augmented':
    if sent_embeddings == 'sbiobert_base_cased_mli':
      pipe = Pipeline(stages=[ c2doc, sbiobert_embedder, sbert_resolver])
  
  #run entity resolution
  pipe_model = pipe.fit(empty_df)
  results = pipe_model.transform(ner_result)
  
  return results.toPandas()[['url','fund_description','ner_chunk','entity']]

In [44]:
def RemoveStopwords(chunks):
  results = []
  for string in chunks:
    stopwords = ['a', 'an', 'the', 'this', 'that', 'these', 'his', 'her', 'their']
    words = string.split()
    resultwords  = [word for word in words if word.lower() not in stopwords]
    r = ' '.join(resultwords)
    results.append(r)
  return results

In [45]:
def Concatenate_And(chunks):
  return ' and '.join(chunks)

In [46]:
def PrepareNewDoc(lofd, doc_reconstruction):
  
  #define results container
  r = []

  #concatenate ner terms with 'and'
  if doc_reconstruction == 'concat_and':
    for d in lofd:
      dic = copy.deepcopy(d)
      dic['chunks'] = []
      for chunk in d['ner_chunk']:
        dic['chunks'].append(chunk['result'])
        dic['chunks'] = RemoveStopwords(dic['chunks'])
        dic['new_doc'] = Concatenate_And(dic['chunks'])
      r.append(dic)
    return pd.DataFrame(r)
  
  #select sentences from original document with ner terms
  if doc_reconstruction == 'sent_with_ner':
    r = []
    for d in lofd:
      dic = copy.deepcopy(d)
      ner_chunk = dic['ner_chunk']
      sent_to_include = []
      for chunk in ner_chunk:
        sent_to_include.append(chunk['sentence'])
      dic['sent_to_include'] = sent_to_include
      r.append(dic)

    r2 = []
    for d in r:
      dic = copy.deepcopy(d)
      dic['new_doc'] = ' '.join([dic['sentence'][i]['result'] for i in dic['sent_to_include']])
      r2.append(dic)
    
    return pd.DataFrame(r2)


In [47]:
def GetChunksAndSentences(lofd):
  results = []
  for d in lofd:
    dic = {}
    dic['url'] = d['url']
    dic['fund_description'] = d['fund_description']
    dic['sentence'] = d['sentence']
    dic['num_sentences'] = len(d['sentence'])
    dic['ner_chunk'] = []
    for chunk in d['ner_chunk']:
      dic['ner_chunk'].append({'result': chunk['result'], 'sentence': int(chunk.metadata['sentence'])})
    results.append(dic)
  return results

In [48]:
def ReconstructDoc(doc_reconstruction, ner_result, ner_model, first_column):
  '''
  input: document reconstruction parameter, NER results, NER model
  output: NER results based with reconstructed input text
  '''

  #no reconstruction
  if doc_reconstruction == 'none':
    return ner_result
  
  #reconstruct document
  else:
    #clean results
    results_ner_1_df = ner_result.toPandas()
    results_ner_1_lofd = results_ner_1_df.to_dict('records') 
    results_ner_1_lofd_new = GetChunksAndSentences(results_ner_1_lofd)

    #new doc for second round of NER
    new_doc = PrepareNewDoc(results_ner_1_lofd_new, doc_reconstruction)

    #pandas df has some NaN values, convert to empty string
    new_doc = new_doc.fillna('')

    #run second NER
    new_doc = new_doc[['url','new_doc']]
    new_doc.columns = ['url','fund_description']
    ner_result_2 = RunNER(new_doc, ner_model, 'fund_description')

    return ner_result_2

In [49]:
def RunNER(feed, ner_model, first_column):
  '''
  input:
    1. pandas dataframe of feed data, col_1 = url, col_2 = fund_description
    2. string of ner model
    3. string of first column name
  output: pandas dataframe with nested results
  '''
  #initiate empty df
  empty_df = spark.createDataFrame([['']]).toDF(first_column)
  
  #load feed data into df
  df = spark.createDataFrame(feed)

  #define first entity recognition pipeline with appropriate column name
  document_assembler = DocumentAssembler() \
  .setInputCol(first_column)\
  .setOutputCol('document')
  
  
  if ner_model == 'ner_clinical':

    word_clinical_pipeline = Pipeline(stages=[
      document_assembler, 
      sentence_detector,
      tokenizer,
      word_embeddings_clinical,
      ner_clinical,
      ner_converter_problem])

    ner_clinical_model = word_clinical_pipeline.fit(empty_df)
    result_clinical = ner_clinical_model.transform(df)
    return result_clinical
  
  if ner_model == 'ner_diseases':

    word_diseases_pipeline = Pipeline(stages=[
      document_assembler, 
      sentence_detector,
      tokenizer,
      word_embeddings_clinical,
      ner_diseases,
      ner_converter_diseases])

    ner_diseases_model = word_diseases_pipeline.fit(empty_df)
    result_diseases = ner_diseases_model.transform(df)
    return result_diseases
  
  if ner_model == 'ner_jsl':

    word_jsl_pipeline = Pipeline(stages=[
      document_assembler, 
      sentence_detector,
      tokenizer,
      word_embeddings_clinical,
      ner_jsl,
      ner_converter_diagnosis])

    ner_jsl_model = word_jsl_pipeline.fit(empty_df)
    result_jsl = ner_jsl_model.transform(df)
    return result_jsl

In [50]:
def RunPipeline(feed, combos):
  '''
  input: list of strings specifying combination of model components
  output: spark nlp pipeline
  '''

  #extract model components
  ner_model = combos[0]
  doc_reconstruction = combos[1]
  sent_embeddings = combos[2]
  er_model = combos[3]

  #run NER
  ner_result = RunNER(feed, ner_model, 'fund_description')

  #reconstruct document
  ner_result_doc = ReconstructDoc(doc_reconstruction, ner_result, ner_model, 'fund_description')

  #run ER
  er = RunER(ner_result_doc, sent_embeddings, er_model)

  #add columns specifying pipeline components
  er['ner_model'] = [ner_model for x in er['url']]
  er['doc_reconstruction'] = [doc_reconstruction for x in er['url']]
  er['sent_embeddings'] = [sent_embeddings for x in er['url']]
  er['er_model'] = [er_model for x in er['url']]
  
  #return results
  return er


In [51]:
def GetCombos(combinations):
  '''
  input: pandas dataframe of model combinations
  output: list of lists of strings, each sub-list specifies a different 
          combination of model components
  '''

  #define results container
  r = []

  #create list of lists
  for i in range(len(combinations)):
    r.append(combinations.iloc[i].tolist())
    
  #return results
  return r

In [52]:
def Main(feed, combinations):
  '''
  input: 
    1. pandas dataframe of feed data, col_1 = url, col_2 = fund_description
    2. pandas dataframe of model combinations
  output: long-format pandas dataframes for NER and ER separately
  '''

  #define results container
  r = []

  #get list of lists of combinations of model components
  combos = GetCombos(combinations)

  #Run model for each combination of model components
  n = 1
  l_c = len(combos)
  for combo in combos:
    print('starting combo {0} of {1}'.format(n, l_c))
    model_output = RunPipeline(feed, combo)
    r.append(model_output)
    #increment counter for printed progress  
    n = n + 1
  
  #concatenate results
  output = pd.concat(r, ignore_index=True)

  #process results
  #split into ner and er
  ner = output.drop(['entity'], axis=1)
  er = output.drop(['ner_chunk'], axis=1)

  #explode dataframes
  ner = ner.explode('ner_chunk')
  er = er.explode('entity')

  #add new data fields
  ner['ner_term'] = [x['result'] if x is not np.nan else np.nan for x in ner['ner_chunk']]
  ner['sentence'] = [x.metadata['sentence'] if x is not np.nan else np.nan for x in ner['ner_chunk']]
  ner['start_char'] = [x['begin'] if x is not np.nan else np.nan for x in ner['ner_chunk']]
  ner['end_char'] = [x['end'] if x is not np.nan else np.nan for x in ner['ner_chunk']]
  ner = ner.drop(['ner_chunk'], axis=1)
  #ner = ner.drop(['fund_description'], axis=1)

  er['target_text'] = [x.metadata['target_text'] if x is not np.nan else np.nan for x in er['entity']]
  er['sentence'] = [x.metadata['sentence'] if x is not np.nan else np.nan for x in er['entity']]
  er['start_char'] = [x['begin'] if x is not np.nan else np.nan for x in er['entity']]
  er['end_char'] = [x['end'] if x is not np.nan else np.nan for x in er['entity']]
  er['resolved_text'] = [x.metadata['resolved_text'] if x is not np.nan else np.nan for x in er['entity']]
  er['entity_code'] = [x['result'] if x is not np.nan else np.nan for x in er['entity']]
  er['confidence'] = [x.metadata['confidence'] if x is not np.nan else np.nan for x in er['entity']]
  er['distance'] = [x.metadata['distance'] if x is not np.nan else np.nan for x in er['entity']]
  er = er.drop(['entity'], axis=1)
  #er = er.drop(['fund_description'], axis=1)

  #return results
  return ner, er

# Import data

Feed Data

In [14]:
uploaded = files.upload()
round3 = pd.read_csv(io.BytesIO(uploaded['example_101-150.csv']))

Saving example_101-150.csv to example_101-150.csv


In [15]:
uploaded = files.upload()
round4 = pd.read_csv(io.BytesIO(uploaded['example_151-200.csv']))

Saving example_151-200.csv to example_151-200.csv


In [17]:
uploaded = files.upload()
round5 = pd.read_csv(io.BytesIO(uploaded['example_201-250.csv']))

Saving example_201-250.csv to example_201-250.csv


In [18]:
uploaded = files.upload()
round6 = pd.read_csv(io.BytesIO(uploaded['example_251-300.csv']))

Saving example_251-300.csv to example_251-300.csv


In [55]:
merge = pd.concat([round3, round4, round5, round6])

Model combinations

In [20]:
uploaded = files.upload()
combinations = pd.read_csv(io.BytesIO(uploaded['model_combinations.csv']))

Saving model_combinations.csv to model_combinations.csv


In [60]:
combinations.iloc[[9]]

Unnamed: 0,ner_model,doc_reconstruction,embeddings,entity_resolution_model
9,ner_jsl,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented


# Execute functions

In [66]:
ner, er = Main(merge, combinations.iloc[[9]])

starting combo 1 of 1


# Export data

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [67]:
er.to_csv('/content/drive/My Drive/Crowdfunding/er_3-6_new_jsl_3.csv', index=False)
#ner.to_csv('/content/drive/My Drive/Crowdfunding/ner_3-6_new_jsl.csv', index=False)

# Model evaluation

In [None]:
er['score'] = [1 if x is not np.nan else 0 for x in er['target_text']]

In [None]:
def ModelEval(er, combinations):

  combos = GetCombos(combinations)

  input_docs = []
  ner_terms_count = []
  docs_with_no_ner = []
  ner_terms_per_doc_mean = []
  ner_terms_per_doc_sd = []
  er_confidence_mean = []
  er_distance_mean = []

  for combo in combos:
    df = er.loc[(er['ner_model'] == combo[0]) & (er['doc_reconstruction'] == combo[1]) & (er['sent_embeddings'] == combo[2]) & (er['er_model'] == combo[3]),]
    
    #number of input dociments
    input_docs.append(50)

    #number of NAs for NER terms
    num_na = df['target_text'].isna().sum()

    #total number of NER terms
    ner_terms_count.append(df.shape[0] - num_na)
    
    #NER terms by document
    ner_by_url = df[['url','score']].groupby('url').agg('sum')
    docs_with_no_ner.append(pd.Series(ner_by_url['score'] == 0).sum())
    ner_terms_per_doc_mean.append(ner_by_url['score'].mean())
    ner_terms_per_doc_sd.append(ner_by_url['score'].std())
    
    #ER metrics
    er_confidence_mean.append(df['confidence'].astype(float).mean())
    er_distance_mean.append(df['distance'].astype(float).mean())
  
  r = pd.DataFrame({'input_docs':input_docs,
                    'ner_terms_count':ner_terms_count,
                    'docs_with_no_ner':docs_with_no_ner,
                    'ner_terms_per_doc_mean':ner_terms_per_doc_mean,
                    'ner_terms_per_doc_sd':ner_terms_per_doc_sd,
                    'er_confidence_mean':er_confidence_mean,
                    'er_distance_mean':er_distance_mean})
  return pd.concat([combinations, r], axis=1)
  

In [None]:
ModelEval(er, combinations)

Unnamed: 0,ner_model,doc_reconstruction,embeddings,entity_resolution_model,input_docs,ner_terms_count,docs_with_no_ner,ner_terms_per_doc_mean,ner_terms_per_doc_sd,er_confidence_mean,er_distance_mean
0,ner_clinical,none,none,chunkresolve_icd10cm_clinical,50,437,3,8.74,10.86618,0.062511,1.01556
1,ner_clinical,concat_and,none,chunkresolve_icd10cm_clinical,50,413,3,8.26,10.538191,0.065614,0.963053
2,ner_clinical,sent_with_ner,none,chunkresolve_icd10cm_clinical,50,923,3,18.46,31.883212,0.063406,0.972786
3,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,437,3,8.74,10.86618,0.375657,6.381022
4,ner_clinical,concat_and,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,413,3,8.26,10.538191,0.436572,5.756073
5,ner_clinical,sent_with_ner,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,923,3,18.46,31.883212,0.404267,6.015442
6,ner_jsl,none,none,chunkresolve_icd10cm_clinical,50,113,16,2.26,3.983691,0.066368,0.580966
7,ner_jsl,concat_and,none,chunkresolve_icd10cm_clinical,50,103,16,2.06,3.924855,0.066749,0.566989
8,ner_jsl,sent_with_ner,none,chunkresolve_icd10cm_clinical,50,204,16,4.08,10.703499,0.066055,0.596892
9,ner_jsl,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,113,16,2.26,3.983691,0.724112,3.072292


Why do some sent_with_ner have like double the numnber of ner terms?