In [1]:
# call relevant packages
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import sparknlp

# Start the Spark NLP session.
# NOTE(review): start() is called with no arguments; per the Spark NLP docs a
# GPU session requires sparknlp.start(gpu=True) — confirm which is intended.
spark = sparknlp.start()

# Print library versions so the results below can be tied to them.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.1.0
Apache Spark version:  3.1.2


## Creating a composite CoNLL file 

### Create training set

In [2]:
# Read the onto.bn.ner training file as a list of raw lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.bn.ner") as fp:
    text = list(fp)

In [3]:
# Drop the first line, glue the rest back together and split on blank lines,
# giving one string per sentence.
# NOTE: `text` is rebound, so re-running this cell alone would slice the
# sentence list instead of the raw lines.
joined_body = "".join(text[1:])
text = joined_body.split("\n\n")

In [4]:
# Peek at the first ten sentence chunks (tab-separated fields, one token per line).
text[:10]

['This\tDT\t(TOP(S(NP*)\tO\nis\tVBZ\t(VP*\tO\nThe\tDT\t(NP(NP*\tB-ORG\nWorld\tNNP\t*)\tI-ORG\n,\t,\t*\tO\na\tDT\t(NP(NP*\tO\nco-production\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP(NP*\tB-ORG\nBBC\tNNP\t*\tI-ORG\nWorld\tNNP\t*\tI-ORG\nService\tNNP\t*)\tI-ORG\n,\t,\t*\tO\nPRI\tNNP\t(NP*)\tB-ORG\n,\t,\t*\tO\nand\tCC\t*\tO\nWGBH\tNNP\t(NP(NP*)\tB-ORG\nin\tIN\t(PP*\tO\nBoston\tNNP\t(NP*))))))))\tB-GPE\n.\t.\t*))\tO',
 'I\tPRP\t(TOP(S(NP*)\tO\nam\tVBP\t(VP*\tO\nLisa\tNNP\t(NP*\tB-PERSON\nMullins\tNNP\t*))\tI-PERSON\n.\t.\t*))\tO',
 "A\tDT\t(TOP(S(NP*\tO\nplan\tNN\t*)\tO\nwas\tVBD\t(VP*\tO\nannounced\tVBN\t(VP*\tO\ntoday\tNN\t(NP*)\tB-DATE\nto\tTO\t(S(VP*\tO\nraise\tVB\t(VP*\tO\nthe\tDT\t(NP(NP*\tO\nRussian\tJJ\t*\tB-NORP\nnuclear\tJJ\t*\tO\nsubmarine\tNN\t*)\tO\n`\t''\t(NP*\tO\nKursk\tNNP\t*\tB-PRODUCT\n'\t''\t*))\tO\n,\t,\t*\tO\nnext\tJJ\t(NP(NP*\tB-DATE\nsummer\tNN\t*)\tI-DATE\n,\t,\t*\tO\nalmost\tRB\t(SBAR(NP(QP*\tB-DATE\na\tDT\t*)\tI-DATE\nyear\tNN\t*)\tI-DATE\nafter\tIN\t*\tO\nit\tPRP\

In [5]:
import pandas as pd

# Preview a single sentence (index 1) as a table: one row per token line,
# one column per tab-separated field.
preview_rows = [token_line.split('\t') for token_line in text[1].split('\n')]
df = pd.DataFrame(preview_rows, columns=["Token","Pos","Pos_special","Entity_label"])

In [6]:
# Build the CoNLL-formatted training text from the sentence chunks in `text`.
# Pieces are gathered in a list and joined once, and the tab-separated fields
# are read directly instead of going through a per-sentence DataFrame.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [7]:
# Load the onto.mz.ner training file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.mz.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [8]:
# Append the sentences currently in `text` to the CoNLL text built so far.
# NOTE: this cell extends `conll_lines`; running it twice duplicates these sentences.
conll_parts = []
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [9]:
# Load the onto.nw.ner training file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.nw.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [10]:
# Append the sentences currently in `text` to the CoNLL text built so far.
# NOTE: this cell extends `conll_lines`; running it twice duplicates these sentences.
conll_parts = []
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [11]:
# Write the combined training CoNLL text in one call. Iterating over a str
# yields single characters, so the previous loop issued one write per character.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/sample.train","w") as fp:
    fp.write(conll_lines)

### Create dev set

In [14]:
# Read the onto.bn.ner development file as a list of raw lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.bn.ner") as fp:
    text = list(fp)

In [15]:
# Drop the first line, glue the rest back together and split on blank lines,
# giving one string per sentence.
# NOTE: `text` is rebound, so re-running this cell alone would slice the
# sentence list instead of the raw lines.
joined_body = "".join(text[1:])
text = joined_body.split("\n\n")

In [16]:
# Peek at the first ten development sentence chunks.
text[:10]

["The\tDT\t(TOP(S(NP(NP*\tO\nIsraeli\tJJ\t*\tB-NORP\nPrime\tNNP\t*\tO\nMinister\tNNP\t*)\tO\nEhud\tNNP\t(NP*\tB-PERSON\nBarak\tNNP\t*))\tI-PERSON\nis\tVBZ\t(VP*\tO\ndue\tJJ\t(ADJP*\tO\nto\tTO\t(S(VP*\tO\nmeet\tVB\t(VP*\tO\nwith\tIN\t(PP*\tO\nEgypt\tNNP\t(NP(NP*\tB-GPE\n's\tPOS\t*)\tO\nPresident\tNNP\t*\tO\nMubarak\tNNP\t*))\tB-PERSON\nin\tIN\t(PP*\tO\nCairo\tNNP\t(NP*))\tB-GPE\non\tIN\t(PP*\tO\nThursday\tNNP\t(NP*)))))))\tB-DATE\n.\t.\t*))\tO",
 "The\tDT\t(TOP(S(NP*\tO\nencounter\tNN\t*)\tO\nis\tVBZ\t(VP*\tO\nbeing\tVBG\t(VP*\tO\nseen\tVBN\t(VP*\tO\nas\tIN\t(PP*\tO\nan\tDT\t(NP(NP*\tO\neffort\tNN\t*)\tO\nby\tIN\t(PP*\tO\nBarak\tNNP\t(NP*))\tB-PERSON\nto\tTO\t(S(VP*\tO\ndrum\tVB\t(VP*\tO\nup\tRP\t(PRT*)\tO\nregional\tJJ\t(NP(NP*\tO\nsupport\tNN\t*)\tO\nfor\tIN\t(PP*\tO\na\tDT\t(NP(NP*\tO\nMiddle\tNNP\t(NML*\tB-LOC\nEast\tNNP\t*)\tI-LOC\npeace\tNN\t*\tO\nagreement\tNN\t*)\tO\nbased\tVBN\t(VP*\tO\non\tIN\t(PP*\tO\nPresident\tNNP\t(NP(NP*\tO\nClinton\tNNP\t*\tB-PERSON\n's\tPOS\t*)\tO\nprop

In [17]:
import pandas as pd

# Preview a single development sentence (index 1) as a table: one row per
# token line, one column per tab-separated field.
preview_rows = [token_line.split('\t') for token_line in text[1].split('\n')]
df = pd.DataFrame(preview_rows, columns=["Token","Pos","Pos_special","Entity_label"])

In [18]:
# Build the CoNLL-formatted development text from the sentence chunks in `text`.
# Pieces are gathered in a list and joined once, and the tab-separated fields
# are read directly instead of going through a per-sentence DataFrame.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [19]:
# Load the onto.mz.ner development file: skip its first line and split the
# rest into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.mz.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [20]:
# Append the sentences currently in `text` to the CoNLL text built so far.
# NOTE: this cell extends `conll_lines`; running it twice duplicates these sentences.
conll_parts = []
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [21]:
# Load the onto.nw.ner development file: skip its first line and split the
# rest into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.nw.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [22]:
# Append the sentences currently in `text` to the CoNLL text built so far.
# NOTE: this cell extends `conll_lines`; running it twice duplicates these sentences.
conll_parts = []
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [23]:
# Write the combined development CoNLL text in one call. Iterating over a str
# yields single characters, so the previous loop issued one write per character.
# The file keeps the name "sample.train" because a later cell reads this exact path.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/sample.train","w") as fp:
    fp.write(conll_lines)

### Create test set

In [25]:
# Read the onto.bn.ner test file as a list of raw lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.bn.ner") as fp:
    text = list(fp)

In [26]:
# Drop the first line, glue the rest back together and split on blank lines,
# giving one string per sentence.
# NOTE: `text` is rebound, so re-running this cell alone would slice the
# sentence list instead of the raw lines.
joined_body = "".join(text[1:])
text = joined_body.split("\n\n")

In [27]:
# Peek at the first ten test sentence chunks.
text[:10]

['Iraqi\tJJ\t(TOP(S(NP(NML*\tB-NORP\nleader\tNN\t*)\tO\nSaddam\tNNP\t*\tB-PERSON\nHussein\tNNP\t*)\tI-PERSON\nhas\tVBZ\t(VP*\tO\ngiven\tVBN\t(VP*\tO\na\tDT\t(NP*\tO\ndefiant\tJJ\t*\tO\nspeech\tNN\t*)\tO\nto\tTO\t(S(VP*\tO\nmark\tVB\t(VP*\tO\nthe\tDT\t(NP(NP*\tO\ntenth\tJJ\t*\tB-ORDINAL\nanniversary\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP*\tB-EVENT\nGulf\tNNP\t*\tI-EVENT\nWar\tNNP\t*))))))))\tI-EVENT\n.\t.\t*))\tO',
 'He\tPRP\t(TOP(S(NP*)\tO\nsays\tVBZ\t(VP*\tO\nIraq\tNNP\t(SBAR(S(NP*)\tB-GPE\nhas\tVBZ\t(VP*\tO\ntriumphed\tVBN\t(VP*\tO\nover\tIN\t(PP*\tO\nthe\tDT\t(NP(NP*\tO\nevil\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP*\tO\nWest\tNNP\t*)))))))))\tB-LOC\n.\t.\t*))\tO',
 'Barbara\tNNP\t(TOP(S(NP*\tB-PERSON\nPlett\tNNP\t*)\tI-PERSON\nreports\tVBZ\t(VP*\tO\nfrom\tIN\t(PP*\tO\nBaghdad\tNNP\t(NP*)))\tB-GPE\n.\t.\t*))\tO',
 'Saddam\tNNP\t(TOP(S(NP*\tB-PERSON\nHussein\tNNP\t*)\tI-PERSON\naddressed\tVBD\t(VP*\tO\nthe\tDT\t(NP*\tO\nnation\tNN\t*)\tO\nin\tIN\t(PP*\tO\na\tDT\t(NP(NP*\tO\nspeec

In [28]:
import pandas as pd

# Preview a single test sentence (index 1) as a table: one row per token
# line, one column per tab-separated field.
preview_rows = [token_line.split('\t') for token_line in text[1].split('\n')]
df = pd.DataFrame(preview_rows, columns=["Token","Pos","Pos_special","Entity_label"])

In [29]:
# Build the CoNLL-formatted test text from the sentence chunks in `text`.
# Pieces are gathered in a list and joined once, and the tab-separated fields
# are read directly instead of going through a per-sentence DataFrame.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [30]:
# Load the onto.mz.ner test file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.mz.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [31]:
# Append the sentences currently in `text` to the CoNLL text built so far.
# NOTE: this cell extends `conll_lines`; running it twice duplicates these sentences.
conll_parts = []
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [32]:
# Load the onto.nw.ner test file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.nw.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [33]:
# Append the sentences currently in `text` to the CoNLL text built so far.
# NOTE: this cell extends `conll_lines`; running it twice duplicates these sentences.
conll_parts = []
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [34]:
# Write the combined test CoNLL text in one call. Iterating over a str yields
# single characters, so the previous loop issued one write per character.
# The file keeps the name "sample.train" because a later cell reads this exact path.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

### Import data in CoNLL format

In [35]:
# Read the training CoNLL file into a Spark DataFrame and show the first rows.
from sparknlp.training import CoNLL

train_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/train/sample.train'
training_data = CoNLL().readDataset(spark, train_conll_path)
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|This is The World...|[{document, 0, 88...|[{document, 0, 88...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ...|[{named_entity, 0...|
| I am Lisa Mullins .|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 0, I,...|[{pos, 0, 0, PRP,...|[{named_entity, 0...|
|A plan was announ...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 0, A,...|[{pos, 0, 0, DT, ...|[{named_entity, 0...|
|The Kursk Foundat...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The BBC 's James ...|[{document, 0, 66...|[{document, 0, 66...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [36]:
# Read the development CoNLL file into a Spark DataFrame and show the first rows.
dev_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/development/sample.train'
dev_data = CoNLL().readDataset(spark, dev_conll_path)
dev_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|The Israeli Prime...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The encounter is ...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|They require Isra...|[{document, 0, 26...|[{document, 0, 26...|[{token, 0, 3, Th...|[{pos, 0, 3, PRP,...|[{named_entity, 0...|
|We have so many q...|[{document, 0, 84...|[{document, 0, 84...|[{token, 0, 1, We...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|The most importan...|[{document, 0, 40...|[{document, 0, 40...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [37]:
# Read the test CoNLL file into a Spark DataFrame and show the first rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
test_data = CoNLL().readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Iraqi leader Sadd...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 4, Ir...|[{pos, 0, 4, JJ, ...|[{named_entity, 0...|
|He says Iraq has ...|[{document, 0, 53...|[{document, 0, 53...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|Barbara Plett rep...|[{document, 0, 35...|[{document, 0, 35...|[{token, 0, 6, Ba...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|
|Saddam Hussein ad...|[{document, 0, 98...|[{document, 0, 98...|[{token, 0, 5, Sa...|[{pos, 0, 5, NNP,...|[{named_entity, 0...|
|`` Iraq has trium...|[{document, 0, 11...|[{document, 0, 11...|[{token, 0, 1, ``...|[{pos, 0, 1, ``, ..

### Training the model - 10 epochs

In [38]:
# Pretrained cased BERT embeddings, reading the "sentence" and "token" columns
# and writing a "bert" column.
# (.setMaxSentenceLength(512) was left disabled in the original cell.)
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(True)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [39]:
# Add BERT embeddings to the training set and store the result as parquet
# (any existing output at this path is overwritten).
readyTrainingData = bert.transform(training_data)
train_writer = readyTrainingData.write.mode("Overwrite")
train_writer.parquet("/tmp/conll2003/bert_train")

In [40]:
# Rebind to the parquet copy of the training embeddings.
bert_train_path = "/tmp/conll2003/bert_train"
readyTrainingData = spark.read.parquet(bert_train_path)

In [41]:
# Add BERT embeddings to the development set and store the result as parquet
# (any existing output at this path is overwritten).
readyDevData = bert.transform(dev_data)
dev_writer = readyDevData.write.mode("Overwrite")
dev_writer.parquet("/tmp/conll2003/bert_dev")

In [42]:
# Rebind to the parquet copy of the development embeddings.
bert_dev_path = "/tmp/conll2003/bert_dev"
readyDevData = spark.read.parquet(bert_dev_path)

In [43]:
# Add BERT embeddings to the test set and store the result as parquet
# (any existing output at this path is overwritten).
readyTestData = bert.transform(test_data)
test_writer = readyTestData.write.mode("Overwrite")
test_writer.parquet("/tmp/conll2003/bert_test")

In [44]:
# Rebind to the parquet copy of the test embeddings.
bert_test_path = "/tmp/conll2003/bert_test"
readyTestData = spark.read.parquet(bert_test_path)

### Dev set - news dataset

In [45]:
# Configure the NerDL trainer: it reads the sentence, token and BERT embedding
# columns, learns from "label" and writes predictions to "ner".
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(10)
    .setBatchSize(4)
    .setEnableMemoryOptimizer(True)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # parquet directory that holds the development-set embeddings
    .setTestDataset("/tmp/conll2003/bert_dev")
)

In [46]:
# Fit the NER tagger on the training embeddings; %time reports how long the fit takes.
%time myNerModel = nerTagger.fit(readyTrainingData)

CPU times: user 636 ms, sys: 369 ms, total: 1.01 s
Wall time: 2h 3min 54s


### Testing on News Dataset

In [48]:
# Run the trained model on the test embeddings, keep the sentence, token,
# gold-label and prediction columns, and collect the rows to the driver.
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 2.35 s, sys: 458 ms, total: 2.8 s
Wall time: 59.9 s


In [49]:
# Positions of rows whose number of gold labels differs from the number of
# predicted tags; such rows cannot be scored position by position.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [50]:
# Report how many rows are mismatched and where they are.
print(count, indices, sep="\n")

2
[146, 334]


In [51]:
# Set the mismatched rows aside, then keep only the aligned ones.
# NOTE: `results` is rebound here, so re-running this cell with a non-empty
# `indices` would drop further rows at the same positions.
excluded_positions = set(indices)
exclusion_list = [results[t] for t in indices]
results = [row for position, row in enumerate(results) if position not in excluded_positions]

In [52]:
# Reduce every annotation to its 'result' string, one inner list per sentence.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [53]:
from seqeval.metrics import accuracy_score, f1_score, classification_report

# Tag accuracy of the predictions against the gold labels
# (per the seqeval docs this is computed per token, not per entity).
tag_accuracy = accuracy_score(labels, ners)
print(tag_accuracy)

0.9772023141927532


In [54]:
# F1 of the predictions against the gold labels on the same aligned sequences.
entity_f1 = f1_score(labels, ners)
print(entity_f1)

0.894753105494098


In [55]:
# Per-entity-type precision / recall / F1 table.
# NOTE(review): zero_division=1 reports 1.0 wherever a ratio is undefined,
# which can raise the macro averages — confirm this is intended.
report_text = classification_report(labels, ners, zero_division=1)
print(report_text)

              precision    recall  f1-score   support

    CARDINAL       0.85      0.88      0.87       616
        DATE       0.84      0.90      0.87      1252
       EVENT       0.63      0.51      0.57        37
         FAC       0.64      0.45      0.53        66
         GPE       0.96      0.95      0.95      1664
    LANGUAGE       0.83      0.50      0.62        10
         LAW       0.69      0.56      0.62        36
         LOC       0.89      0.75      0.82       144
       MONEY       0.88      0.89      0.89       279
        NORP       0.96      0.95      0.95       579
     ORDINAL       0.81      0.94      0.87       118
         ORG       0.87      0.91      0.89      1494
     PERCENT       0.91      0.90      0.90       293
      PERSON       0.94      0.95      0.95      1099
     PRODUCT       0.75      0.73      0.74        71
    QUANTITY       0.67      0.68      0.67        59
        TIME       0.63      0.59      0.61       106
 WORK_OF_ART       0.71    

### Testing on Broadcast Conversations

In [87]:
# Load the onto.bc.ner test file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.bc.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [57]:
# Build the CoNLL-formatted test text from the sentence chunks in `text`.
# Pieces are gathered in a list and joined once, and the tab-separated fields
# are read directly instead of going through a per-sentence DataFrame.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [58]:
# Overwrite the test CoNLL file with this domain's text in one call.
# Iterating over a str yields single characters, so the previous loop issued
# one write per character.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

In [61]:
# Re-read the rewritten test CoNLL file into a Spark DataFrame and show the first rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
test_data = CoNLL().readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|-- basically , it...|[{document, 0, 78...|[{document, 0, 78...|[{token, 0, 1, --...|[{pos, 0, 1, :, {...|[{named_entity, 0...|
|To express its de...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 1, To...|[{pos, 0, 1, TO, ...|[{named_entity, 0...|
|It takes time to ...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 1, It...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|Dear viewers , th...|[{document, 0, 52...|[{document, 0, 52...|[{token, 0, 3, De...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|
|     This is Xu Li .|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ..

In [62]:
# Add BERT embeddings to the test set and store the result as parquet
# (any existing output at this path is overwritten).
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

# Read the stored copy back, as the news-test section does, so inference uses
# the persisted embeddings instead of re-running the lazy BERT transform.
readyTestData = spark.read.parquet("/tmp/conll2003/bert_test")

In [63]:
# Run the trained model on the test embeddings, keep the sentence, token,
# gold-label and prediction columns, and collect the rows to the driver.
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 1.61 s, sys: 1.16 s, total: 2.76 s
Wall time: 1min 10s


In [64]:
# Positions of rows whose number of gold labels differs from the number of
# predicted tags; such rows cannot be scored position by position.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [65]:
# Report how many rows are mismatched and where they are.
print(count, indices, sep="\n")

0
[]


In [66]:
# Set the mismatched rows aside, then keep only the aligned ones.
# NOTE: `results` is rebound here, so re-running this cell with a non-empty
# `indices` would drop further rows at the same positions.
excluded_positions = set(indices)
exclusion_list = [results[t] for t in indices]
results = [row for position, row in enumerate(results) if position not in excluded_positions]

In [67]:
# Reduce every annotation to its 'result' string, one inner list per sentence.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [68]:
from seqeval.metrics import accuracy_score, f1_score, classification_report

# Tag accuracy of the predictions against the gold labels
# (per the seqeval docs this is computed per token, not per entity).
tag_accuracy = accuracy_score(labels, ners)
print(tag_accuracy)

0.9764836247229747


In [69]:
# F1 of the predictions against the gold labels on the same aligned sequences.
entity_f1 = f1_score(labels, ners)
print(entity_f1)

0.7878430215402774


In [70]:
# Per-entity-type precision / recall / F1 table.
# NOTE(review): zero_division=1 reports 1.0 wherever a ratio is undefined,
# which can raise the macro averages — confirm this is intended.
report_text = classification_report(labels, ners, zero_division=1)
print(report_text)

              precision    recall  f1-score   support

    CARDINAL       0.69      0.71      0.70       182
        DATE       0.70      0.76      0.73       200
       EVENT       0.50      0.21      0.30        14
         FAC       0.95      0.85      0.90        48
         GPE       0.90      0.92      0.91       353
         LAW       1.00      0.00      0.00         3
         LOC       0.71      0.38      0.50        26
       MONEY       0.50      1.00      0.67         3
        NORP       0.63      0.65      0.64       138
     ORDINAL       0.88      0.86      0.87        50
         ORG       0.74      0.86      0.80       153
     PERCENT       1.00      1.00      1.00        14
      PERSON       0.92      0.90      0.91       382
    QUANTITY       0.26      0.20      0.23        40
        TIME       0.68      0.57      0.62        63
 WORK_OF_ART       0.30      0.21      0.25        28

   micro avg       0.79      0.79      0.79      1697
   macro avg       0.71   

### Testing on WebLogs

In [71]:
# Load the onto.wb.ner test file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.wb.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [72]:
# Build the CoNLL-formatted test text from the sentence chunks in `text`.
# Pieces are gathered in a list and joined once, and the tab-separated fields
# are read directly instead of going through a per-sentence DataFrame.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [73]:
# Overwrite the test CoNLL file with this domain's text in one call.
# Iterating over a str yields single characters, so the previous loop issued
# one write per character.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

In [74]:
# Re-read the rewritten test CoNLL file into a Spark DataFrame and show the first rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
test_data = CoNLL().readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|The success of al...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The Source of the...|[{document, 0, 23...|[{document, 0, 23...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The al - Jazeera ...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|In this film the ...|[{document, 0, 73...|[{document, 0, 73...|[{token, 0, 1, In...|[{pos, 0, 1, IN, ...|[{named_entity, 0...|
|The Hebrew channe...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [75]:
# Add BERT embeddings to the test set and store the result as parquet
# (any existing output at this path is overwritten).
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

# Read the stored copy back, as the news-test section does, so inference uses
# the persisted embeddings instead of re-running the lazy BERT transform.
readyTestData = spark.read.parquet("/tmp/conll2003/bert_test")

In [76]:
# Run the trained model on the test embeddings, keep the sentence, token,
# gold-label and prediction columns, and collect the rows to the driver.
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 392 ms, sys: 145 ms, total: 538 ms
Wall time: 32.6 s


In [77]:
# Positions of rows whose number of gold labels differs from the number of
# predicted tags; such rows cannot be scored position by position.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [78]:
# Report how many rows are mismatched and where they are.
print(count, indices, sep="\n")

0
[]


In [79]:
# Set the mismatched rows aside, then keep only the aligned ones.
# NOTE: `results` is rebound here, so re-running this cell with a non-empty
# `indices` would drop further rows at the same positions.
excluded_positions = set(indices)
exclusion_list = [results[t] for t in indices]
results = [row for position, row in enumerate(results) if position not in excluded_positions]

In [80]:
# Reduce every annotation to its 'result' string, one inner list per sentence.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [81]:
from seqeval.metrics import accuracy_score, f1_score, classification_report

# Tag accuracy of the predictions against the gold labels
# (per the seqeval docs this is computed per token, not per entity).
tag_accuracy = accuracy_score(labels, ners)
print(tag_accuracy)

0.9634732119292689


In [82]:
# F1 of the predictions against the gold labels on the same aligned sequences.
entity_f1 = f1_score(labels, ners)
print(entity_f1)

0.7562879444926279


In [83]:
# Per-entity-type precision / recall / F1 table.
# NOTE(review): zero_division=1 reports 1.0 wherever a ratio is undefined,
# which can raise the macro averages — confirm this is intended.
report_text = classification_report(labels, ners, zero_division=1)
print(report_text)

              precision    recall  f1-score   support

    CARDINAL       0.84      0.79      0.81        85
        DATE       0.52      0.68      0.59        74
       EVENT       0.62      0.42      0.50        12
         FAC       0.57      0.22      0.32        18
         GPE       0.90      0.94      0.92       173
    LANGUAGE       1.00      0.25      0.40         4
         LAW       0.50      1.00      0.67         1
         LOC       0.67      0.44      0.53         9
       MONEY       0.81      0.84      0.82        25
        NORP       0.90      0.86      0.88       107
     ORDINAL       0.81      0.72      0.76        18
         ORG       0.55      0.83      0.66       117
     PERCENT       0.61      0.61      0.61        33
      PERSON       0.86      0.76      0.81       407
     PRODUCT       0.00      0.00      0.00         1
    QUANTITY       0.33      0.33      0.33         6
        TIME       0.52      0.60      0.56        20
 WORK_OF_ART       0.26    

### Testing on Telephonic Conversations

In [88]:
# Load the onto.tc.ner test file: skip its first line and split the rest
# into sentence chunks on blank lines.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.tc.ner") as fp:
    body_lines = fp.readlines()[1:]
text = "".join(body_lines).split("\n\n")

In [89]:
# Build the CoNLL-formatted test text from the sentence chunks in `text`.
# Pieces are gathered in a list and joined once, and the tab-separated fields
# are read directly instead of going through a per-sentence DataFrame.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for row in sentence.split('\n'):
        fields = row.split('\t')
        # keep only rows with exactly four tab-separated fields
        if len(fields) == 4:
            token, pos, _third_field, label = fields
            # the POS tag is written twice; the third input field is not carried over
            conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # a blank line closes every chunk, even one with no usable rows
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [90]:
# Overwrite the test CoNLL file with this domain's text in one call.
# Iterating over a str yields single characters, so the previous loop issued
# one write per character.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

In [91]:
# Re-read the rewritten test CoNLL file into a Spark DataFrame and show the first rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
test_data = CoNLL().readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|But %um , guessed...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 2, Bu...|[{pos, 0, 2, CC, ...|[{named_entity, 0...|
|              What ?|[{document, 0, 5,...|[{document, 0, 5,...|[{token, 0, 3, Wh...|[{pos, 0, 3, WP, ...|[{named_entity, 0...|
|         %um The %um|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, %u...|[{pos, 0, 2, UH, ...|[{named_entity, 0...|
|             Again ?|[{document, 0, 6,...|[{document, 0, 6,...|[{token, 0, 4, Ag...|[{pos, 0, 4, RB, ...|[{named_entity, 0...|
|%um 118.91_120.82...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 2, %u...|[{pos, 0, 2, UH, ..

In [92]:
# Add BERT embeddings to the test set and store the result as parquet
# (any existing output at this path is overwritten).
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

# Read the stored copy back, as the news-test section does, so inference uses
# the persisted embeddings instead of re-running the lazy BERT transform.
readyTestData = spark.read.parquet("/tmp/conll2003/bert_test")

In [93]:
# Run the trained model on the test embeddings, keep the sentence, token,
# gold-label and prediction columns, and collect the rows to the driver.
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 346 ms, sys: 98 ms, total: 444 ms
Wall time: 31.2 s


In [94]:
# Positions of rows whose number of gold labels differs from the number of
# predicted tags; such rows cannot be scored position by position.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [95]:
# Report how many rows are mismatched and where they are.
print(count, indices, sep="\n")

0
[]


In [96]:
# Set the mismatched rows aside, then keep only the aligned ones.
# NOTE: `results` is rebound here, so re-running this cell with a non-empty
# `indices` would drop further rows at the same positions.
excluded_positions = set(indices)
exclusion_list = [results[t] for t in indices]
results = [row for position, row in enumerate(results) if position not in excluded_positions]

In [97]:
# Reduce every annotation to its 'result' string, one inner list per sentence.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [98]:
from seqeval.metrics import accuracy_score, f1_score, classification_report

# Tag accuracy of the predictions against the gold labels
# (per the seqeval docs this is computed per token, not per entity).
tag_accuracy = accuracy_score(labels, ners)
print(tag_accuracy)

0.9641946064139941


In [99]:
# F1 of the predictions against the gold labels on the same aligned sequences.
entity_f1 = f1_score(labels, ners)
print(entity_f1)

0.630841121495327


In [100]:
# Per-entity-type precision / recall / F1 table.
# NOTE(review): zero_division=1 reports 1.0 wherever a ratio is undefined
# (the table below shows recall 1.00 for rows with support 0), which can
# raise the macro averages — confirm this is intended.
report_text = classification_report(labels, ners, zero_division=1)
print(report_text)

              precision    recall  f1-score   support

    CARDINAL       0.53      0.46      0.49        52
        DATE       0.67      0.73      0.70        74
         FAC       0.00      0.00      0.00         3
         GPE       0.78      0.92      0.84        50
    LANGUAGE       1.00      0.50      0.67         8
         LAW       0.00      1.00      0.00         0
       MONEY       1.00      0.71      0.83         7
        NORP       0.81      1.00      0.89        17
     ORDINAL       0.80      0.89      0.84         9
         ORG       0.27      0.52      0.36        27
     PERCENT       0.05      0.50      0.09         6
      PERSON       0.74      0.86      0.80       100
     PRODUCT       0.50      0.25      0.33         4
    QUANTITY       0.00      1.00      0.00         0
        TIME       0.62      0.35      0.44        23
 WORK_OF_ART       0.00      1.00      0.00         0

   micro avg       0.57      0.71      0.63       380
   macro avg       0.49   