In [1]:
# call relevant packages
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import sparknlp

# Start a Spark session through Spark NLP.
# NOTE(review): the original comment said "to use GPU", but start() is called
# with no arguments here — confirm whether sparknlp.start(gpu=True) was intended.
spark = sparknlp.start()

# Report the library versions used for this run.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.1.0
Apache Spark version:  3.1.2


## Creating a composite CoNLL file 

### Create training set

In [2]:
# Read every line of the onto.bn.ner training file into a list.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.bn.ner") as source_file:
    text = list(source_file)

In [3]:
# Drop the first line, glue the rest back together, then split on blank lines
# so each list element holds one block of tab-separated token rows.
# `text` is overwritten, so re-running this cell on its own would re-process
# the already-split list instead of the raw lines.
remaining_lines = text[1:]
text = "".join(remaining_lines).split("\n\n")

In [4]:
# Preview the first ten blocks produced by the split.
text[:10]

['This\tDT\t(TOP(S(NP*)\tO\nis\tVBZ\t(VP*\tO\nThe\tDT\t(NP(NP*\tB-ORG\nWorld\tNNP\t*)\tI-ORG\n,\t,\t*\tO\na\tDT\t(NP(NP*\tO\nco-production\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP(NP*\tB-ORG\nBBC\tNNP\t*\tI-ORG\nWorld\tNNP\t*\tI-ORG\nService\tNNP\t*)\tI-ORG\n,\t,\t*\tO\nPRI\tNNP\t(NP*)\tB-ORG\n,\t,\t*\tO\nand\tCC\t*\tO\nWGBH\tNNP\t(NP(NP*)\tB-ORG\nin\tIN\t(PP*\tO\nBoston\tNNP\t(NP*))))))))\tB-GPE\n.\t.\t*))\tO',
 'I\tPRP\t(TOP(S(NP*)\tO\nam\tVBP\t(VP*\tO\nLisa\tNNP\t(NP*\tB-PERSON\nMullins\tNNP\t*))\tI-PERSON\n.\t.\t*))\tO',
 "A\tDT\t(TOP(S(NP*\tO\nplan\tNN\t*)\tO\nwas\tVBD\t(VP*\tO\nannounced\tVBN\t(VP*\tO\ntoday\tNN\t(NP*)\tB-DATE\nto\tTO\t(S(VP*\tO\nraise\tVB\t(VP*\tO\nthe\tDT\t(NP(NP*\tO\nRussian\tJJ\t*\tB-NORP\nnuclear\tJJ\t*\tO\nsubmarine\tNN\t*)\tO\n`\t''\t(NP*\tO\nKursk\tNNP\t*\tB-PRODUCT\n'\t''\t*))\tO\n,\t,\t*\tO\nnext\tJJ\t(NP(NP*\tB-DATE\nsummer\tNN\t*)\tI-DATE\n,\t,\t*\tO\nalmost\tRB\t(SBAR(NP(QP*\tB-DATE\na\tDT\t*)\tI-DATE\nyear\tNN\t*)\tI-DATE\nafter\tIN\t*\tO\nit\tPRP\

In [5]:
import pandas as pd

# Tabulate the block at index 1: one row per line, one column per
# tab-separated field.
sample_rows = [row.split('\t') for row in text[1].split('\n')]
df = pd.DataFrame(sample_rows, columns=["Token","Pos","Pos_special","Entity_label"])

In [6]:
# Build the CoNLL text for the training set, starting with the document header.
# Each block in `text` becomes one sentence: every line that splits into
# exactly four tab-separated fields is written as "token pos pos label", and
# any other line is skipped. The fields are unpacked directly instead of
# building a pandas DataFrame per sentence, and the pieces are joined once at
# the end; the resulting string is the same as before.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [7]:
# Load the onto.mz.ner training file, drop its first line, and split the rest
# on blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.mz.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [8]:
# Append the blocks currently held in `text` to the CoNLL text built so far.
# Every line that splits into exactly four tab-separated fields is written as
# "token pos pos label"; any other line is skipped. The fields are unpacked
# directly instead of building a pandas DataFrame per sentence, and the new
# pieces are joined once; the appended text is the same as before.
conll_parts = []
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [9]:
# Load the onto.nw.ner training file, drop its first line, and split the rest
# on blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.nw.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [10]:
# Append the blocks currently held in `text` to the CoNLL text built so far.
# Every line that splits into exactly four tab-separated fields is written as
# "token pos pos label"; any other line is skipped. The fields are unpacked
# directly instead of building a pandas DataFrame per sentence, and the new
# pieces are joined once; the appended text is the same as before.
conll_parts = []
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [11]:
# Write the assembled training CoNLL text to disk in a single call.
# `conll_lines` is one string, so iterating over it yields single characters;
# the previous loop therefore issued one write() per character.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/sample.train","w") as fp:
    fp.write(conll_lines)

### Create dev set

In [12]:
# Read every line of the onto.wb.ner development file into a list.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.wb.ner") as source_file:
    text = list(source_file)

In [13]:
# Drop the first line, glue the rest back together, then split on blank lines
# so each list element holds one block of tab-separated token rows.
# `text` is overwritten, so re-running this cell on its own would re-process
# the already-split list instead of the raw lines.
remaining_lines = text[1:]
text = "".join(remaining_lines).split("\n\n")

In [14]:
# Preview the first ten blocks produced by the split.
text[:10]

['Abd\tNNP\t(TOP(NP(NP*\tB-PERSON\nal\tNNP\t*\tI-PERSON\n-\tHYPH\t*\tI-PERSON\nBari\tNNP\t*\tI-PERSON\nAtwan\tNNP\t*)\tI-PERSON\n:\t:\t*\tO\n-LRB-\t-LRB-\t(NP*\tO\nPresident\tNNP\t*\tO\nSaddam\tNNP\t*\tB-PERSON\n-RRB-\t-RRB-\t*)\tO\nGreat\tJJ\t(ADJP*\tO\nin\tIN\t(PP*\tO\nHis\tPRP$\t(NP*\tO\nLife\tNN\t*)))\tO\n..\tNFP\t*\tO\nGreat\tJJ\t(ADJP*\tO\nin\tIN\t(PP*\tO\nHis\tPRP$\t(NP*\tO\nMartyrdom\tNN\t*)))))\tO',
 'Asad\tNNP\t(TOP(FRAG(NP*)))\tB-PERSON',
 '1/02/2007\tNN\t(TOP(NP*))\tO',
 'This\tDT\t(TOP(S(NP(NP*\tO\nunprecedented\tJJ\t*\tO\nArab\tJJ\t(ADJP*\tB-NORP\nand\tCC\t*\tO\ninternational\tJJ\t*)\tO\ninterest\tNN\t*)\tO\nin\tIN\t(PP*\tO\nthe\tDT\t(NP(NP(NP*\tO\nexecution\tNN\t*)\tO\nof\tIN\t(PP*\tO\nIraqi\tJJ\t(NP(NML*\tB-NORP\nPresident\tNNP\t*)\tO\nSaddam\tNNP\t*\tB-PERSON\nHussein\tNNP\t*)))\tI-PERSON\nand\tCC\t*\tO\nthe\tDT\t(NP(NP*\tO\nbarbaric\tJJ\t*\tO\nway\tNN\t*)\tO\nin\tIN\t(SBAR(WHPP*\tO\nwhich\tWDT\t(WHNP*))\tO\nit\tPRP\t(S(NP*)\tO\nwas\tVBD\t(VP*\tO\ncarried\tVBN\t(VP*\tO

In [15]:
import pandas as pd

# Tabulate the block at index 1: one row per line, one column per
# tab-separated field.
sample_rows = [row.split('\t') for row in text[1].split('\n')]
df = pd.DataFrame(sample_rows, columns=["Token","Pos","Pos_special","Entity_label"])

In [16]:
# Build the CoNLL text for the development set, starting with the document
# header. Each block in `text` becomes one sentence: every line that splits
# into exactly four tab-separated fields is written as "token pos pos label",
# and any other line is skipped. The fields are unpacked directly instead of
# building a pandas DataFrame per sentence, and the pieces are joined once at
# the end; the resulting string is the same as before.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [17]:
# Write the assembled development CoNLL text to disk in a single call.
# `conll_lines` is one string, so iterating over it yields single characters;
# the previous loop therefore issued one write() per character.
# Note: the file is named sample.train even though it sits in the development
# directory and holds the development data.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/sample.train","w") as fp:
    fp.write(conll_lines)

### Create test set

In [18]:
# Read every line of the onto.bn.ner test file into a list.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.bn.ner") as source_file:
    text = list(source_file)

In [19]:
# Drop the first line, glue the rest back together, then split on blank lines
# so each list element holds one block of tab-separated token rows.
# `text` is overwritten, so re-running this cell on its own would re-process
# the already-split list instead of the raw lines.
remaining_lines = text[1:]
text = "".join(remaining_lines).split("\n\n")

In [20]:
# Preview the first ten blocks produced by the split.
text[:10]

['Iraqi\tJJ\t(TOP(S(NP(NML*\tB-NORP\nleader\tNN\t*)\tO\nSaddam\tNNP\t*\tB-PERSON\nHussein\tNNP\t*)\tI-PERSON\nhas\tVBZ\t(VP*\tO\ngiven\tVBN\t(VP*\tO\na\tDT\t(NP*\tO\ndefiant\tJJ\t*\tO\nspeech\tNN\t*)\tO\nto\tTO\t(S(VP*\tO\nmark\tVB\t(VP*\tO\nthe\tDT\t(NP(NP*\tO\ntenth\tJJ\t*\tB-ORDINAL\nanniversary\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP*\tB-EVENT\nGulf\tNNP\t*\tI-EVENT\nWar\tNNP\t*))))))))\tI-EVENT\n.\t.\t*))\tO',
 'He\tPRP\t(TOP(S(NP*)\tO\nsays\tVBZ\t(VP*\tO\nIraq\tNNP\t(SBAR(S(NP*)\tB-GPE\nhas\tVBZ\t(VP*\tO\ntriumphed\tVBN\t(VP*\tO\nover\tIN\t(PP*\tO\nthe\tDT\t(NP(NP*\tO\nevil\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP*\tO\nWest\tNNP\t*)))))))))\tB-LOC\n.\t.\t*))\tO',
 'Barbara\tNNP\t(TOP(S(NP*\tB-PERSON\nPlett\tNNP\t*)\tI-PERSON\nreports\tVBZ\t(VP*\tO\nfrom\tIN\t(PP*\tO\nBaghdad\tNNP\t(NP*)))\tB-GPE\n.\t.\t*))\tO',
 'Saddam\tNNP\t(TOP(S(NP*\tB-PERSON\nHussein\tNNP\t*)\tI-PERSON\naddressed\tVBD\t(VP*\tO\nthe\tDT\t(NP*\tO\nnation\tNN\t*)\tO\nin\tIN\t(PP*\tO\na\tDT\t(NP(NP*\tO\nspeec

In [21]:
import pandas as pd

# Tabulate the block at index 1: one row per line, one column per
# tab-separated field.
sample_rows = [row.split('\t') for row in text[1].split('\n')]
df = pd.DataFrame(sample_rows, columns=["Token","Pos","Pos_special","Entity_label"])

In [22]:
# Build the CoNLL text for the test set, starting with the document header.
# Each block in `text` becomes one sentence: every line that splits into
# exactly four tab-separated fields is written as "token pos pos label", and
# any other line is skipped. The fields are unpacked directly instead of
# building a pandas DataFrame per sentence, and the pieces are joined once at
# the end; the resulting string is the same as before.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [23]:
# Load the onto.mz.ner test file, drop its first line, and split the rest on
# blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.mz.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [24]:
# Append the blocks currently held in `text` to the CoNLL text built so far.
# Every line that splits into exactly four tab-separated fields is written as
# "token pos pos label"; any other line is skipped. The fields are unpacked
# directly instead of building a pandas DataFrame per sentence, and the new
# pieces are joined once; the appended text is the same as before.
conll_parts = []
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [25]:
# Load the onto.nw.ner test file, drop its first line, and split the rest on
# blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.nw.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [26]:
# Append the blocks currently held in `text` to the CoNLL text built so far.
# Every line that splits into exactly four tab-separated fields is written as
# "token pos pos label"; any other line is skipped. The fields are unpacked
# directly instead of building a pandas DataFrame per sentence, and the new
# pieces are joined once; the appended text is the same as before.
conll_parts = []
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines += "".join(conll_parts)

In [27]:
# Write the assembled test CoNLL text to disk in a single call.
# `conll_lines` is one string, so iterating over it yields single characters;
# the previous loop therefore issued one write() per character.
# Note: the file is named sample.train even though it holds the test data.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

### Import data in CoNLL format

In [28]:
from sparknlp.training import CoNLL

# Parse the training CoNLL file into a Spark DataFrame and preview its rows.
train_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/train/sample.train'
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, train_conll_path)
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|This is The World...|[{document, 0, 88...|[{document, 0, 88...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ...|[{named_entity, 0...|
| I am Lisa Mullins .|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 0, I,...|[{pos, 0, 0, PRP,...|[{named_entity, 0...|
|A plan was announ...|[{document, 0, 18...|[{document, 0, 18...|[{token, 0, 0, A,...|[{pos, 0, 0, DT, ...|[{named_entity, 0...|
|The Kursk Foundat...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The BBC 's James ...|[{document, 0, 66...|[{document, 0, 66...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [29]:
# Parse the development CoNLL file into a Spark DataFrame and preview its rows.
dev_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/development/sample.train'
conll_reader = CoNLL()
dev_data = conll_reader.readDataset(spark, dev_conll_path)
dev_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Abd al - Bari Atw...|[{document, 0, 93...|[{document, 0, 93...|[{token, 0, 2, Ab...|[{pos, 0, 2, NNP,...|[{named_entity, 0...|
|                Asad|[{document, 0, 3,...|[{document, 0, 3,...|[{token, 0, 3, As...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|
|           1/02/2007|[{document, 0, 8,...|[{document, 0, 8,...|[{token, 0, 8, 1/...|[{pos, 0, 8, NN, ...|[{named_entity, 0...|
|This unprecedente...|[{document, 0, 27...|[{document, 0, 27...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ...|[{named_entity, 0...|
|The American admi...|[{document, 0, 34...|[{document, 0, 34...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [30]:
# Parse the test CoNLL file into a Spark DataFrame and preview its rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
conll_reader = CoNLL()
test_data = conll_reader.readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Iraqi leader Sadd...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 4, Ir...|[{pos, 0, 4, JJ, ...|[{named_entity, 0...|
|He says Iraq has ...|[{document, 0, 53...|[{document, 0, 53...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|Barbara Plett rep...|[{document, 0, 35...|[{document, 0, 35...|[{token, 0, 6, Ba...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|
|Saddam Hussein ad...|[{document, 0, 98...|[{document, 0, 98...|[{token, 0, 5, Sa...|[{pos, 0, 5, NNP,...|[{named_entity, 0...|
|`` Iraq has trium...|[{document, 0, 11...|[{document, 0, 11...|[{token, 0, 1, ``...|[{pos, 0, 1, ``, ..

### Training the model - 10 epochs

In [31]:
# Load the pretrained English 'bert_base_cased' embeddings. The stage reads
# the "sentence" and "token" columns and writes its output to "bert".
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence",'token'])
    .setOutputCol("bert")
    .setCaseSensitive(True)
    # .setMaxSentenceLength(512)  # left disabled, as in the original cell
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [32]:
# Apply the BERT embeddings stage to the training set and save the result as
# Parquet, replacing any previous output at this path.
readyTrainingData = bert.transform(training_data)

train_writer = readyTrainingData.write.mode("Overwrite")
train_writer.parquet("/tmp/conll2003/bert_train")

In [33]:
# Re-read the embedded training set from the Parquet output path, so later
# cells work from the saved copy.
readyTrainingData = spark.read.parquet("/tmp/conll2003/bert_train")

In [34]:
# Apply the BERT embeddings stage to the development set and save the result
# as Parquet, replacing any previous output at this path.
readyDevData = bert.transform(dev_data)

dev_writer = readyDevData.write.mode("Overwrite")
dev_writer.parquet("/tmp/conll2003/bert_dev")

In [35]:
# Re-read the embedded development set from the Parquet output path.
readyDevData = spark.read.parquet("/tmp/conll2003/bert_dev")

In [36]:
# Apply the BERT embeddings stage to the test set and save the result as
# Parquet, replacing any previous output at this path.
readyTestData = bert.transform(test_data)

test_writer = readyTestData.write.mode("Overwrite")
test_writer.parquet("/tmp/conll2003/bert_test")

In [37]:
# Re-read the embedded test set from the Parquet output path.
readyTestData = spark.read.parquet("/tmp/conll2003/bert_test")

### Dev set - news dataset

In [38]:
# Configure the NerDL trainer. It reads the "sentence", "token" and "bert"
# columns, learns from the "label" column, and writes predictions to "ner".
# The Parquet path given to setTestDataset is the embedded development set.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(10)
    .setBatchSize(4)
    .setEnableMemoryOptimizer(True)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setTestDataset("/tmp/conll2003/bert_dev")
)

In [39]:
# Fit the NER tagger on the embedded training set; %time reports how long the
# call takes (the recorded run shows a wall time of roughly nine hours).
%time myNerModel = nerTagger.fit(readyTrainingData)

CPU times: user 803 ms, sys: 533 ms, total: 1.34 s
Wall time: 8h 57min 44s


### Testing on News Dataset

In [40]:
# Run the trained model over the embedded test set, keep only the columns
# needed for scoring, and collect the rows into a local list (timed with %time).
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 2.17 s, sys: 363 ms, total: 2.53 s
Wall time: 21 s


In [41]:
# Record the positions of rows whose number of gold labels differs from the
# number of predicted tags; such rows cannot be compared tag by tag.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [42]:
# Show how many rows have mismatched lengths and where they are in `results`.
print(count)
print(indices)

2
[88, 392]


In [43]:
# Keep the mismatched rows aside, then drop them from `results` by position.
# `results` is overwritten, so re-running this cell without recomputing
# `indices` would remove whatever rows now sit at those positions.
skipped_positions = set(indices)
exclusion_list = [results[position] for position in indices]
results = [row for position, row in enumerate(results) if position not in skipped_positions]

In [44]:
# Pull the plain 'result' strings out of each annotation, keeping one inner
# list per row for the tokens, the gold labels and the predicted tags.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [45]:
# Import the seqeval metrics and print the accuracy of the predicted tag
# sequences against the gold labels.
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9766470856051437


In [46]:
# Print the seqeval F1 score for the predicted tags against the gold labels.
print(f1_score(labels,ners))

0.8922542204568022


In [48]:
# Print the per-label precision/recall/F1 report.
# NOTE(review): zero_division=1 appears to report 1.00 for any metric whose
# denominator is zero, which would raise the macro average — confirm this is
# intended rather than zero_division=0.
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.85      0.85      0.85       616
        DATE       0.83      0.89      0.86      1252
       EVENT       0.73      0.65      0.69        37
         FAC       0.70      0.47      0.56        66
         GPE       0.97      0.94      0.95      1664
    LANGUAGE       0.71      0.50      0.59        10
         LAW       0.77      0.56      0.65        36
         LOC       0.84      0.75      0.79       144
       MONEY       0.89      0.90      0.90       279
        NORP       0.96      0.94      0.95       579
     ORDINAL       0.80      0.91      0.85       118
         ORG       0.87      0.90      0.88      1494
     PERCENT       0.92      0.92      0.92       293
      PERSON       0.94      0.95      0.95      1099
     PRODUCT       0.64      0.55      0.59        71
    QUANTITY       0.70      0.78      0.74        59
        TIME       0.57      0.64      0.60       106
 WORK_OF_ART       0.84    

### Testing on News Conversations

In [49]:
# Load the onto.bc.ner test file, drop its first line, and split the rest on
# blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.bc.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [50]:
# Build the CoNLL text for this test file, starting with the document header.
# Each block in `text` becomes one sentence: every line that splits into
# exactly four tab-separated fields is written as "token pos pos label", and
# any other line is skipped. The fields are unpacked directly instead of
# building a pandas DataFrame per sentence, and the pieces are joined once at
# the end; the resulting string is the same as before.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [51]:
# Write the assembled CoNLL text to disk in a single call.
# `conll_lines` is one string, so iterating over it yields single characters;
# the previous loop therefore issued one write() per character.
# Note: this is the same path used for the earlier test set, so that file is
# overwritten.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

In [52]:
# Parse the rewritten test CoNLL file into a Spark DataFrame and preview its rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
conll_reader = CoNLL()
test_data = conll_reader.readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|-- basically , it...|[{document, 0, 78...|[{document, 0, 78...|[{token, 0, 1, --...|[{pos, 0, 1, :, {...|[{named_entity, 0...|
|To express its de...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 1, To...|[{pos, 0, 1, TO, ...|[{named_entity, 0...|
|It takes time to ...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 1, It...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|Dear viewers , th...|[{document, 0, 52...|[{document, 0, 52...|[{token, 0, 3, De...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|
|     This is Xu Li .|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ..

In [53]:
# Apply the BERT embeddings stage to the current test set and save the result
# as Parquet, replacing the earlier test output at the same path.
readyTestData = bert.transform(test_data)

test_writer = readyTestData.write.mode("Overwrite")
test_writer.parquet("/tmp/conll2003/bert_test")

In [54]:
# Run the trained model over the embedded test set, keep only the columns
# needed for scoring, and collect the rows into a local list (timed with %time).
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 1.28 s, sys: 388 ms, total: 1.67 s
Wall time: 1min 1s


In [55]:
# Record the positions of rows whose number of gold labels differs from the
# number of predicted tags; such rows cannot be compared tag by tag.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [56]:
# Show how many rows have mismatched lengths and where they are in `results`.
print(count)
print(indices)

0
[]


In [57]:
# Keep the mismatched rows aside, then drop them from `results` by position.
# `results` is overwritten, so re-running this cell without recomputing
# `indices` would remove whatever rows now sit at those positions.
skipped_positions = set(indices)
exclusion_list = [results[position] for position in indices]
results = [row for position, row in enumerate(results) if position not in skipped_positions]

In [58]:
# Pull the plain 'result' strings out of each annotation, keeping one inner
# list per row for the tokens, the gold labels and the predicted tags.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [59]:
# Import the seqeval metrics and print the accuracy of the predicted tag
# sequences against the gold labels.
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9754063038660429


In [60]:
# Print the seqeval F1 score for the predicted tags against the gold labels.
print(f1_score(labels,ners))

0.7887240356083086


In [61]:
# Print the per-label precision/recall/F1 report.
# NOTE(review): zero_division=1 appears to report 1.00 for any metric whose
# denominator is zero, which would raise the macro average — confirm this is
# intended rather than zero_division=0.
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.71      0.74      0.73       182
        DATE       0.73      0.79      0.76       200
       EVENT       0.50      0.14      0.22        14
         FAC       0.93      0.83      0.88        48
         GPE       0.91      0.91      0.91       353
         LAW       1.00      0.00      0.00         3
         LOC       1.00      0.38      0.56        26
       MONEY       1.00      1.00      1.00         3
        NORP       0.62      0.63      0.62       138
     ORDINAL       0.88      0.86      0.87        50
         ORG       0.73      0.82      0.77       153
     PERCENT       0.93      0.93      0.93        14
      PERSON       0.93      0.91      0.92       382
    QUANTITY       0.21      0.15      0.17        40
        TIME       0.52      0.56      0.54        63
 WORK_OF_ART       0.30      0.11      0.16        28

   micro avg       0.79      0.78      0.79      1697
   macro avg       0.74   

### Testing on WebLogs

In [62]:
# Load the onto.wb.ner test file, drop its first line, and split the rest on
# blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.wb.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [63]:
# Build the CoNLL text for this test file, starting with the document header.
# Each block in `text` becomes one sentence: every line that splits into
# exactly four tab-separated fields is written as "token pos pos label", and
# any other line is skipped. The fields are unpacked directly instead of
# building a pandas DataFrame per sentence, and the pieces are joined once at
# the end; the resulting string is the same as before.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [64]:
# Write the assembled CoNLL text to disk in a single call.
# `conll_lines` is one string, so iterating over it yields single characters;
# the previous loop therefore issued one write() per character.
# Note: this is the same path used for the earlier test sets, so that file is
# overwritten.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

In [65]:
# Parse the rewritten test CoNLL file into a Spark DataFrame and preview its rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
conll_reader = CoNLL()
test_data = conll_reader.readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|The success of al...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The Source of the...|[{document, 0, 23...|[{document, 0, 23...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The al - Jazeera ...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|In this film the ...|[{document, 0, 73...|[{document, 0, 73...|[{token, 0, 1, In...|[{pos, 0, 1, IN, ...|[{named_entity, 0...|
|The Hebrew channe...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [66]:
# Apply the BERT embeddings stage to the current test set and save the result
# as Parquet, replacing the earlier test output at the same path.
readyTestData = bert.transform(test_data)

test_writer = readyTestData.write.mode("Overwrite")
test_writer.parquet("/tmp/conll2003/bert_test")

In [67]:
# Run the trained model over the embedded test set, keep only the columns
# needed for scoring, and collect the rows into a local list (timed with %time).
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 424 ms, sys: 346 ms, total: 771 ms
Wall time: 34.1 s


In [68]:
# Record the positions of rows whose number of gold labels differs from the
# number of predicted tags; such rows cannot be compared tag by tag.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [69]:
# Show how many rows have mismatched lengths and where they are in `results`.
print(count)
print(indices)

0
[]


In [70]:
# Keep the mismatched rows aside, then drop them from `results` by position.
# `results` is overwritten, so re-running this cell without recomputing
# `indices` would remove whatever rows now sit at those positions.
skipped_positions = set(indices)
exclusion_list = [results[position] for position in indices]
results = [row for position, row in enumerate(results) if position not in skipped_positions]

In [71]:
# Pull the plain 'result' strings out of each annotation, keeping one inner
# list per row for the tokens, the gold labels and the predicted tags.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [72]:
# Import the seqeval metrics and print the accuracy of the predicted tag
# sequences against the gold labels.
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9631037212984956


In [73]:
# Print the seqeval F1 score for the predicted tags against the gold labels.
print(f1_score(labels,ners))

0.7555749890686488


In [74]:
# Print the per-label precision/recall/F1 report.
# NOTE(review): zero_division=1 appears to report 1.00 for any metric whose
# denominator is zero, which would raise the macro average — confirm this is
# intended rather than zero_division=0.
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.83      0.74      0.78        85
        DATE       0.46      0.62      0.53        74
       EVENT       0.43      0.25      0.32        12
         FAC       0.60      0.17      0.26        18
         GPE       0.93      0.94      0.93       173
    LANGUAGE       0.25      0.25      0.25         4
         LAW       1.00      1.00      1.00         1
         LOC       1.00      0.56      0.71         9
       MONEY       0.66      0.76      0.70        25
        NORP       0.92      0.92      0.92       107
     ORDINAL       0.80      0.67      0.73        18
         ORG       0.55      0.85      0.67       117
     PERCENT       0.58      0.58      0.58        33
      PERSON       0.85      0.76      0.81       407
     PRODUCT       0.00      0.00      0.00         1
    QUANTITY       0.50      0.33      0.40         6
        TIME       0.46      0.65      0.54        20
 WORK_OF_ART       0.43    

### Testing on Telephonic Conversations

In [75]:
# Load the onto.tc.ner test file, drop its first line, and split the rest on
# blank lines into blocks.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.tc.ner") as source_file:
    raw_lines = source_file.readlines()
text = "".join(raw_lines[1:]).split("\n\n")

In [76]:
# Build the CoNLL text for this test file, starting with the document header.
# Each block in `text` becomes one sentence: every line that splits into
# exactly four tab-separated fields is written as "token pos pos label", and
# any other line is skipped. The fields are unpacked directly instead of
# building a pandas DataFrame per sentence, and the pieces are joined once at
# the end; the resulting string is the same as before.
conll_parts = ["-DOCSTART- -X- -X- -O-\n\n"]
for sentence in text:
    for line in sentence.split('\n'):
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        token, pos, _, label = fields
        # The POS tag is written twice: it also fills the third output column.
        conll_parts.append("{} {} {} {}\n".format(token, pos, pos, label))
    # A blank line closes the sentence.
    conll_parts.append("\n")
conll_lines = "".join(conll_parts)

In [77]:
# Write the assembled CoNLL text to disk in a single call.
# `conll_lines` is one string, so iterating over it yields single characters;
# the previous loop therefore issued one write() per character.
# Note: this is the same path used for the earlier test sets, so that file is
# overwritten.
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    fp.write(conll_lines)

In [78]:
# Parse the rewritten test CoNLL file into a Spark DataFrame and preview its rows.
test_conll_path = '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train'
conll_reader = CoNLL()
test_data = conll_reader.readDataset(spark, test_conll_path)
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|But %um , guessed...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 2, Bu...|[{pos, 0, 2, CC, ...|[{named_entity, 0...|
|              What ?|[{document, 0, 5,...|[{document, 0, 5,...|[{token, 0, 3, Wh...|[{pos, 0, 3, WP, ...|[{named_entity, 0...|
|         %um The %um|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, %u...|[{pos, 0, 2, UH, ...|[{named_entity, 0...|
|             Again ?|[{document, 0, 6,...|[{document, 0, 6,...|[{token, 0, 4, Ag...|[{pos, 0, 4, RB, ...|[{named_entity, 0...|
|%um 118.91_120.82...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 2, %u...|[{pos, 0, 2, UH, ..

In [79]:
# Apply the BERT embeddings stage to the current test set and save the result
# as Parquet, replacing the earlier test output at the same path.
readyTestData = bert.transform(test_data)

test_writer = readyTestData.write.mode("Overwrite")
test_writer.parquet("/tmp/conll2003/bert_test")

In [80]:
# Run the trained model over the embedded test set, keep only the columns
# needed for scoring, and collect the rows into a local list (timed with %time).
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 346 ms, sys: 325 ms, total: 671 ms
Wall time: 30.2 s


In [81]:
# Record the positions of rows whose number of gold labels differs from the
# number of predicted tags; such rows cannot be compared tag by tag.
indices = [
    position
    for position, row in enumerate(results)
    if len(row['label']) != len(row['ner'])
]
count = len(indices)

In [82]:
# Show how many rows have mismatched lengths and where they are in `results`.
print(count)
print(indices)

0
[]


In [83]:
# Keep the mismatched rows aside, then drop them from `results` by position.
# `results` is overwritten, so re-running this cell without recomputing
# `indices` would remove whatever rows now sit at those positions.
skipped_positions = set(indices)
exclusion_list = [results[position] for position in indices]
results = [row for position, row in enumerate(results) if position not in skipped_positions]

In [84]:
# Pull the plain 'result' strings out of each annotation, keeping one inner
# list per row for the tokens, the gold labels and the predicted tags.
tokens = [[annotation['result'] for annotation in row['token']] for row in results]
labels = [[annotation['result'] for annotation in row['label']] for row in results]
ners = [[annotation['result'] for annotation in row['ner']] for row in results]

In [85]:
# Import the seqeval metrics and print the accuracy of the predicted tag
# sequences against the gold labels.
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9677478134110787


In [86]:
# Print the seqeval F1 score for the predicted tags against the gold labels.
print(f1_score(labels,ners))

0.6472019464720195


In [87]:
# Print the per-label precision/recall/F1 report.
# NOTE(review): with zero_division=1 the recorded report shows recall 1.00 for
# labels with support 0 (LOC, QUANTITY, WORK_OF_ART), which would raise the
# macro average — confirm this is intended rather than zero_division=0.
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.50      0.44      0.47        52
        DATE       0.55      0.78      0.64        74
         FAC       1.00      0.00      0.00         3
         GPE       0.90      0.88      0.89        50
    LANGUAGE       1.00      0.75      0.86         8
         LOC       0.00      1.00      0.00         0
       MONEY       1.00      0.71      0.83         7
        NORP       0.81      1.00      0.89        17
     ORDINAL       0.75      1.00      0.86         9
         ORG       0.28      0.37      0.32        27
     PERCENT       0.07      0.50      0.13         6
      PERSON       0.85      0.85      0.85       100
     PRODUCT       0.00      0.00      0.00         4
    QUANTITY       0.00      1.00      0.00         0
        TIME       0.43      0.26      0.32        23
 WORK_OF_ART       0.00      1.00      0.00         0

   micro avg       0.60      0.70      0.65       380
   macro avg       0.51   