In [3]:
# call relevant packages
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import sparknlp

# to use GPU 
spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.1.0
Apache Spark version:  3.1.2


## Creating a composite CoNLL file 

### Create training set

In [4]:
# create the training file with news conversation
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.wb.ner") as fp:
    text = fp.readlines()

In [5]:
text = "".join(text[1:]).split("\n\n") 

In [6]:
text[:10]

['Today\tNN\t(TOP(S(NP*)\tO\non\tIN\t(PP*\tO\nthe\tDT\t(NP*\tO\nMesopotamia\tNNP\t*\tB-ORG\nchannel\tNN\t*))\tO\nthey\tPRP\t(NP*)\tO\nbrought\tVBD\t(VP*\tO\nthe\tDT\t(NP(NP*\tO\nphoto\tNN\t*)\tO\nof\tIN\t(PP*\tO\nSaddam\tNNP\t(NP(NP*)\tB-PERSON\nin\tIN\t(PP*\tO\nthe\tDT\t(NP*\tO\ncoffin\tNN\t*))\tO\nbefore\tIN\t(PP*\tO\nburial\tNN\t(NP*))))))\tO\nThe\tDT\t(FRAG(NP(NP*\tB-PERSON\ncoast\tNN\t*)\tI-PERSON\nof\tIN\t(PP*\tI-PERSON\nJeddah\tNNP\t(NP*))))))\tI-PERSON',
 'The\tDT\t(TOP(S(NP(NP*\tO\nphoto\tNN\t*)\tO\nof\tIN\t(PP*\tO\nSaddam\tNNP\t(NP(NP*)\tB-PERSON\nminutes\tNNS\t(PP(NP*)\tO\nbefore\tIN\t*\tO\nthe\tDT\t(NP*\tO\nburial\tNN\t*)))))\tO\nwas\tVBD\t(VP*\tO\nclear\tJJ\t(ADJP*))\tO\n.\t.\t*))\tO',
 'It\tPRP\t(TOP(S(NP*)\tO\nwas\tVBD\t(VP*\tO\nas\tIN\t(SBAR*\tO\nwe\tPRP\t(S(NP*)\tO\nknow\tVBP\t(VP*\tO\nhim\tPRP\t(NP*))))\tO\n,\t,\t*\tO\nexcept\tIN\t(PP*\tO\nthat\tIN\t(SBAR*\tO\nhis\tPRP$\t(S(NP*\tO\nface\tNN\t*)\tO\nwas\tVBD\t(VP*\tO\nwhiter\tJJR\t(ADJP(ADJP*)\tO\nthan\tIN\t(SBAR*\tO\n

In [7]:
import pandas as pd
df = pd.DataFrame([x.split('\t') for x in text[1].split('\n')], columns=["Token","Pos","Pos_special","Entity_label"])

In [8]:
# creating the training data
conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [9]:
# adding telephonic conversation files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.tc.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [10]:
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [11]:
# adding weblog files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/onto.wb.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [14]:
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [13]:
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/train/sample.train","w") as fp:
    for line in conll_lines:
        fp.write(line)

### Create dev set

In [14]:
# create the dev file with all the news corpus
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.bn.ner") as fp:
    text = fp.readlines()

In [15]:
text = "".join(text[1:]).split("\n\n") 

In [16]:
text[:10]

["The\tDT\t(TOP(S(NP(NP*\tO\nIsraeli\tJJ\t*\tB-NORP\nPrime\tNNP\t*\tO\nMinister\tNNP\t*)\tO\nEhud\tNNP\t(NP*\tB-PERSON\nBarak\tNNP\t*))\tI-PERSON\nis\tVBZ\t(VP*\tO\ndue\tJJ\t(ADJP*\tO\nto\tTO\t(S(VP*\tO\nmeet\tVB\t(VP*\tO\nwith\tIN\t(PP*\tO\nEgypt\tNNP\t(NP(NP*\tB-GPE\n's\tPOS\t*)\tO\nPresident\tNNP\t*\tO\nMubarak\tNNP\t*))\tB-PERSON\nin\tIN\t(PP*\tO\nCairo\tNNP\t(NP*))\tB-GPE\non\tIN\t(PP*\tO\nThursday\tNNP\t(NP*)))))))\tB-DATE\n.\t.\t*))\tO",
 "The\tDT\t(TOP(S(NP*\tO\nencounter\tNN\t*)\tO\nis\tVBZ\t(VP*\tO\nbeing\tVBG\t(VP*\tO\nseen\tVBN\t(VP*\tO\nas\tIN\t(PP*\tO\nan\tDT\t(NP(NP*\tO\neffort\tNN\t*)\tO\nby\tIN\t(PP*\tO\nBarak\tNNP\t(NP*))\tB-PERSON\nto\tTO\t(S(VP*\tO\ndrum\tVB\t(VP*\tO\nup\tRP\t(PRT*)\tO\nregional\tJJ\t(NP(NP*\tO\nsupport\tNN\t*)\tO\nfor\tIN\t(PP*\tO\na\tDT\t(NP(NP*\tO\nMiddle\tNNP\t(NML*\tB-LOC\nEast\tNNP\t*)\tI-LOC\npeace\tNN\t*\tO\nagreement\tNN\t*)\tO\nbased\tVBN\t(VP*\tO\non\tIN\t(PP*\tO\nPresident\tNNP\t(NP(NP*\tO\nClinton\tNNP\t*\tB-PERSON\n's\tPOS\t*)\tO\nprop

In [17]:
df = pd.DataFrame([x.split('\t') for x in text[1].split('\n')], columns=["Token","Pos","Pos_special","Entity_label"])

In [18]:
# creating the training data
conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [19]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.mz.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [20]:
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [21]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/onto.nw.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [22]:
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [23]:
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/development/sample.train","w") as fp:
    for line in conll_lines:
        fp.write(line)

### Create test set

In [24]:
# create the training filJe with all the news corpus
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.bn.ner") as fp:
    text = fp.readlines()

In [25]:
text = "".join(text[1:]).split("\n\n") 

In [26]:
text[:10]

['Iraqi\tJJ\t(TOP(S(NP(NML*\tB-NORP\nleader\tNN\t*)\tO\nSaddam\tNNP\t*\tB-PERSON\nHussein\tNNP\t*)\tI-PERSON\nhas\tVBZ\t(VP*\tO\ngiven\tVBN\t(VP*\tO\na\tDT\t(NP*\tO\ndefiant\tJJ\t*\tO\nspeech\tNN\t*)\tO\nto\tTO\t(S(VP*\tO\nmark\tVB\t(VP*\tO\nthe\tDT\t(NP(NP*\tO\ntenth\tJJ\t*\tB-ORDINAL\nanniversary\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP*\tB-EVENT\nGulf\tNNP\t*\tI-EVENT\nWar\tNNP\t*))))))))\tI-EVENT\n.\t.\t*))\tO',
 'He\tPRP\t(TOP(S(NP*)\tO\nsays\tVBZ\t(VP*\tO\nIraq\tNNP\t(SBAR(S(NP*)\tB-GPE\nhas\tVBZ\t(VP*\tO\ntriumphed\tVBN\t(VP*\tO\nover\tIN\t(PP*\tO\nthe\tDT\t(NP(NP*\tO\nevil\tNN\t*)\tO\nof\tIN\t(PP*\tO\nthe\tDT\t(NP*\tO\nWest\tNNP\t*)))))))))\tB-LOC\n.\t.\t*))\tO',
 'Barbara\tNNP\t(TOP(S(NP*\tB-PERSON\nPlett\tNNP\t*)\tI-PERSON\nreports\tVBZ\t(VP*\tO\nfrom\tIN\t(PP*\tO\nBaghdad\tNNP\t(NP*)))\tB-GPE\n.\t.\t*))\tO',
 'Saddam\tNNP\t(TOP(S(NP*\tB-PERSON\nHussein\tNNP\t*)\tI-PERSON\naddressed\tVBD\t(VP*\tO\nthe\tDT\t(NP*\tO\nnation\tNN\t*)\tO\nin\tIN\t(PP*\tO\na\tDT\t(NP(NP*\tO\nspeec

In [27]:
import pandas as pd
df = pd.DataFrame([x.split('\t') for x in text[1].split('\n')], columns=["Token","Pos","Pos_special","Entity_label"])

In [28]:
# creating the training data
conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [29]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.mz.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [30]:
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [31]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.nw.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [32]:
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [33]:
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    for line in conll_lines:
        fp.write(line)

### import data in CONLL format

In [34]:
# importing the training set
from sparknlp.training import CoNLL

training_data = CoNLL().readDataset(spark, '/Users/ramyabala/Research Projects/Evaluate NER/bio/train/sample.train')
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Today on the Meso...|[{document, 0, 11...|[{document, 0, 11...|[{token, 0, 4, To...|[{pos, 0, 4, NN, ...|[{named_entity, 0...|
|The photo of Sadd...|[{document, 0, 56...|[{document, 0, 56...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|It was as we know...|[{document, 0, 87...|[{document, 0, 87...|[{token, 0, 1, It...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|The person who li...|[{document, 0, 94...|[{document, 0, 94...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|        My regards .|[{document, 0, 11...|[{document, 0, 11...|[{token, 0, 1, My...|[{pos, 0, 1, PRP$..

In [35]:
dev_data = CoNLL().readDataset(spark, '/Users/ramyabala/Research Projects/Evaluate NER/bio/development/sample.train')
dev_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|The Israeli Prime...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The encounter is ...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|They require Isra...|[{document, 0, 26...|[{document, 0, 26...|[{token, 0, 3, Th...|[{pos, 0, 3, PRP,...|[{named_entity, 0...|
|We have so many q...|[{document, 0, 84...|[{document, 0, 84...|[{token, 0, 1, We...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|The most importan...|[{document, 0, 40...|[{document, 0, 40...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [36]:
test_data = CoNLL().readDataset(spark, '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train')
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Iraqi leader Sadd...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 4, Ir...|[{pos, 0, 4, JJ, ...|[{named_entity, 0...|
|He says Iraq has ...|[{document, 0, 53...|[{document, 0, 53...|[{token, 0, 1, He...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|Barbara Plett rep...|[{document, 0, 35...|[{document, 0, 35...|[{token, 0, 6, Ba...|[{pos, 0, 6, NNP,...|[{named_entity, 0...|
|Saddam Hussein ad...|[{document, 0, 98...|[{document, 0, 98...|[{token, 0, 5, Sa...|[{pos, 0, 5, NNP,...|[{named_entity, 0...|
|`` Iraq has trium...|[{document, 0, 11...|[{document, 0, 11...|[{token, 0, 1, ``...|[{pos, 0, 1, ``, ..

### Training the model - 10 epochs

In [37]:
# use bert embeddings
bert = BertEmbeddings.pretrained('bert_base_cased', 'en').setInputCols(["sentence",'token']).setOutputCol("bert").setCaseSensitive(True)#.setMaxSentenceLength(512)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [38]:
# transforming the training data into embeddings and saving it as parquet files
readyTrainingData = bert.transform(training_data)

readyTrainingData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_train")

In [39]:
readyTrainingData = spark.read.parquet("/tmp/conll2003/bert_train")

In [40]:
# transforming the development data into embeddings and saving it as parquet files
readyDevData = bert.transform(dev_data)

readyDevData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_dev")

In [41]:
readyDevData = spark.read.parquet("/tmp/conll2003/bert_dev")

In [42]:
# transforming the test data into embeddings and saving it as parquet files
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

In [43]:
readyTestData = spark.read.parquet("/tmp/conll2003/bert_test")

### Dev set - news dataset

In [44]:
# initialize NER tagger
nerTagger = NerDLApproach()\
.setInputCols(["sentence", "token", "bert"])\
.setLabelColumn("label")\
.setOutputCol("ner")\
.setMaxEpochs(10)\
.setBatchSize(4)\
.setEnableMemoryOptimizer(True)\
.setRandomSeed(0)\
.setVerbose(1)\
.setValidationSplit(0.2)\
.setEvaluationLogExtended(True)\
.setEnableOutputLogs(True)\
.setIncludeConfidence(True)\
.setTestDataset("/tmp/conll2003/bert_dev")

In [45]:
# train the model
%time myNerModel = nerTagger.fit(readyTrainingData)

CPU times: user 481 ms, sys: 302 ms, total: 783 ms
Wall time: 2h 6min 42s


### Testing on News Dataset

In [46]:
# infer from the trained model
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 2.32 s, sys: 436 ms, total: 2.76 s
Wall time: 20 s


In [47]:
# to find exceptions where no. of labels does not match no. of ners detected
count = 0
indices = []
for i,row in enumerate(results):
    if len(row['label']) != len(row['ner']):
        count += 1
        indices.append(i)

In [48]:
print(count)
print(indices)

2
[88, 392]


In [49]:
exclusion_list = [results[t] for t in indices]
results = [results[i] for i in range(len(results)) if i not in indices]

In [50]:
tokens = []
labels = []
ners = []

for row in results:
    tokens.append([t['result'] for t in row['token']])
    labels.append([t['result'] for t in row['label']])
    ners.append([t['result'] for t in row['ner']])

In [51]:
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9730269952139295


In [52]:
print(f1_score(labels,ners))

0.873664051399271


In [53]:
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.84      0.88      0.86       616
        DATE       0.80      0.87      0.84      1252
       EVENT       0.65      0.59      0.62        37
         FAC       0.57      0.35      0.43        66
         GPE       0.96      0.92      0.94      1664
    LANGUAGE       0.80      0.40      0.53        10
         LAW       0.83      0.56      0.67        36
         LOC       0.69      0.70      0.70       144
       MONEY       0.89      0.90      0.89       279
        NORP       0.92      0.94      0.93       579
     ORDINAL       0.79      0.89      0.84       118
         ORG       0.85      0.89      0.87      1494
     PERCENT       0.90      0.91      0.90       293
      PERSON       0.92      0.96      0.94      1099
     PRODUCT       0.49      0.51      0.50        71
    QUANTITY       0.77      0.78      0.77        59
        TIME       0.61      0.56      0.58       106
 WORK_OF_ART       0.84    

### Testing on News Conversations

In [54]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.bc.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [55]:
# creating the training data
conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [56]:
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    for line in conll_lines:
        fp.write(line)

In [57]:
test_data = CoNLL().readDataset(spark, '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train')
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|-- basically , it...|[{document, 0, 78...|[{document, 0, 78...|[{token, 0, 1, --...|[{pos, 0, 1, :, {...|[{named_entity, 0...|
|To express its de...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 1, To...|[{pos, 0, 1, TO, ...|[{named_entity, 0...|
|It takes time to ...|[{document, 0, 16...|[{document, 0, 16...|[{token, 0, 1, It...|[{pos, 0, 1, PRP,...|[{named_entity, 0...|
|Dear viewers , th...|[{document, 0, 52...|[{document, 0, 52...|[{token, 0, 3, De...|[{pos, 0, 3, NNP,...|[{named_entity, 0...|
|     This is Xu Li .|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 3, Th...|[{pos, 0, 3, DT, ..

In [58]:
# transforming the test data into embeddings and saving it as parquet files
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

In [59]:
# infer from the trained model
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 1.21 s, sys: 316 ms, total: 1.53 s
Wall time: 55.8 s


In [60]:
# to find exceptions where no. of labels does not match no. of ners detected
count = 0
indices = []
for i,row in enumerate(results):
    if len(row['label']) != len(row['ner']):
        count += 1
        indices.append(i)

In [61]:
print(count)
print(indices)

0
[]


In [62]:
exclusion_list = [results[t] for t in indices]
results = [results[i] for i in range(len(results)) if i not in indices]

In [63]:
tokens = []
labels = []
ners = []

for row in results:
    tokens.append([t['result'] for t in row['token']])
    labels.append([t['result'] for t in row['label']])
    ners.append([t['result'] for t in row['ner']])

In [64]:
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.953952228515144


In [65]:
print(f1_score(labels,ners))

0.6065911431513903


In [66]:
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.69      0.74      0.71       182
        DATE       0.60      0.61      0.60       200
       EVENT       0.50      0.14      0.22        14
         FAC       0.84      0.77      0.80        48
         GPE       0.46      0.77      0.57       353
         LAW       1.00      0.00      0.00         3
         LOC       0.68      0.58      0.62        26
       MONEY       1.00      1.00      1.00         3
        NORP       0.55      0.59      0.57       138
     ORDINAL       0.87      0.90      0.88        50
         ORG       0.31      0.69      0.43       153
     PERCENT       0.93      0.93      0.93        14
      PERSON       0.64      0.79      0.71       382
     PRODUCT       0.00      1.00      0.00         0
    QUANTITY       0.63      0.55      0.59        40
        TIME       0.49      0.40      0.44        63
 WORK_OF_ART       0.00      0.00      0.00        28

   micro avg       0.54   

### Testing on WebLogs

In [67]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.wb.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [68]:
# creating the training data
conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [69]:
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    for line in conll_lines:
        fp.write(line)

In [70]:
test_data = CoNLL().readDataset(spark, '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train')
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|The success of al...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The Source of the...|[{document, 0, 23...|[{document, 0, 23...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|The al - Jazeera ...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ...|[{named_entity, 0...|
|In this film the ...|[{document, 0, 73...|[{document, 0, 73...|[{token, 0, 1, In...|[{pos, 0, 1, IN, ...|[{named_entity, 0...|
|The Hebrew channe...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, DT, ..

In [71]:
# transforming the test data into embeddings and saving it as parquet files
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

In [72]:
# infer from the trained model
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 400 ms, sys: 157 ms, total: 557 ms
Wall time: 37.4 s


In [73]:
# to find exceptions where no. of labels does not match no. of ners detected
count = 0
indices = []
for i,row in enumerate(results):
    if len(row['label']) != len(row['ner']):
        count += 1
        indices.append(i)

In [74]:
print(count)
print(indices)

0
[]


In [75]:
exclusion_list = [results[t] for t in indices]
results = [results[i] for i in range(len(results)) if i not in indices]

In [76]:
tokens = []
labels = []
ners = []

for row in results:
    tokens.append([t['result'] for t in row['token']])
    labels.append([t['result'] for t in row['label']])
    ners.append([t['result'] for t in row['ner']])

In [77]:
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9676431776194246


In [78]:
print(f1_score(labels,ners))

0.761820592134335


In [79]:
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.82      0.73      0.77        85
        DATE       0.53      0.73      0.61        74
       EVENT       0.40      0.17      0.24        12
         FAC       0.56      0.28      0.37        18
         GPE       0.93      0.93      0.93       173
    LANGUAGE       0.50      0.25      0.33         4
         LAW       1.00      1.00      1.00         1
         LOC       0.44      0.44      0.44         9
       MONEY       0.61      0.76      0.68        25
        NORP       0.87      0.88      0.87       107
     ORDINAL       0.81      0.72      0.76        18
         ORG       0.56      0.73      0.63       117
     PERCENT       0.58      0.58      0.58        33
      PERSON       0.88      0.78      0.83       407
     PRODUCT       0.00      0.00      0.00         1
    QUANTITY       0.50      0.33      0.40         6
        TIME       0.64      0.70      0.67        20
 WORK_OF_ART       0.58    

### Testing on Telephonic Conversations

In [80]:
# adding other news files
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/onto.tc.ner") as fp:
    text = fp.readlines()
text = "".join(text[1:]).split("\n\n")

In [81]:
# creating the training data
conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
for t in range(len(text)):    
    df = pd.DataFrame([x.split('\t') for x in text[t].split('\n') if len(x.split('\t')) == 4], columns=["Token","Pos","Pos_special","Entity_label"])
    tokens = df.Token.tolist()
    pos_labels = df.Pos.tolist()
    entity_labels = df.Entity_label.tolist()
    for token, pos, label in zip(tokens,pos_labels,entity_labels):
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
    conll_lines += "\n"

In [82]:
with open("/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train","w") as fp:
    for line in conll_lines:
        fp.write(line)

In [83]:
test_data = CoNLL().readDataset(spark, '/Users/ramyabala/Research Projects/Evaluate NER/bio/test/sample.train')
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|But %um , guessed...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 2, Bu...|[{pos, 0, 2, CC, ...|[{named_entity, 0...|
|              What ?|[{document, 0, 5,...|[{document, 0, 5,...|[{token, 0, 3, Wh...|[{pos, 0, 3, WP, ...|[{named_entity, 0...|
|         %um The %um|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 2, %u...|[{pos, 0, 2, UH, ...|[{named_entity, 0...|
|             Again ?|[{document, 0, 6,...|[{document, 0, 6,...|[{token, 0, 4, Ag...|[{pos, 0, 4, RB, ...|[{named_entity, 0...|
|%um 118.91_120.82...|[{document, 0, 32...|[{document, 0, 32...|[{token, 0, 2, %u...|[{pos, 0, 2, UH, ..

In [84]:
# transforming the test data into embeddings and saving it as parquet files
readyTestData = bert.transform(test_data)

readyTestData.write.mode("Overwrite").parquet("/tmp/conll2003/bert_test")

In [85]:
# infer from the trained model
%time results = myNerModel.transform(readyTestData).select("sentence","token","label","ner").collect()

CPU times: user 325 ms, sys: 40 ms, total: 365 ms
Wall time: 29.6 s


In [86]:
# to find exceptions where no. of labels does not match no. of ners detected
count = 0
indices = []
for i,row in enumerate(results):
    if len(row['label']) != len(row['ner']):
        count += 1
        indices.append(i)

In [87]:
print(count)
print(indices)

0
[]


In [88]:
exclusion_list = [results[t] for t in indices]
results = [results[i] for i in range(len(results)) if i not in indices]

In [89]:
tokens = []
labels = []
ners = []

for row in results:
    tokens.append([t['result'] for t in row['token']])
    labels.append([t['result'] for t in row['label']])
    ners.append([t['result'] for t in row['ner']])

In [90]:
from seqeval.metrics import accuracy_score, f1_score, classification_report
print(accuracy_score(labels,ners))

0.9788629737609329


In [91]:
print(f1_score(labels,ners))

0.7311827956989246


In [92]:
print(classification_report(labels,ners, zero_division=1))

              precision    recall  f1-score   support

    CARDINAL       0.76      0.75      0.76        52
        DATE       0.67      0.72      0.69        74
         FAC       1.00      0.00      0.00         3
         GPE       0.88      0.84      0.86        50
    LANGUAGE       1.00      0.38      0.55         8
         LOC       0.00      1.00      0.00         0
       MONEY       0.67      0.57      0.62         7
        NORP       0.77      1.00      0.87        17
     ORDINAL       0.89      0.89      0.89         9
         ORG       0.32      0.26      0.29        27
     PERCENT       1.00      0.33      0.50         6
      PERSON       0.85      0.93      0.89       100
     PRODUCT       0.00      0.00      0.00         4
    QUANTITY       0.00      1.00      0.00         0
        TIME       0.44      0.17      0.25        23

   micro avg       0.75      0.72      0.73       380
   macro avg       0.62      0.59      0.48       380
weighted avg       0.74   