In [25]:
# call relevant packages
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.training import CoNLL


import pandas as pd
import os

# Start the Spark session (with Spark NLP loaded) that the rest of the notebook uses.
# NOTE(review): this comment used to read "to use GPU", but start() is called with
# no arguments here. Spark NLP requests the GPU build via sparknlp.start(gpu=True);
# confirm which of the two is actually intended.
spark = sparknlp.start()

# Print the library versions so the recorded outputs can be tied to an environment.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.1.2
Apache Spark version:  3.1.2


In [26]:
# Blackbox-test configuration: edit these three values for every test set you evaluate.
# inputdir  - directory holding the original tab-separated CoNLL test files
# outputdir - directory that receives the files rewritten in Spark NLP's CoNLL layout
# testfiles - file names, looked up under inputdir and written under outputdir
inputdir = '/home/bangaru/Downloads/NERProject2021/conll-2012-share/bio/test/'
outputdir = '/home/bangaru/Downloads/NERProject2021/spark-format-test'
testfiles = [
    'onto.bn.ner',
    'onto.bc.ner',
    'onto.mz.ner',
    'onto.nw.ner',
    'onto.tc.ner',
    'onto.wb.ner',
]


In [27]:
#Converts our regular CONLL files to SparkNLP's CONLL format
def convert_format(inputpath, outputpath):
    """Rewrite a tab-separated CoNLL file in the space-separated layout Spark NLP reads.

    The first line of the input is skipped and the remainder is split into
    sentences on blank lines. Within a sentence, only rows with exactly four
    tab-separated fields (token, POS, special POS, entity label) are kept. Each
    kept row is written as ``token pos pos label`` (the POS tag fills both middle
    columns; the special-POS field is dropped). The output starts with a
    ``-DOCSTART-`` header and every sentence chunk is followed by a blank line.

    Args:
        inputpath: Path of the tab-separated source file.
        outputpath: Path of the file to create (overwritten if it exists).

    Returns:
        None. Prints ``Done`` once the output file has been written.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(inputpath, encoding="utf-8") as fp:
        raw_lines = fp.readlines()
    # Drop the first line, then cut the remaining text into sentences on blank lines.
    sentences = "".join(raw_lines[1:]).split("\n\n")

    # Collect the pieces in a list and join once instead of growing a string in a loop.
    parts = ["-DOCSTART- -X- -X- -O-\n\n"]
    for sentence in sentences:
        for row in sentence.split("\n"):
            fields = row.split("\t")
            if len(fields) != 4:
                # Not a token row (blank remainder, malformed line): skip it.
                continue
            token, pos, _pos_special, label = fields
            parts.append("{} {} {} {}\n".format(token, pos, pos, label))
        # Sentence separator; emitted for every chunk, including ones with no kept rows.
        parts.append("\n")

    with open(outputpath, "w", encoding="utf-8") as fp:
        fp.write("".join(parts))

    print("Done")
    


In [28]:
# Convert all test files into SparkNLP's expected CONLL format.
# Sources are read from `inputdir` (the notebook never defines `base_path`, so the
# previous call raised NameError on a fresh kernel) and written under `outputdir`,
# which is created first if it does not exist yet.
os.makedirs(outputdir, exist_ok=True)
for testfile in testfiles:
    convert_format(os.path.join(inputdir, testfile), os.path.join(outputdir, testfile))

Done
Done
Done
Done
Done
Done


In [29]:
# Set up the NER model: pretrained BERT embeddings feeding a pretrained NerDL tagger.

# Embedding stage: reads the "sentence" and "token" columns, writes "bert".
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(True)
    .setMaxSentenceLength(512)
)

# Tagger stage: reads "sentence", "token" and the "bert" embeddings, writes "ner".
ner_onto = (
    NerDLModel.pretrained('onto_bert_base_cased', lang='en')
    .setInputCols(["sentence", "token", "bert"])
    .setOutputCol("ner")
)

# Two-stage pipeline: embeddings first, then the tagger.
nlp_pipeline = Pipeline(stages=[bert, ner_onto])

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]
onto_bert_base_cased download started this may take some time.
Approximate size to download 15.5 MB
[OK!]


In [35]:
def get_results(testfile):
    """Run the NER pipeline on one Spark NLP CoNLL file and print a seqeval report.

    The file is read with ``CoNLL().readDataset``, pushed through the
    module-level ``nlp_pipeline``, and the collected rows are compared sentence
    by sentence. Rows whose number of gold labels differs from the number of
    predicted tags are left out of the score; how many there are, and their
    row indices, are printed before the classification report.

    Args:
        testfile: Path of a file already converted to Spark NLP's CoNLL layout.

    Returns:
        None. The skipped-row count, the skipped-row indices and the
        classification report are printed.
    """
    # Imported up front so a missing seqeval install fails before the Spark job runs.
    from seqeval.metrics import classification_report

    test_data = CoNLL().readDataset(spark, testfile)
    myNerModel = nlp_pipeline.fit(test_data)
    results = (
        myNerModel.transform(test_data)
        .select("sentence", "token", "label", "ner")
        .collect()
    )

    # Rows where the number of labels does not match the number of predictions
    # cannot be scored and are excluded.
    indices = [
        i for i, row in enumerate(results)
        if len(row['label']) != len(row['ner'])
    ]
    print(len(indices))
    print(indices)

    # Set membership keeps the filter linear in the number of rows.
    skipped = set(indices)
    aligned_rows = [row for i, row in enumerate(results) if i not in skipped]

    labels = [[t['result'] for t in row['label']] for row in aligned_rows]
    ners = [[t['result'] for t in row['ner']] for row in aligned_rows]

    print(classification_report(labels, ners, zero_division=1, digits=6))


In [36]:
# Score every converted test file. The file path is printed before its report and a
# line of asterisks after it, so the per-file outputs can be told apart.
for testfile in testfiles:
    filepath = os.path.join(outputdir,testfile)
    print(filepath)
    get_results(filepath)
    print("**************")

/home/bangaru/Downloads/NERProject2021/spark-format-test/onto.bn.ner
0
[]
              precision    recall  f1-score   support

    CARDINAL   0.841176  0.905063  0.871951       158
        DATE   0.885417  0.864407  0.874786       295
       EVENT   0.777778  0.466667  0.583333        15
         FAC   0.772727  0.586207  0.666667        29
         GPE   0.980263  0.967532  0.973856       462
    LANGUAGE   1.000000  0.600000  0.750000         5
         LAW   1.000000  1.000000  1.000000         4
         LOC   0.850000  0.871795  0.860759        39
       MONEY   0.894737  0.850000  0.871795        20
        NORP   0.964539  0.974910  0.969697       279
     ORDINAL   0.791667  0.883721  0.835165        43
         ORG   0.858268  0.882591  0.870259       247
     PERCENT   0.833333  0.833333  0.833333         6
      PERSON   0.964444  0.977477  0.970917       444
     PRODUCT   0.755556  0.790698  0.772727        43
    QUANTITY   0.875000  0.636364  0.736842        11
       