In [1]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Start a Spark session with Spark NLP.
# The start() function has two parameters: gpu and spark23.
# sparknlp.start(gpu=True) starts the session with GPU support.
# sparknlp.start(spark23=True) is for when Apache Spark 2.3.x is installed.
spark = sparknlp.start()

In [2]:
# Record the Spark NLP and Apache Spark versions this session runs on.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 2.7.3
Apache Spark version: 2.4.4


In [3]:
import pathlib 
# Recursively list every entry under the test folder whose name contains a dot.
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = list(map(str, path.rglob("*.*")))
# Preview the CSV name derived from each input: the part of the file name
# before its first dot, wrapped in "test_" ... ".csv".
for f in flist:
    base_name = pathlib.Path(f).name.split(".")[0]
    filename = "test_" + base_name + ".csv"
    print(filename)

test_onto_nw_ner.csv
test_onto_bc_ner.csv
test_onto_wb_ner.csv
test_onto_mz_ner.csv
test_onto_pt_ner.csv
test_onto_bn_ner.csv
test_onto_tc_ner.csv


In [4]:
import csv
import pandas as pd
# Read the header-less, tab-separated token file into a frame with named columns.
# csv.QUOTE_NONE makes the parser treat quote characters as ordinary data.
df = pd.read_csv(
    "/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt",
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
    header=None,
    names=["Word", "POS", "DEREP", "TYPE", "SENT_NO"],
)

In [5]:
# (rows, columns) of the token frame.
df.shape

(32488, 5)

In [6]:
# Confirm the column names assigned by read_csv(names=...).
df.columns

Index(['Word', 'POS', 'DEREP', 'TYPE', 'SENT_NO'], dtype='object')

In [7]:
# Preview the first token rows.
df.head()

Unnamed: 0,Word,POS,DEREP,TYPE,SENT_NO
0,--,:,(TOP(S*,O,1
1,basically,RB,(ADVP*),O,1
2,",",",",*,O,1
3,it,PRP,(NP*),O,1
4,was,VBD,(VP*,O,1


In [8]:
import time
# Rebuild one plain-text sentence per SENT_NO group and keep the gold tokens
# and their TYPE tags next to it.
#
# A token is treated as a "word" when its first character is alphanumeric:
#   * a word followed by a non-word token is glued to it in the sentence text
#     ("parties" + "." -> "parties."); both tokens and both tags are recorded;
#   * any other non-word token is dropped from the text, tokens and tags;
#   * every remaining word is kept unchanged.
sentences = []       # rebuilt sentence text, one entry per SENT_NO group
entities = []        # surviving tokens of each sentence
entities_type = []   # TYPE tag of each surviving token (aligned with `entities`)
sentence_groups = df.groupby("SENT_NO")
print("there are {} sentences".format(sentence_groups.ngroups))
count = 1
start = time.time()
for _, group in sentence_groups:
    if count % 1000 == 0:
        print("Done for {} in {} seconds".format(count, time.time() - start))
    # Materialise both columns once per sentence so the token loop works on
    # plain lists instead of re-slicing the DataFrame for every access.
    words = group["Word"].tolist()
    types = group["TYPE"].tolist()
    last = len(words) - 1
    temp1 = []   # text pieces
    temp2 = []   # kept tags
    temp3 = []   # kept tokens
    for i, t in enumerate(words):
        if not t[0].isalnum():
            # Non-word tokens only survive when glued to the word before them,
            # which is handled when that word is visited.
            continue
        if i < last and not words[i + 1][0].isalnum():
            temp1.append(t + words[i + 1])
            temp2.extend([types[i], types[i + 1]])
            temp3.extend([t, words[i + 1]])
        else:
            temp1.append(t)
            temp2.append(types[i])
            temp3.append(t)
    sentences.append(" ".join(temp1))
    entities_type.append(temp2)
    entities.append(temp3)
    count += 1

there are 2037 sentences
Done for 1000 in 25.955174922943115 seconds
Done for 2000 in 48.165326833724976 seconds


In [9]:
# One tag list per sentence group is expected.
len(entities_type)

2037

In [10]:
# Text rebuilt for the first sentence.
sentences[0]

'basically, it was unanimously agreed upon by the various relevant parties.'

In [11]:
# Gold tags kept for the first sentence.
entities_type[0]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [12]:
# Gold tokens kept for the first sentence (aligned with entities_type[0]).
entities[0]

['basically',
 ',',
 'it',
 'was',
 'unanimously',
 'agreed',
 'upon',
 'by',
 'the',
 'various',
 'relevant',
 'parties',
 '.']

In [13]:
# Number of rebuilt sentences.
len(sentences)

2037

In [14]:
# Number of token lists; should equal len(sentences).
len(entities)

2037

In [15]:
# Number of tag lists; should equal len(sentences).
len(entities_type)

2037

In [16]:
# Eyeball the first ten rebuilt sentences.
sentences[0:10]

['basically, it was unanimously agreed upon by the various relevant parties.',
 'To express its determination, the Chinese securities regulatory department compares this stock reform to a die that has been cast.',
 'It takes time to prove whether the stock reform can really meet expectations, and whether any deviations that arise during the stock reform can be promptly corrected.',
 'Dear viewers, the China News program will end here.',
 'This is Xu Li.',
 'Thank you everyone for watching.',
 'Coming up is the Focus Today program hosted by Wang Shilin.',
 'Good-bye, dear viewers.',
 'Hello, dear viewers.',
 'Welcome to Focus Today.']

In [17]:
# Pretrained "bert_base_cased" (English) embeddings stage: reads the
# "sentence" and "token" columns and writes "embeddings".
embeddings = (
    BertEmbeddings.pretrained("bert_base_cased", "en")
    .setInputCols("sentence", "token")
    .setOutputCol("embeddings")
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [18]:
# Pretrained "onto_bert_base_cased" (English) NER stage writing the "ner" column.
# NOTE(review): this stage reads "document" while the embeddings stage reads
# "sentence" - confirm the difference is intended.
ner_onto = (
    NerDLModel.pretrained("onto_bert_base_cased", "en")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

onto_bert_base_cased download started this may take some time.
Approximate size to download 15.5 MB
[OK!]


In [19]:
# text -> document
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# document -> sentence
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# sentence -> token
# NOTE(review): ',' and '/' are not in the context-char list; the smoke test in
# this notebook yields the token 'even/' - confirm that trailing punctuation
# staying attached to a token is acceptable when comparing with gold tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .setSplitChars(['-'])
    .setContextChars(['(', ')', '?', '!', '.'])
)

# Full chain: document -> sentence -> token -> embeddings -> ner
nlp_pipeline = Pipeline(
    stages=[document_assembler, sentence_detector, tokenizer, embeddings, ner_onto]
)

In [20]:
# Fit on a dummy one-row frame whose "text" is an empty string.
# Presumably sufficient because the embeddings and NER stages are loaded
# pretrained - confirm no stage needs real training data.
pipeline_model = nlp_pipeline.fit(spark.createDataFrame([['']]).toDF('text'))

In [21]:
# Single-element list holding all rebuilt sentences joined by spaces.
# NOTE(review): `text` is not referenced again in the visible notebook.
text = [" ".join(sentences)]

In [253]:
# One-sentence smoke-test input as a Spark DataFrame with a "text" column.
df1 = spark.createDataFrame(pd.DataFrame({'text':['All the indications we are getting even/.']}))

In [254]:
# Run the fitted pipeline over the smoke-test frame.
result1 = pipeline_model.transform(df1)

In [255]:
# Collect the "sentence" and "ner" annotation columns to the driver as Rows.
df1_result = result1.select("sentence","ner").collect()

In [257]:
# "sentence" annotations of the first collected row.
df1_result[0][0]

[Row(annotatorType='document', begin=0, end=40, result='All the indications we are getting even/.', metadata={'sentence': '0'}, embeddings=[])]

In [258]:
# "ner" annotations of the first collected row; each carries its token in metadata["word"].
df1_result[0][1]

[Row(annotatorType='named_entity', begin=0, end=2, result='O', metadata={'word': 'All'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=4, end=6, result='O', metadata={'word': 'the'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=8, end=18, result='O', metadata={'word': 'indications'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=20, end=21, result='O', metadata={'word': 'we'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=23, end=25, result='O', metadata={'word': 'are'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=27, end=33, result='O', metadata={'word': 'getting'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=35, end=39, result='O', metadata={'word': 'even/'}, embeddings=[]),
 Row(annotatorType='named_entity', begin=40, end=40, result='O', metadata={'word': '.'}, embeddings=[])]

In [33]:
" ".join([])

''

In [252]:
import time
def rebuild_sentence(words, types):
    """Rebuild one sentence's text from its tokens, keeping tokens and tags aligned.

    A token counts as a "word" when its first character is alphanumeric.

    * A word followed by a non-word token is glued to it in the text
      ("parties" + "." -> "parties."); both tokens and both tags are recorded.
    * Any other non-word token is dropped from the text, tokens and tags.
    * Every remaining word is kept unchanged.

    Args:
        words: token strings of one sentence, in order; each must be non-empty.
        types: tag of every token, same length as ``words``.

    Returns:
        ``(text, kept_types, kept_words)``: the space-joined sentence text and
        the tags / tokens that survived. All three are empty when the sentence
        contains no word token.
    """
    pieces, kept_types, kept_words = [], [], []
    last = len(words) - 1
    for i, word in enumerate(words):
        if not word[0].isalnum():
            # Only survives when glued to the word before it (handled below).
            continue
        if i < last and not words[i + 1][0].isalnum():
            pieces.append(word + words[i + 1])
            kept_types.extend([types[i], types[i + 1]])
            kept_words.extend([word, words[i + 1]])
        else:
            pieces.append(word)
            kept_types.append(types[i])
            kept_words.append(word)
    return " ".join(pieces), kept_types, kept_words


def drop_outside(tokens, tags):
    """Return ``(tokens, tags)`` restricted to positions whose tag is not "O"."""
    kept = [(token, tag) for token, tag in zip(tokens, tags) if tag != "O"]
    return [token for token, _ in kept], [tag for _, tag in kept]


# For every token file: rebuild the sentences, run the NER pipeline on each one
# and write gold vs. detected entities to "detect_<name>.csv" in the working
# directory.
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = [str(f) for f in path.rglob("*.*")]
for f in flist:
    print("doing for " + f)
    df = pd.read_csv(f,delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',header=None,names=["Word","POS","DEREP","TYPE","SENT_NO"])
    sentence_groups = df.groupby("SENT_NO")
    print("there are {} sentences".format(sentence_groups.ngroups))

    # --- 1. rebuild text + gold annotations ---------------------------------
    sentences = []
    entities = []
    entities_type = []
    start = time.time()
    for count, (_, group) in enumerate(sentence_groups, start=1):
        if count % 1000 == 0:
            print("Done for {} in {} seconds".format(count, time.time() - start))
        # Columns are turned into lists once per sentence; the token loop in
        # rebuild_sentence never touches the DataFrame.
        sent_text, sent_types, sent_words = rebuild_sentence(
            group["Word"].tolist(), group["TYPE"].tolist()
        )
        if not sent_words:
            # Sentences without any word token produce empty text and are
            # excluded, together with their (empty) gold lists.
            continue
        sentences.append(sent_text)
        entities_type.append(sent_types)
        entities.append(sent_words)

    # --- 2. run the pipeline, one single-row Spark job per sentence ---------
    detected_sent = []
    detected_ner = []
    detected_ner_type = []
    start = time.time()
    for sent in sentences:
        # A separate name keeps the pandas frame `df` from being shadowed.
        sent_df = spark.createDataFrame(pd.DataFrame({'text':[sent]}))
        row = pipeline_model.transform(sent_df).select("sentence","ner").collect()[0]

        # Only the first detected sentence annotation is recorded.
        detected_sent.append([row[0][0].result])
        hits = [(d.metadata["word"], d.result) for d in row[1] if d.result != "O"]
        detected_ner.append([word for word, _ in hits])
        detected_ner_type.append([label for _, label in hits])
    print("Done processing in {} seconds".format(time.time()-start))

    # --- 3. keep only gold tokens that carry an entity tag ------------------
    gold_pairs = [drop_outside(tokens, tags) for tokens, tags in zip(entities, entities_type)]
    entities = [tokens for tokens, _ in gold_pairs]
    entities_type = [tags for _, tags in gold_pairs]

    # --- 4. persist ----------------------------------------------------------
    out_name = "detect_" + pathlib.Path(f).name.split(".")[0] + ".csv"
    pd.DataFrame({
        "Detected_sentence": detected_sent,
        "Actual_ners": entities,
        "Detected_ners": detected_ner,
        "Actual_ner_types": entities_type,
        "Detected_ner_type": detected_ner_type,
    }).to_csv(out_name, index=False)
    
    
    

doing for /Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt
there are 1898 sentences
Done for 1000 in 38.39056992530823 seconds
Done processing in 335.48407888412476 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt
there are 2037 sentences
Done for 1000 in 25.852938890457153 seconds
Done for 2000 in 47.65314197540283 seconds
Done processing in 319.0441451072693 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt
there are 929 sentences
Done processing in 152.57075905799866 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt
there are 780 sentences
Done processing in 129.80695486068726 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt
there are 1217 sentences
Done for 1000 in 20.549272060394287 seconds
Done processing in 180.71434593200684 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt
there are 1252 sentences
Done for 1000 in 28.651089191436768 seconds
Done proc

In [116]:
from nltk.tokenize.treebank import TreebankWordDetokenizer  

In [37]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.9 MB/s eta 0:00:01
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16170 sha256=df7c4b914366fb497c0d7867e83422ee1085daca5bca873202f620bfdba273c4
  Stored in directory: /Users/ramybal/Library/Caches/pip/wheels/39/29/36/1c4f7905c133e11748ca375960154964082d4fb03478323089
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [136]:
# Recursively collect the "detect_*.csv" result files as string paths and show them.
path = pathlib.Path("/Users/ramybal/Desktop/untitled folder/neuroner/")
flist = list(map(str, path.rglob("detect_*.csv")))
flist

['/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner.csv',
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner.csv',
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner.csv',
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner.csv',
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner.csv',
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner.csv',
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner.csv']

In [159]:
# flist1: entries under the token-file folder whose name contains a dot.
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist1 = list(map(str, path.rglob("*.*")))
# flist: detection CSVs; note that `path` is rebound to the results folder here.
path = pathlib.Path("/Users/ramybal/Desktop/untitled folder/neuroner/")
flist = list(map(str, path.rglob("detect*.csv")))

In [154]:
# Pair every token file with its detection CSV by comparing bare file names:
# the CSV name with "detect_" removed must equal the token file name with
# "txt" replaced by "csv".
csv_by_name = {}
for f1 in flist:
    # setdefault keeps the first CSV seen for a given name.
    csv_by_name.setdefault(f1.split("/")[-1].replace("detect_", ""), f1)

mapping = {}
for f in flist1:
    wanted = f.split("/")[-1].replace("txt", "csv")
    if wanted in csv_by_name:
        mapping[f] = csv_by_name[wanted]

In [155]:
# Token file path -> detection CSV path.
mapping

{'/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner.csv',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner.csv',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner.csv',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner.csv',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner.csv',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner.csv',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt': '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner.csv'}

In [156]:
# Scratch check: split a quoted, comma-separated string and strip the quotes.
llist = ["'you','twin'"]
[piece.replace("'", "") for piece in llist[0].split(",")]

['you', 'twin']

In [191]:
# Load the first detection CSV for inspection.
f = flist[0]
# errors="ignore": the stray 'Unnamed: 0' index column disappears once the
# files have been rewritten with index=False further down, so an unconditional
# drop would raise KeyError when this cell is run again.
df = pd.read_csv(f, encoding="utf-8").drop(columns="Unnamed: 0", errors="ignore")

In [192]:
# Preview the raw CSV rows; the list-valued columns are still strings here.
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_type
0,['Iraqi leader Saddam Hussein has given a defi...,"['Iraqi', 'Saddam', 'Hussein', 'tenth', 'the',...","['Iraqi', 'Saddam', 'Hussein', 'tenth', 'the',...","['B-NORP', 'B-PERSON', 'I-PERSON', 'B-ORDINAL'...","['B-NORP', 'B-PERSON', 'I-PERSON', 'B-ORDINAL'..."
1,['He says Iraq has triumphed over the evil of ...,"['Iraq', 'West']","['Iraq', 'West']","['B-GPE', 'B-LOC']","['B-GPE', 'B-LOC']"
2,['Barbara Plett reports from Baghdad.'],"['Barbara', 'Plett', 'Baghdad']","['Barbara', 'Plett', 'Baghdad']","['B-PERSON', 'I-PERSON', 'B-GPE']","['B-PERSON', 'I-PERSON', 'B-GPE']"
3,['Saddam Hussein addressed the nation in a spe...,"['Saddam', 'Hussein']","['Saddam', 'Hussein']","['B-PERSON', 'I-PERSON']","['B-PERSON', 'I-PERSON']"
4,"['Iraq has triumphed over its enemies, he said...",['Iraq'],['Iraq'],['B-GPE'],['B-GPE']


In [204]:
import yaml

# Columns whose cells hold a list serialised as text.
LIST_COLUMNS = ["Detected_ners", "Detected_ner_type", "Actual_ners", "Actual_ner_types"]

# Clean every detection CSV in place: drop the stray index column, parse the
# list columns, and write the file back without an index.
for f in flist:
    df = pd.read_csv(f, encoding="utf-8")
    # errors="ignore" keeps the cell re-runnable: after the first pass the
    # files no longer contain 'Unnamed: 0' (they are saved with index=False).
    df = df.drop(columns="Unnamed: 0", errors="ignore")
    for col in LIST_COLUMNS:
        # Element-wise parse of the textual list into a Python list.
        df[col] = df[col].map(yaml.safe_load)
    # NOTE: overwrites the input file.
    df.to_csv(f, index=False)

In [208]:
# `df` is the last file handled by the loop above, with list columns parsed.
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_type
0,['Iraqi leader Saddam Hussein has given a defi...,"[Iraqi, Saddam, Hussein, tenth, the, Gulf, War]","[Iraqi, Saddam, Hussein, tenth, the, Gulf, War]","[B-NORP, B-PERSON, I-PERSON, B-ORDINAL, B-EVEN...","[B-NORP, B-PERSON, I-PERSON, B-ORDINAL, B-EVEN..."
1,['He says Iraq has triumphed over the evil of ...,"[Iraq, West]","[Iraq, West]","[B-GPE, B-LOC]","[B-GPE, B-LOC]"
2,['Barbara Plett reports from Baghdad.'],"[Barbara, Plett, Baghdad]","[Barbara, Plett, Baghdad]","[B-PERSON, I-PERSON, B-GPE]","[B-PERSON, I-PERSON, B-GPE]"
3,['Saddam Hussein addressed the nation in a spe...,"[Saddam, Hussein]","[Saddam, Hussein]","[B-PERSON, I-PERSON]","[B-PERSON, I-PERSON]"
4,"['Iraq has triumphed over its enemies, he said...",[Iraq],[Iraq],[B-GPE],[B-GPE]


In [211]:
# Detected tag lists, one per row (prints the whole column).
df.Detected_ner_type.tolist()

[['B-NORP',
  'B-PERSON',
  'I-PERSON',
  'B-ORDINAL',
  'B-EVENT',
  'I-EVENT',
  'I-EVENT'],
 ['B-GPE', 'B-LOC'],
 ['B-PERSON', 'I-PERSON', 'B-GPE'],
 ['B-PERSON', 'I-PERSON'],
 ['B-GPE'],
 ['B-NORP', 'B-EVENT', 'I-EVENT', 'I-EVENT'],
 ['B-GPE'],
 [],
 [],
 ['B-PERSON', 'I-PERSON', 'B-ORG', 'I-ORG', 'I-ORG'],
 ['B-ORG', 'I-ORG'],
 ['B-PERSON', 'I-PERSON'],
 ['B-PERSON',
  'I-PERSON',
  'B-GPE',
  'B-DATE',
  'I-DATE',
  'B-NORP',
  'B-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART'],
 ['B-DATE',
  'B-PERSON',
  'B-GPE',
  'B-WORK_OF_ART',
  'B-DATE',
  'B-DATE',
  'I-DATE',
  'I-DATE'],
 ['B-PERSON',
  'B-DATE',
  'B-WORK_OF_ART',
  'I-WORK_OF_ART',
  'B-ORDINAL',
  'B-GPE'],
 ['B-PERSON', 'I-PERSON', 'B-PERSON', 'I-PERSON', 'B-PERSON'],
 ['B-ORDINAL'],
 ['B-GPE', 'I-GPE', 'I-GPE', 'B-DATE', 'I-DATE'],
 [],
 ['B-CARDINAL'],
 ['B-NORP'],
 [],
 [],
 ['B-DATE'],
 ['B-ORG', 'I-ORG', 'I-ORG'],
 ['B-CARDINAL',
  'B-EVENT',
  'I-EVENT',
  'I-EVENT',


In [213]:
# Gold tag lists, one per row, for side-by-side comparison with the detected ones.
df.Actual_ner_types.tolist()

[['B-NORP',
  'B-PERSON',
  'I-PERSON',
  'B-ORDINAL',
  'B-EVENT',
  'I-EVENT',
  'I-EVENT'],
 ['B-GPE', 'B-LOC'],
 ['B-PERSON', 'I-PERSON', 'B-GPE'],
 ['B-PERSON', 'I-PERSON'],
 ['B-GPE'],
 ['B-NORP', 'B-EVENT', 'I-EVENT', 'I-EVENT'],
 ['B-GPE'],
 [],
 [],
 ['B-PERSON', 'I-PERSON', 'B-ORG', 'I-ORG', 'B-GPE'],
 ['B-WORK_OF_ART', 'I-WORK_OF_ART'],
 ['B-PERSON', 'I-PERSON'],
 ['B-PERSON',
  'I-PERSON',
  'B-GPE',
  'B-DATE',
  'I-DATE',
  'B-NORP',
  'B-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART'],
 ['B-PERSON', 'B-GPE', 'B-WORK_OF_ART', 'B-DATE'],
 ['B-PERSON',
  'B-DATE',
  'B-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'B-GPE'],
 ['B-PERSON', 'I-PERSON', 'B-PERSON', 'I-PERSON', 'B-PERSON'],
 ['B-ORDINAL'],
 ['B-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'I-WORK_OF_ART',
  'B-GPE',
  'I-GPE',
  'I-GPE',
  'B-DATE',
  'I-DATE'],
 [],
 ['B-CARDINAL'],
 ['B-NORP'],
 [],
 [],
 ['B-DATE'],
 ['B-EVENT', 'I-

In [216]:
# Per file: count the rows whose gold and detected tag lists have the same
# length, and remember the row positions where the lengths differ.
# Only the lengths are compared here, not the tags themselves.
correct_counts_dict = {}
mismatch_counts_dict = {}
for f in flist:
    df = pd.read_csv(f, encoding="utf-8")
    for col in ("Detected_ners", "Detected_ner_type", "Actual_ners", "Actual_ner_types"):
        df[[col]] = df[[col]].applymap(yaml.safe_load)
    row_pairs = zip(df["Actual_ner_types"], df["Detected_ner_type"])
    mismatch_indices = [
        i for i, (gold, found) in enumerate(row_pairs) if len(gold) != len(found)
    ]
    count_correct = df.shape[0] - len(mismatch_indices)
    correct_counts_dict[f] = count_correct
    mismatch_counts_dict[f] = mismatch_indices

In [217]:
# Rows per file whose gold and detected tag lists have equal length.
correct_counts_dict

{'/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner.csv': 1012,
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner.csv': 1243,
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner.csv': 1205,
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner.csv': 679,
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner.csv': 1835,
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner.csv': 580,
 '/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner.csv': 679}

In [221]:
# Per file: how many rows had tag lists of different lengths.
for k, row_positions in mismatch_counts_dict.items():
    print(k)
    print(len(row_positions))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner.csv
240
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner.csv
83
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner.csv
693
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner.csv
244
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner.csv
194
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner.csv
199
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner.csv
538


In [247]:
def multicount(searchlist,target):
    """Return every position in ``searchlist`` that matches ``target``, ignoring case.

    Args:
        searchlist: sequence of strings to scan.
        target: string to look for; compared via ``.lower()`` on both sides.

    Returns:
        List of matching indices in ascending order (empty when nothing matches).
    """
    return [
        position
        for position, item in enumerate(searchlist)
        if item.lower() == target.lower()
    ]

# Token-level scoring of detected entities against the gold ones, per file.
# For every gold token:
#   * not detected at all                  -> missed
#   * detected exactly once                -> correct if the tags agree, else wrong
#   * detected several times               -> correct if the gold tag equals the
#     tag of any matching detection; otherwise wrong when there are at least as
#     many detections as gold occurrences, missed when there are fewer.
# NOTE: this rebinds correct_counts_dict, which an earlier cell filled with
# per-file row counts.
# NOTE(review): occurrences are counted case-sensitively (list.count) while
# multicount() matches case-insensitively - confirm this is intended.
wrong_counts_dict = {}
missed_counts_dict = {}
correct_counts_dict = {}
for k in mismatch_counts_dict.keys():
    df = pd.read_csv(k, encoding="utf-8")
    for col in ("Detected_ners", "Detected_ner_type", "Actual_ners", "Actual_ner_types"):
        df[[col]] = df[[col]].applymap(yaml.safe_load)
    wrong_count = missed_count = correct_count = 0
    for ind in range(df.shape[0]):
        row = df.iloc[ind, :]
        actual, detect = row.Actual_ners, row.Detected_ners
        actual_type, detect_type = row.Actual_ner_types, row.Detected_ner_type
        for i, a in enumerate(actual):
            times_detected = detect.count(a)
            if times_detected == 0:
                missed_count += 1
            elif times_detected == 1:
                if actual_type[i] == detect_type[detect.index(a)]:
                    correct_count += 1
                else:
                    wrong_count += 1
            else:
                candidate_types = [detect_type[l] for l in multicount(detect, a)]
                if actual_type[i] in candidate_types:
                    correct_count += 1
                elif times_detected >= actual.count(a):
                    wrong_count += 1
                else:
                    missed_count += 1
    wrong_counts_dict[k] = wrong_count
    missed_counts_dict[k] = missed_count
    correct_counts_dict[k] = correct_count
            

In [248]:
# Per file, print in order: path, wrong count, missed count, correct count.
separator = "-----------------------------------------------------"
for k in mismatch_counts_dict:
    print(k)
    print(wrong_counts_dict[k])
    print(missed_counts_dict[k])
    print(correct_counts_dict[k])
    print(separator)

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner.csv
179
684
2615
-----------------------------------------------------
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner.csv
65
133
406
-----------------------------------------------------
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner.csv
711
2977
5841
-----------------------------------------------------
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner.csv
141
650
1083
-----------------------------------------------------
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner.csv
206
568
2501
-----------------------------------------------------
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner.csv
129
549
1453
-----------------------------------------------------
/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner.csv
0
0
0
-----------------------------------------------------


In [241]:
# Look at the "pt" detection file on its own: its wrong / missed / correct
# counts printed above are all 0.
f = "/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner.csv"

In [249]:
# Reload the selected file and parse its list-valued columns from text.
df = pd.read_csv(f, encoding="utf-8")
for col in ("Detected_ners", "Detected_ner_type", "Actual_ners", "Actual_ner_types"):
    df[[col]] = df[[col]].applymap(yaml.safe_load)
    

In [250]:
# (rows, columns) of the reloaded detection frame.
df.shape

(1217, 5)

In [251]:
# Preview the parsed rows to compare the gold and detected columns.
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_type
0,"['After Jesus said all these things, he left G...",[],"[Jesus, Galilee]",[],"[B-PERSON, B-GPE]"
1,['He went into the area of Judea on the other ...,[],"[Judea, the, Jordan, River]",[],"[B-GPE, B-LOC, I-LOC, I-LOC]"
2,['Many people followed him.'],[],[],[],[]
3,['Jesus healed the sick people there.'],[],[Jesus],[],[B-PERSON]
4,['Some Pharisees came to Jesus.'],[],"[Pharisees, Jesus]",[],"[B-NORP, B-PERSON]"
