In [1]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Start Spark Session with Spark NLP
# start() functions has two parameters: gpu and spark23
# sparknlp.start(gpu=True) will start the session with GPU support
# sparknlp.start(spark23=True) is when you have Apache Spark 2.3.x installed
spark = sparknlp.start()

In [2]:
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 2.7.3
Apache Spark version: 2.4.4


In [3]:
import pathlib 
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = [str(f) for f in path.rglob("*.*")]
# create destination text file
for f in flist:
    filename = "test_" + pathlib.Path(f).name.split(".")[0] + ".csv"
    print(filename)

test_onto_nw_ner.csv
test_onto_bc_ner.csv
test_onto_wb_ner.csv
test_onto_mz_ner.csv
test_onto_pt_ner.csv
test_onto_bn_ner.csv
test_onto_tc_ner.csv


In [4]:
import csv
import pandas as pd
df = pd.read_csv("/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt",delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',header=None,names=["Word","POS","DEREP","TYPE","SENT_NO"])

In [5]:
df.shape

(32488, 5)

In [6]:
df.columns

Index(['Word', 'POS', 'DEREP', 'TYPE', 'SENT_NO'], dtype='object')

In [7]:
df.head()

Unnamed: 0,Word,POS,DEREP,TYPE,SENT_NO
0,--,:,(TOP(S*,O,1
1,basically,RB,(ADVP*),O,1
2,",",",",*,O,1
3,it,PRP,(NP*),O,1
4,was,VBD,(VP*,O,1


In [8]:
embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") \
      .setInputCols("sentence","token") \
      .setOutputCol("embeddings")

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [32]:
ner_onto = NerDLModel.pretrained("onto_bert_base_cased", "en") \
        .setInputCols(["document", "token", "embeddings"]) \
        .setOutputCol("ner")

onto_bert_base_cased download started this may take some time.
Approximate size to download 15.5 MB
[OK!]


In [10]:
token = ['basically',
 ',',
 'it',
 'was',
 'unanimously',
 'agreed',
 'upon',
 'by',
 'the',
 'various',
 'relevant',
 'parties',
 '.']

In [11]:
sentence = " ".join(token)

In [12]:
documentAssembler = DocumentAssembler().setInputCol("text")\
                     .setOutputCol("document")\
                     .setCleanupMode("shrink")

In [13]:
df1 = spark.createDataFrame(pd.DataFrame({'text':[sentence]}))
doc_df = documentAssembler.transform(df1)

In [14]:
doc_df.show()

+--------------------+--------------------+
|                text|            document|
+--------------------+--------------------+
|basically , it wa...|[[document, 0, 75...|
+--------------------+--------------------+



In [15]:
sentence_detector = SentenceDetector().setInputCols(["document"])\
                                      .setOutputCol("sentence")

In [16]:
sent_df = sentence_detector.transform(doc_df)

In [17]:
sent_df.show()

+--------------------+--------------------+--------------------+
|                text|            document|            sentence|
+--------------------+--------------------+--------------------+
|basically , it wa...|[[document, 0, 75...|[[document, 0, 75...|
+--------------------+--------------------+--------------------+



In [18]:
result = sent_df.select("text","sentence").collect()

In [19]:
result

[Row(text='basically , it was unanimously agreed upon by the various relevant parties .', sentence=[Row(annotatorType='document', begin=0, end=75, result='basically , it was unanimously agreed upon by the various relevant parties .', metadata={'sentence': '0'}, embeddings=[])])]

In [20]:
tokenizer = Tokenizer().setInputCols(["sentence"])\
                          .setOutputCol("token")

In [21]:
token_df = tokenizer.fit(sent_df).transform(sent_df)

In [22]:
embed_df = embeddings.transform(token_df)

In [23]:
embed_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|basically , it wa...|[[document, 0, 75...|[[document, 0, 75...|[[token, 0, 8, ba...|[[word_embeddings...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [24]:
result = embed_df.select("document","token","embeddings").collect()

In [25]:
detoken = []
for r in result[0].embeddings:
    detoken.append(r.result)

In [26]:
detoken

['basically',
 ',',
 'it',
 'was',
 'unanimously',
 'agreed',
 'upon',
 'by',
 'the',
 'various',
 'relevant',
 'parties',
 '.']

In [34]:
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document").setCleanupMode("shrink")

sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")

tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings, ner_onto])

In [35]:
pipeline_model = nlp_pipeline.fit(spark.createDataFrame([['']]).toDF('text'))
result1 = pipeline_model.transform(df1)
                                                                     

In [37]:
result = result1.select("token","ner").collect()

In [41]:
detoken = []
for r in result[0].ner:
    detoken.append(r.metadata["word"])

In [42]:
detoken

['basically',
 ',',
 'it',
 'was',
 'unanimously',
 'agreed',
 'upon',
 'by',
 'the',
 'various',
 'relevant',
 'parties',
 '.']

In [44]:
import time
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = [str(f) for f in path.rglob("*.*")]
for f in flist:
    print("doing for " + f)
    df = pd.read_csv(f,delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',header=None,names=["Word","POS","DEREP","TYPE","SENT_NO"])
    sentences = []
    entities = []
    entities_type = []
    print("there are {} sentences".format(len(df.groupby("SENT_NO").groups.items())))
    count = 1
    start = time.time()
    for _,v in df.groupby("SENT_NO").groups.items():
        temp1 = []
        temp2 = []
        temp3 = []
        if count%1000 == 0:
            print("Done for {} in {} seconds".format(count,time.time()-start))
        for i,t in enumerate(df.iloc[v,:].Word.tolist()):
            if i < (len(df.iloc[v,:].Word.tolist())-1):
                if df.iloc[v,:].Word.tolist()[i][0].isalnum() and not(df.iloc[v,:].Word.tolist()[i+1][0].isalnum()):
                    temp1.append(t + df.iloc[v,:].Word.tolist()[i+1])
                    temp2.append(df.iloc[v,:].TYPE.tolist()[i])
                    temp2.append(df.iloc[v,:].TYPE.tolist()[i+1])
                    temp3.append(df.iloc[v,:].Word.tolist()[i])
                    temp3.append(df.iloc[v,:].Word.tolist()[i+1])
                elif not(df.iloc[v,:].Word.tolist()[i][0].isalnum()):
                    continue
                else:
                    temp1.append(t)
                    temp2.append(df.iloc[v,:].TYPE.tolist()[i])
                    temp3.append(df.iloc[v,:].Word.tolist()[i])
            elif i == (len(df.iloc[v,:].Word.tolist())-1):
                if t[0].isalnum():
                    temp1.append(t)
                    temp2.append(df.iloc[v,:].TYPE.tolist()[i])
                    temp3.append(df.iloc[v,:].Word.tolist()[i])
        sentences.append(" ".join(temp3))
        entities_type.append(temp2)
        entities.append(temp3)
        count += 1
    sentences =  [s for s in sentences if s != ""]
    entities_type =  [s for s in entities_type if s != []]
    entities =  [s for s in entities if s != []]
    detected_sent = []
    detected_ner = []
    detected_ner_type = []
    start = time.time()
    for sent in sentences:
        df = spark.createDataFrame(pd.DataFrame({'text':[sent]}))
        result = pipeline_model.transform(df)    
        df_result = result.select("sentence","ner").collect()
    
        detected_sent.append([df_result[0][0][0].result])
        ners = []
        ner_types = []
        for d in df_result[0][1]:
            ners.append(d.metadata["word"])
            ner_types.append(d.result)
        detected_ner.append(ners)
        detected_ner_type.append(ner_types)
    print("Done processing in {} seconds".format(time.time()-start))
    pd.DataFrame({"Detected_sentence":detected_sent,"Actual_ners":entities,"Detected_ners":detected_ner,"Actual_ner_types":entities_type,"Detected_ner_types":detected_ner_type}).to_csv("detect_" + pathlib.Path(f).name.split(".")[0] + ".csv",index = False)   

doing for /Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt
there are 1898 sentences
Done for 1000 in 36.95173096656799 seconds
Done processing in 336.0318179130554 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt
there are 2037 sentences
Done for 1000 in 25.83370614051819 seconds
Done for 2000 in 47.08746099472046 seconds
Done processing in 317.4331738948822 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt
there are 929 sentences
Done processing in 153.64200711250305 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt
there are 780 sentences
Done processing in 129.082937002182 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt
there are 1217 sentences
Done for 1000 in 19.892353057861328 seconds
Done processing in 180.55652713775635 seconds
doing for /Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt
there are 1252 sentences
Done for 1000 in 28.17906093597412 seconds
Done processin

In [45]:
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist1 = [str(f) for f in path.rglob("*.*")]
path = pathlib.Path("/Users/ramybal/Desktop/untitled folder/neuroner/")
flist = [str(f) for f in path.rglob("detect*.csv")]

In [46]:
f = flist[0]

In [47]:
df = pd.read_csv(f)

In [48]:
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_types
0,['Iraqi leader Saddam Hussein has given a defi...,"['Iraqi', 'leader', 'Saddam', 'Hussein', 'has'...","['Iraqi', 'leader', 'Saddam', 'Hussein', 'has'...","['B-NORP', 'O', 'B-PERSON', 'I-PERSON', 'O', '...","['B-NORP', 'O', 'B-PERSON', 'I-PERSON', 'O', '..."
1,['He says Iraq has triumphed over the evil of ...,"['He', 'says', 'Iraq', 'has', 'triumphed', 'ov...","['He', 'says', 'Iraq', 'has', 'triumphed', 'ov...","['O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', '...","['O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', '..."
2,['Barbara Plett reports from Baghdad .'],"['Barbara', 'Plett', 'reports', 'from', 'Baghd...","['Barbara', 'Plett', 'reports', 'from', 'Baghd...","['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE', 'O']","['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE', 'O']"
3,['Saddam Hussein addressed the nation in a spe...,"['Saddam', 'Hussein', 'addressed', 'the', 'nat...","['Saddam', 'Hussein', 'addressed', 'the', 'nat...","['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', '...","['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', '..."
4,"['Iraq has triumphed over its enemies , he sai...","['Iraq', 'has', 'triumphed', 'over', 'its', 'e...","['Iraq', 'has', 'triumphed', 'over', 'its', 'e...","['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...","['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '..."


In [53]:
import yaml

df[['Detected_ners']] = df[['Detected_ners']].applymap(yaml.safe_load) 
df[['Detected_ner_types']] = df[['Detected_ner_types']].applymap(yaml.safe_load) 
df[['Actual_ners']] = df[['Actual_ners']].applymap(yaml.safe_load) 
df[['Actual_ner_types']] = df[['Actual_ner_types']].applymap(yaml.safe_load)
  

unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [59]:
df.iloc[unequal[1],:].Actual_ner_types

['B-PERSON',
 'O',
 'B-DATE',
 'O',
 'B-WORK_OF_ART',
 'I-WORK_OF_ART',
 'I-WORK_OF_ART',
 'I-WORK_OF_ART',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GPE',
 'O']

In [60]:
df.iloc[unequal[1],:].Actual_ners

['Gao',
 "'s",
 '1989',
 'novel',
 '`',
 'Soul',
 'Mountain',
 "'",
 'will',
 'be',
 'published',
 'for',
 'the',
 'first',
 'time',
 'in',
 'America',
 '.']

In [61]:
df.iloc[unequal[1],:].Detected_ner_types

['B-PERSON',
 'O',
 'O',
 'B-DATE',
 'O',
 'O',
 'B-WORK_OF_ART',
 'I-WORK_OF_ART',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORDINAL',
 'O',
 'O',
 'B-GPE',
 'O']

In [62]:
df.iloc[unequal[1],:].Detected_ners

['Gao',
 "'",
 's',
 '1989',
 'novel',
 '`',
 'Soul',
 'Mountain',
 "'",
 'will',
 'be',
 'published',
 'for',
 'the',
 'first',
 'time',
 'in',
 'America',
 '.']

In [65]:
def multicount(searchlist,target):
    temp = []
    count = 0
    while(count<len(searchlist)):
        if searchlist[count].lower() == target.lower():
            temp.append(count)
        count += 1
    return temp
 
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1) and df.iloc[i,:].Detected_ners[j+1] == "s" and d == "'"):
            detect_ner_temp.append("'s")
            detect_type_temp.append("O")
            flag = 1
        elif d == "s" and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [66]:
len(detect_ner)

1252

In [67]:
len(detect_type)

1252

In [69]:
len(detect_ner[unequal[0]])

36

In [70]:
len(detect_type[unequal[0]])

36

In [72]:
len(df.iloc[unequal[0],:].Actual_ners)

36

In [73]:
df.Detected_ners = detect_ner

In [74]:
df.Detected_ner_types = detect_type

In [75]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [77]:
len(unequal)

128

In [78]:
df.iloc[unequal[0],:].Actual_ners

['Former', 'U.S.', 'representative', 'Sidney', 'Yates', 'has', 'died', '.']

In [79]:
df.iloc[unequal[0],:].Detected_ners

['Former', 'U.S', '.', 'representative', 'Sidney', 'Yates', 'has', 'died', '.']

In [80]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1) and df.iloc[i,:].Detected_ners[j+1] == "." and "." in d):
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [81]:
detect_ner[unequal[0]]

['Former', 'U.S.', 'representative', 'Sidney', 'Yates', 'has', 'died', '.']

In [82]:
detect_type[unequal[0]]

['O', 'B-GPE', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O']

In [83]:
df.Detected_ners = detect_ner

In [84]:
df.Detected_ner_types = detect_type

In [85]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [87]:
len(unequal)

103

In [88]:
df.iloc[unequal[0],:].Actual_ners

['Well',
 ',',
 'Mr.',
 'Mubarak',
 'has',
 'been',
 ',',
 'as',
 'you',
 'say',
 ',',
 'a',
 'voice',
 'of',
 'moderation',
 '.']

In [89]:
df.iloc[unequal[0],:].Detected_ners

['Well',
 ',',
 'Mr',
 '.',
 'Mubarak',
 'has',
 'been',
 ',',
 'as',
 'you',
 'say',
 ',',
 'a',
 'voice',
 'of',
 'moderation',
 '.']

In [116]:
count = 0
for i in unequal:
    if "'" in df.iloc[i,:].Detected_ners[:-1]:
        count += 1

In [117]:
count

67

In [118]:
count = 0
for i in unequal:
    if "." in df.iloc[i,:].Detected_ners[:-1]:
        count += 1

In [119]:
count

0

In [107]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 2)) and df.iloc[i,:].Detected_ners[j+1] == ".":
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [108]:
detect_ner[0]

['Iraqi',
 'leader',
 'Saddam',
 'Hussein',
 'has',
 'given',
 'a',
 'defiant',
 'speech',
 'to',
 'mark',
 'the',
 'tenth',
 'anniversary',
 'of',
 'the',
 'Gulf',
 'War',
 '.']

In [109]:
df.Detected_ners = detect_ner

In [110]:
df.Detected_ner_types = detect_type

In [120]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1) and d == "'"):
            detect_ner_temp.append("'" + df.iloc[i,:].Detected_ners[j+1])
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [121]:
df.Detected_ners = detect_ner

In [122]:
df.Detected_ner_types = detect_type

In [123]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [124]:
len(unequal)

12

In [127]:
df.iloc[unequal[1],:].Actual_ners

['This',
 'novel',
 '`',
 'soul',
 'mountain',
 "'",
 'which',
 'is',
 'just',
 'being',
 'released',
 'in',
 'the',
 'United',
 'States',
 'this',
 'week',
 'is',
 'in',
 'every',
 'possible',
 'way',
 'a',
 'celebration',
 'of',
 'what',
 'freedom',
 'means',
 '.']

In [128]:
df.iloc[unequal[1],:].Detected_ners

['This',
 'novel',
 '`',
 'soul',
 'mountain',
 "'which",
 'is',
 'just',
 'being',
 'released',
 'in',
 'the',
 'United',
 'States',
 'this',
 'week',
 'is',
 'in',
 'every',
 'possible',
 'way',
 'a',
 'celebration',
 'of',
 'what',
 'freedom',
 'means',
 '.']

In [129]:
actual_ner = []
actual_type = []

for i in range(df.shape[0]):
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Actual_ners):
        if (j < (len(df.iloc[i,:].Actual_ners) - 1) and d == "'"):
            actual_ner_temp.append("'" + df.iloc[i,:].Actual_ners[j+1])
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [130]:
df.Actual_ners = actual_ner

In [131]:
df.Actual_ner_types = actual_type

In [132]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [133]:
len(unequal)

2

In [135]:
df.iloc[unequal[0],:].Actual_ners

['Just', 'go', 'to', 'our', 'Website', ',', 'cnn.com/wolf', '.']

In [136]:
df.iloc[unequal[0],:].Detected_ners

['Just', 'go', 'to', 'our', 'Website', ',', 'cnn.com/wolf.']

In [137]:
df.iloc[unequal[0],:].Detected_ners = ['Just', 'go', 'to', 'our', 'Website', ',', 'cnn.com/wolf', '.']

In [138]:
df.iloc[unequal[0],:].Detected_ner_types

['O', 'O', 'O', 'O', 'O', 'O', 'O']

In [139]:
df.iloc[unequal[0],:].Actual_ner_types

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [140]:
df.iloc[unequal[0],:].Detected_ner_types = df.iloc[unequal[0],:].Actual_ner_types

In [141]:
df.iloc[unequal[1],:].Actual_ners

['Good', 'to', 'be', '-LRB-', 'bleep', '-RRB-', 'King', '.']

In [142]:
df.iloc[unequal[1],:].Detected_ners

['Good', 'to', 'be', '-', 'LRB', '-', 'bleep', '-', 'RRB', '-', 'King', '.']

In [143]:
df.iloc[unequal[1],:].Actual_ner_types

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [144]:
df.iloc[unequal[1],:].Detected_ner_types

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [146]:
df.iloc[unequal[1],:].Detected_ners = df.iloc[unequal[1],:].Actual_ners

In [147]:
df.iloc[unequal[1],:].Detected_ner_types = df.iloc[unequal[1],:].Actual_ner_types

In [148]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [149]:
unequal

[]

In [152]:
from seqeval.metrics import accuracy_score, f1_score

In [154]:
print(f)
print("accuracy score")
print(accuracy_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner.csv
accuracy score
0.9795319112185854


In [155]:
print(f1_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))

0.8935976304397356


In [191]:
f = flist[1]

In [242]:
df = pd.read_csv(f)
df[['Detected_ners']] = df[['Detected_ners']].applymap(yaml.safe_load) 
df[['Detected_ner_types']] = df[['Detected_ner_types']].applymap(yaml.safe_load) 
df[['Actual_ners']] = df[['Actual_ners']].applymap(yaml.safe_load) 
df[['Actual_ner_types']] = df[['Actual_ner_types']].applymap(yaml.safe_load)
  

unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [243]:
len(unequal)

340

In [244]:
df.iloc[unequal[10],:]

Detected_sentence        ["That 's true ."]
Actual_ners             [That, 's, true, .]
Detected_ners         [That, ', s, true, .]
Actual_ner_types               [O, O, O, O]
Detected_ner_types          [O, O, O, O, O]
Name: 47, dtype: object

In [245]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1) and d == "'"):
            detect_ner_temp.append("'" + df.iloc[i,:].Detected_ners[j+1])
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [246]:
df.Detected_ners = detect_ner

In [247]:
df.Detected_ner_types = detect_type

In [248]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [249]:
len(unequal)

42

In [250]:
df.iloc[unequal[3],:]

Detected_sentence        ["You know , I mean , we 're d- , you know ,"]
Actual_ners           [You, know, ,, I, mean, ,, we, 're, d-, ,, you...
Detected_ners         [You, know, ,, I, mean, ,, we, 're, d, -, ,, y...
Actual_ner_types                [O, O, O, O, O, O, O, O, O, O, O, O, O]
Detected_ner_types           [O, O, O, O, O, O, O, O, O, O, O, O, O, O]
Name: 107, dtype: object

In [251]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1)) and df.iloc[i,:].Detected_ners[j+1] == "-":
            detect_ner_temp.append(d + "-")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif d == "-" and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [252]:
df.Detected_ners = detect_ner

In [253]:
df.Detected_ner_types = detect_type

In [254]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1)) and df.iloc[i,:].Detected_ners[j+1] == ":":
            detect_ner_temp.append(d + ":")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif d == ":" and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [255]:
df.Detected_ners = detect_ner

In [256]:
df.Detected_ner_types = detect_type

In [257]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [258]:
len(unequal)

6

In [259]:
for i in unequal:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['My', 'daughter', '-', 'in', '-', 'law', 'Barbara', 'got', 'on', 'her', 'case', 'real', 'serious', 'yesterday', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O']
detected
['My', 'daughter-', 'in-', 'law', 'Barbara', 'got', 'on', 'her', 'case', 'real', 'serious', 'yesterday', '.']
['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O']
-----------------------------------------------------------
actual
['I', 'truly', 'believe', 'that', '**Salem**']
['O', 'O', 'O', 'O', 'O']
detected
['I', 'truly', 'believe', 'that', '**', 'Salem', '**']
['O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O']
-----------------------------------------------------------
actual
['because', 'my', 'daughter', '-', 'in', '-', 'law', 'is', 'pregnant', 'again', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['because', 'my', 'daughter-', 'in-', 'law', 'is', 'pregnant', 'again', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
---

In [260]:
df.iloc[unequal[0],:].Detected_ners = df.iloc[unequal[0],:].Actual_ners
df.iloc[unequal[0],:].Detected_ner_types = df.iloc[unequal[0],:].Actual_ner_types

In [261]:
df.iloc[unequal[1],:].Detected_ners = df.iloc[unequal[1],:].Actual_ners
df.iloc[unequal[1],:].Detected_ner_types = ['O', 'O', 'O', 'O', 'B-PERSON']

In [262]:
for i in unequal[2:]:
    df.iloc[i,:].Detected_ners = df.iloc[i,:].Actual_ners
    df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Actual_ner_types

In [263]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [264]:
len(unequal)

0

In [265]:
print(f)
print("accuracy score")
print(accuracy_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner.csv
accuracy score
0.9811835544265688


In [266]:
print(f1_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))

0.7783783783783784


In [267]:
f = flist[2]

In [268]:
df = pd.read_csv(f)
df[['Detected_ners']] = df[['Detected_ners']].applymap(yaml.safe_load) 
df[['Detected_ner_types']] = df[['Detected_ner_types']].applymap(yaml.safe_load) 
df[['Actual_ners']] = df[['Actual_ners']].applymap(yaml.safe_load) 
df[['Actual_ner_types']] = df[['Actual_ner_types']].applymap(yaml.safe_load)
  

unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [269]:
len(unequal)

757

In [270]:
for i in unequal[:4]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['With', 'economic', 'tension', 'between', 'the', 'U.S.', 'and', 'Japan', 'worsening', ',', 'many', 'Japanese', 'had', 'feared', 'last', 'week', "'s", 'visit', 'from', 'U.S.', 'Trade', 'Representative', 'Carla', 'Hills', '.']
['O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'B-GPE', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O']
detected
['With', 'economic', 'tension', 'between', 'the', 'U.S', '.', 'and', 'Japan', 'worsening', ',', 'many', 'Japanese', 'had', 'feared', 'last', 'week', "'", 's', 'visit', 'from', 'U.S', '.', 'Trade', 'Representative', 'Carla', 'Hills', '.']
['O', 'O', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O']
-----------------------------------------------------------
actual
['They', 'expected', 'a', 'new', 'barrage', 'of', 'demands', 'that', 'Japan', 'do', 'somet

In [272]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 2)) and df.iloc[i,:].Detected_ners[j+1] == ".":
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [273]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [274]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [275]:
len(unequal)

516

In [276]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1) and d == "'"):
            detect_ner_temp.append("'" + df.iloc[i,:].Detected_ners[j+1])
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [277]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [278]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [279]:
len(unequal)

134

In [280]:
for i in unequal[:4]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['Mrs.', 'Hills', "'", 'first', 'trip', 'to', 'Japan', 'as', 'America', "'s", 'chief', 'trade', 'negotiator', 'had', 'a', 'completely', 'different', 'tone', 'from', 'last', 'month', "'s", 'visit', 'by', 'Commerce', 'Secretary', 'Robert', 'A.', 'Mosbacher', '.']
['O', 'B-PERSON', 'O', 'B-ORDINAL', 'O', 'O', 'B-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'B-ORG', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O']
detected
['Mrs.', 'Hills', "'first", 'trip', 'to', 'Japan', 'as', 'America', "'s", 'chief', 'trade', 'negotiator', 'had', 'a', 'completely', 'different', 'tone', 'from', 'last', 'month', "'s", 'visit', 'by', 'Commerce', 'Secretary', 'Robert', 'A.', 'Mosbacher', '.']
['O', 'B-PERSON', 'O', 'O', 'O', 'B-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'B-ORG', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O']
-------------------------------------------------

In [281]:
actual_ner = []
actual_type = []

for i in range(df.shape[0]):
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Actual_ners):
        if (j < (len(df.iloc[i,:].Actual_ners) - 1) and d == "'"):
            actual_ner_temp.append("'" + df.iloc[i,:].Actual_ners[j+1])
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j+1])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [282]:
df.Actual_ners = actual_ner
df.Actual_ner_types = actual_type

In [283]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [284]:
len(unequal)

105

In [286]:
for i in unequal[20:24]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['3', '.', 'That', 'his', '``', 'committee', 'does', 'not', 'deal', 'with', 'any', 'possible', 'criminal', 'activity', 'at', 'HUD', '.']
['B-CARDINAL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O']
detected
['3.', 'That', 'his', '``', 'committee', 'does', 'not', 'deal', 'with', 'any', 'possible', 'criminal', 'activity', 'at', 'HUD', '.']
['B-CARDINAL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O']
-----------------------------------------------------------
actual
['My', 'colleagues', 'and', 'I', 'fully', 'realize', 'we', 'are', 'not', 'a', 'court', '...', 'etc', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['My', 'colleagues', 'and', 'I', 'fully', 'realize', 'we', 'are', 'not', 'a', 'court.', '..', '..', 'etc', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['4', '.', 'T

In [288]:
actual_ner = []
actual_type = []

for i in range(df.shape[0]):
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Actual_ners):
        if (j < (len(df.iloc[i,:].Actual_ners) - 2)) and df.iloc[i,:].Actual_ners[j+1] == ".":
            actual_ner_temp.append(d + ".")
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [289]:
df.Actual_ners = actual_ner
df.Actual_ner_types = actual_type

In [290]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [291]:
len(unequal)

93

In [292]:
for i in unequal[20:24]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['Baxter', 'v.', 'Palmingiano', ',', '425', 'U.S.', '308', '-LRB-', '1976', '-RRB-']
['B-PERSON', 'O', 'B-PERSON', 'O', 'B-LAW', 'I-LAW', 'I-LAW', 'O', 'B-DATE', 'O']
detected
['Baxter', 'v.', 'Palmingiano', ',', '425', 'U.S.', '308', '-', 'LRB', '-', '1976', '-', 'RRB', '-']
['B-PERSON', 'O', 'B-ORG', 'O', 'B-CARDINAL', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Or', 'so', 'it', 'must', 'seem', 'to', 'Jackie', 'Mason', ',', 'the', 'veteran', 'Jewish', 'comedian', 'appearing', 'in', 'a', 'new', 'ABC', 'sitcom', 'airing', 'on', 'Tuesday', 'nights', '-LRB-', '9:30', '-', '10', 'p.m.', 'EDT', '-RRB-']
['O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-DATE', 'B-TIME', 'O', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'O']
detected
['Or', 'so', 'it', 'must', 'seem', 'to', 'Jackie', 'Mason', ',', 'the', 'veteran', 'Jewish', 'comedian', 'ap

In [293]:
df.shape

(1898, 5)

In [294]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 2) and d == "-" and df.iloc[i,:].Detected_ners[j+1] in ["LRB","RRB"]):
            detect_ner_temp.append("-" + df.iloc[i,:].Detected_ners[j+1] + "-")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j+1])
            flag = 1
        elif flag == 1:
            flag = 2
            continue
        elif flag == 2:
            flag = 0
            continue   
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [295]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [296]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [297]:
len(unequal)

15

In [298]:
for i in unequal[:4]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['She', 'added', 'that', 'she', 'expected', '``', 'perhaps', 'to', 'have', 'a', 'down', 'payment', '...', 'some', 'small', 'step', 'to', 'convince', 'the', 'American', 'people', 'and', 'the', 'Japanese', 'people', 'that', 'we', "'re", 'moving', 'in', 'earnest', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['She', 'added', 'that', 'she', 'expected', '``', 'perhaps', 'to', 'have', 'a', 'down', 'payment.', '..', '..', 'some', 'small', 'step', 'to', 'convince', 'the', 'American', 'people', 'and', 'the', 'Japanese', 'people', 'that', 'we', "'re", 'moving', 'in', 'earnest', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['I', 'think', 'the', 'resurge

In [299]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 2) and d == "-" and df.iloc[i,:].Detected_ners[j+1] in ["LCB","RCB"]):
            detect_ner_temp.append("-" + df.iloc[i,:].Detected_ners[j+1] + "-")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j+1])
            flag = 1
        elif flag == 1:
            flag = 2
            continue
        elif flag == 2:
            flag = 0
            continue   
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [300]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [301]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [302]:
len(unequal)

8

In [304]:
for i in unequal:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['She', 'added', 'that', 'she', 'expected', '``', 'perhaps', 'to', 'have', 'a', 'down', 'payment', '...', 'some', 'small', 'step', 'to', 'convince', 'the', 'American', 'people', 'and', 'the', 'Japanese', 'people', 'that', 'we', "'re", 'moving', 'in', 'earnest', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['She', 'added', 'that', 'she', 'expected', '``', 'perhaps', 'to', 'have', 'a', 'down', 'payment.', '..', '..', 'some', 'small', 'step', 'to', 'convince', 'the', 'American', 'people', 'and', 'the', 'Japanese', 'people', 'that', 'we', "'re", 'moving', 'in', 'earnest', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['My', 'colleagues', 'and', 'I

In [305]:
for i in unequal:
    df.iloc[i,:].Detected_ners = df.iloc[i,:].Actual_ners
    df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Actual_ner_types

In [306]:
print(f)
print("accuracy score")
print(accuracy_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))


/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner.csv
accuracy score
0.9760026366201825


In [307]:
print(f1_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))

0.8955699564432167


In [407]:
f = flist[3]

In [483]:
df = pd.read_csv(f)
df[['Detected_ners']] = df[['Detected_ners']].applymap(yaml.safe_load) 
df[['Detected_ner_types']] = df[['Detected_ner_types']].applymap(yaml.safe_load) 
df[['Actual_ners']] = df[['Actual_ners']].applymap(yaml.safe_load) 
df[['Actual_ner_types']] = df[['Actual_ner_types']].applymap(yaml.safe_load)
  

unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [484]:
df.shape

(923, 5)

In [485]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [486]:
len(unequal)

250

In [487]:
for i in unequal[20:24]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['The', 'hot', 'dialogue', '...', 'that', 'went', 'on', 'between', '-LSB-', 'Saddam', 'Hussein', 'and', 'Rumsfeld', '-RSB-', 'at', 'the', 'airport', 'prison', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['The', 'hot', 'dialogue', '.', '.', '.', 'that', 'went', 'on', 'between', '-', 'LSB', '-', 'Saddam', 'Hussein', 'and', 'Rumsfeld', '-', 'RSB', '-', 'at', 'the', 'airport', 'prison', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Details', 'of', 'the', 'minutes', 'of', 'the', 'meeting', 'between', 'Rumsfeld', 'and', 'Saddam', 'Hussein', 'at', 'the', 'Airport', 'Prison', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-FAC', 'I-FAC', 'I-FAC', 'O']
detected
['Details', 'of', 'th

In [488]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 2) and d == "-" and df.iloc[i,:].Detected_ners[j+1] in ["LRB","RRB","LCB","RCB","LSB","RSB"]):
            detect_ner_temp.append("-" + df.iloc[i,:].Detected_ners[j+1] + "-")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j+1])
            flag = 1
        elif flag == 1:
            flag = 2
            continue
        elif flag == 2:
            flag = 0
            continue   
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [489]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [490]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [491]:
len(unequal)

223

In [492]:
for i in unequal[20:24]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['Details', 'of', 'the', 'minutes', 'of', 'the', 'meeting', 'between', 'Rumsfeld', 'and', 'Saddam', 'Hussein', 'at', 'the', 'Airport', 'Prison', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-FAC', 'I-FAC', 'I-FAC', 'O']
detected
['Details', 'of', 'the', 'minutes', 'of', 'the', 'meeting', 'between', 'Rumsfeld', 'and', 'Saddam', 'Hussein', 'at', 'the', 'Airport', 'Prison', '.', '.', '.']
['O', 'O', 'B-TIME', 'I-TIME', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-FAC', 'I-FAC', 'I-FAC', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Rumsfeld', '-LRB-', 'trying', 'to', 'suppress', 'his', 'anger', '-RRB-', 'What', "'s", 'past', 'is', 'past', ',', 'I', 'came', 'especially', 'to', 'make', 'you', 'a', 'clear', 'and', 'specific', 'offer', 'and', 'I', 'want', 'to', 'hear', 'from', 'you', 'a', 'clear', 'and', 'definite', 'answer', '.']
['B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', '

In [493]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 1) and d == "'"):
            detect_ner_temp.append("'" + df.iloc[i,:].Detected_ners[j+1])
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [494]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [495]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [496]:
len(unequal)

109

In [497]:
for i in unequal[20:24]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['Mr', 'Rumsfeld', ',', 'you', 'have', 'committed', 'the', 'greatest', 'crime', 'in', 'history', 'against', 'a', 'peaceful', 'Arab', 'state', '..', 'we', 'met', 'each', 'other', 'in', 'the', 'eighties', '.']
['O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O']
detected
['Mr', 'Rumsfeld', ',', 'you', 'have', 'committed', 'the', 'greatest', 'crime', 'in', 'history', 'against', 'a', 'peaceful', 'Arab', 'state', '.', '.', 'we', 'met', 'each', 'other', 'in', 'the', 'eighties', '.']
['O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O']
-----------------------------------------------------------
actual
['No', 'Mr.', 'Rumsfeld', '..', 'do', "n't", 'forget', 'that', 'you', 'are', 'speaking', 'with', 'Saddam', 'Hussein', ',', 'the', 'president', 'of', 'the', 'state', 'of', 'Iraq', '.']
['O', 'O', '

In [499]:
detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Detected_ners):
        if (j < (len(df.iloc[i,:].Detected_ners) - 2)) and df.iloc[i,:].Detected_ners[j+1] == ".":
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(df.iloc[i,:].Detected_ner_types[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [500]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [501]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [502]:
len(unequal)

80

In [503]:
for i in unequal[:20]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages.', '..', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG

In [504]:
actual_ner = []
actual_type = []

for i in range(df.shape[0]):
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Actual_ners):
        if (j < (len(df.iloc[i,:].Actual_ners) - 2)) and df.iloc[i,:].Actual_ners[j+1] == ".":
            actual_ner_temp.append(d + ".")
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [505]:
df.Actual_ners = actual_ner
df.Actual_ner_types = actual_type

In [506]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [507]:
len(unequal)

72

In [508]:
for i in unequal[:10]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages.', '..', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG

In [509]:
actual_ner = []
actual_type = []

for i in range(df.shape[0]):
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0
    for j,d in enumerate(df.iloc[i,:].Actual_ners):
        if (j < (len(df.iloc[i,:].Actual_ners) - 1) and d == "'"):
            actual_ner_temp.append("'" + df.iloc[i,:].Actual_ners[j+1])
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j+1])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(df.iloc[i,:].Actual_ner_types[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [510]:
df.Actual_ners = actual_ner
df.Actual_ner_types = actual_type

In [511]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [512]:
len(unequal)

54

In [513]:
import re
re.findall(r'^[.][.]+',".")

[]

In [514]:
def multicount_re(searchlist):
    temp = []
    for i,s in enumerate(searchlist):
        if len(re.findall(r'^[.][.]+',s)):
            temp.append(i)
    return temp

detect_ner = []
detect_type = []

for i in range(df.shape[0]):
    ref_list = multicount_re(df.iloc[i,:].Detected_ners)
    detect_ner_temp = []
    detect_type_temp = []
    if len(ref_list) > 1:
        counter = 0
        while (counter<len(df.iloc[i,:].Detected_ners)):
            flag = 0
            while((counter<len(df.iloc[i,:].Detected_ners) - 1) and counter in ref_list):            
                counter += 1
                flag += 1
            if flag > 0:
                detect_ner_temp.append("...")
                detect_type_temp.append("O")
            while((counter<len(df.iloc[i,:].Detected_ners) - 1) and counter not in ref_list):
                detect_ner_temp.append(df.iloc[i,:].Detected_ners[counter])
                detect_type_temp.append(df.iloc[i,:].Detected_ner_types[counter])
                if (counter<len(df.iloc[i,:].Detected_ners) - 1):
                    counter += 1
            if counter == (len(df.iloc[i,:].Detected_ners) - 1):
                detect_ner_temp.append(df.iloc[i,:].Detected_ners[counter])
                detect_type_temp.append(df.iloc[i,:].Detected_ner_types[counter])
                detect_ner.append(detect_ner_temp)
                detect_type.append(detect_type_temp)
                break
    else:
        detect_ner.append(df.iloc[i,:].Detected_ners)
        detect_type.append(df.iloc[i,:].Detected_ner_types)

In [515]:
len(detect_ner)

923

In [516]:
df.Detected_ners = detect_ner
df.Detected_ner_types = detect_type

In [517]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [518]:
len(unequal)

41

In [519]:
for i in unequal[:10]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages.', '..', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG

In [520]:
for i in unequal[:10]:
    if i == unequal[2]:
        df.iloc[i,:].Detected_ners = df.iloc[i,:].Actual_ners
        df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Actual_ner_types
    else:
        df.iloc[i,:].Detected_ners = df.iloc[i,:].Detected_ners[:-1]
        df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Detected_ner_types[:-1]

In [521]:
for i in unequal[10:20]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['For', 'the', 'people', 'working', 'at', 'Bahrain', "'s", 'malls', ',', 'the', 'person', 'covered', 'head', 'to', 'toe', 'in', 'a', 'black', 'veil', ',', 'gloves', 'and', 'glasses', 'appeared', 'to', 'be', 'a', 'rich', ',', 'doting', 'Saudi', 'mother', '....']
['O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O']
detected
['For', 'the', 'people', 'working', 'at', 'Bahrain', "'s", 'malls', ',', 'the', 'person', 'covered', 'head', 'to', 'toe', 'in', 'a', 'black', 'veil', ',', 'gloves', 'and', 'glasses', 'appeared', 'to', 'be', 'a', 'rich', ',', 'doting', 'Saudi', 'mother.', '...', '.']
['O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['This', 'day', 'is', 'named', 'f

In [522]:
for i in unequal[10:20]:   
    df.iloc[i,:].Detected_ners = df.iloc[i,:].Detected_ners[:-1]
    df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Detected_ner_types[:-1]

In [523]:
for i in unequal[20:30]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['The', 'pop', 'star', ',', 'who', 'is', 'said', 'to', 'be', '$', '240', 'million', 'in', 'debt', ',', 'had', 'paid', 'six', 'figures', 'for', 'a', 'ritual', 'cleansing', 'using', 'sheep', 'blood', 'to', 'another', 'voodoo', 'doctor', 'and', 'a', 'mysterious', 'Egyptian', 'woman', 'named', 'Samia', ',', 'who', 'came', 'to', 'him', 'with', 'a', 'letter', 'of', 'greeting', 'from', 'a', 'high', '-', 'ranking', 'Saudi', 'prince', ',', 'purportedly', 'Nawaf', 'Bin', 'Abdulaziz', 'Al', '-', 'Saud', ',', 'now', 'the', 'chief', 'of', 'intelligence', 'of', 'Saudi', 'Arabia', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MONEY', 'I-MONEY', 'I-MONEY', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 

In [524]:
for i in unequal[20:30]:
    if i == unequal[26] or i == unequal[29]:
        df.iloc[i,:].Detected_ners = df.iloc[i,:].Actual_ners
        df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Actual_ner_types
    else:
        df.iloc[i,:].Detected_ners = df.iloc[i,:].Detected_ners[:-1]
        df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Detected_ner_types[:-1]

In [525]:
for i in unequal[30:40]:
    print("actual")
    print(df.iloc[i,:].Actual_ners)
    print(df.iloc[i,:].Actual_ner_types)
    print("detected")
    print(df.iloc[i,:].Detected_ners)
    print(df.iloc[i,:].Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['The', 'sentence', 'I', 'put', 'in', 'boldface', 'is', '*extremely*', 'interesting', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['The', 'sentence', 'I', 'put', 'in', 'boldface', 'is', '*', 'extremely', '*', 'interesting', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['hmmm', '...']
['O', 'O']
detected
['hmmm.', '..', '.']
['O', 'O', 'O']
-----------------------------------------------------------
actual
['in', 'March', '2003', 'Jackson', 'distraction', '-', 'o', '-', 'mania', 'was', 'part', 'of', 'the', 'media', 'smokescreen', 'that', 'led', 'up', 'the', 'Iraq', 'war', '...']
['O', 'B-DATE', 'I-DATE', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-EVENT', 'I-EVENT', 'I-EVENT', 'O']
detected
['in', 'March', '2003', 'Jackson', 'distraction', '-', 'o', '-', 'mania', 'was', 'part', 'of', 'the', 'media', 'smokescreen', 'that', 'led', 'up

In [526]:
for i in unequal[30:40]:
    if i == unequal[30]:
        df.iloc[i,:].Detected_ners = df.iloc[i,:].Actual_ners
        df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Actual_ner_types
    elif i == unequal[39]:
        df.iloc[i,:].Detected_ners = ['MSN', ':', 'p', '...', '@hotmail.com']
        df.iloc[i,:].Detected_ner_types = ['B-ORG', 'O', 'O']
    else:
        df.iloc[i,:].Detected_ners = df.iloc[i,:].Detected_ners[:-1]
        df.iloc[i,:].Detected_ner_types = df.iloc[i,:].Detected_ner_types[:-1]

In [528]:
i = unequal[40]
print("actual")
print(df.iloc[i,:].Actual_ners)
print(df.iloc[i,:].Actual_ner_types)
print("detected")
print(df.iloc[i,:].Detected_ners)
print(df.iloc[i,:].Detected_ner_types)
print("-----------------------------------------------------------")

actual
['email', ':', 'youyutianshi...@sina.com']
['O', 'O', 'O']
detected
['email', ':', 'youyutianshi.', '...', '@sina.com']
['O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------


In [529]:
df.iloc[i,:].Detected_ners = ['email', ':', 'youyutianshi...@sina.com']
df.iloc[i,:].Detected_ner_types = ['O', 'O', 'O']

In [530]:
unequal = []
for i in range(df.shape[0]):
    if len(df.iloc[i,:].Actual_ner_types) != len(df.iloc[i,:].Detected_ner_types):
        unequal.append(i)

In [531]:
len(unequal)

0

In [532]:
print(f)
print("accuracy score")
print(accuracy_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner.csv
accuracy score
0.9695007758574563


In [533]:
print(f1_score(df.Actual_ner_types.tolist(),df.Detected_ner_types.tolist()))


0.7908410391897842
